diff options
14 files changed, 28 insertions, 96696 deletions
diff --git a/sys-kernel/linux-image-redcore-lts/files/4.14-0001-BFQ-v8r12-20171108.patch b/sys-kernel/linux-image-redcore-lts/files/4.14-0001-BFQ-v8r12-20171108.patch deleted file mode 100644 index db7d064b..00000000 --- a/sys-kernel/linux-image-redcore-lts/files/4.14-0001-BFQ-v8r12-20171108.patch +++ /dev/null @@ -1,25199 +0,0 @@ -From c21f53f17430230dab50df29b8ea1b71f99d09d6 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@unimore.it> -Date: Tue, 7 Apr 2015 13:39:12 +0200 -Subject: [PATCH 01/51] Add BFQ-v8r12 - -This commit is the result of the following operations. - -1. The squash of all the commits between "block: cgroups, kconfig, -build bits for BFQ-v7r11-4.5.0" and BFQ-v8r12 in the branch -bfq-mq-v8-v4.11 - -2. The renaming of two files (block/bfq-cgroup.c -> -block/bfq-cgroup-included.c and block/bfq-iosched.c -> -block/bfq-sq-iosched.c) and of one option (CONFIG_BFQ_GROUP_IOSCHED -> -CONFIG_BFQ_SQ_GROUP_IOSCHED), to avoid name clashes. These name -clashes are due to the presence of bfq in mainline from 4.12. - -3. The modification of block/Makefile and block/Kconfig.iosched to -comply with the above renaming. 
- -Signed-off-by: Mauro Andreolini <mauro.andreolini@unimore.it> -Signed-off-by: Arianna Avanzini <avanzini@google.com> -Signed-off-by: Linus Walleij <linus.walleij@linaro.org> -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - Makefile | 2 +- - block/Kconfig.iosched | 31 + - block/bfq-cgroup-included.c | 1190 ++++++++++ - block/bfq-ioc.c | 36 + - block/bfq-sched.c | 2002 ++++++++++++++++ - block/bfq-sq-iosched.c | 5379 +++++++++++++++++++++++++++++++++++++++++++ - block/bfq.h | 948 ++++++++ - include/linux/blkdev.h | 2 +- - 9 files changed, 9589 insertions(+), 2 deletions(-) - create mode 100644 block/bfq-cgroup-included.c - create mode 100644 block/bfq-ioc.c - create mode 100644 block/bfq-sched.c - create mode 100644 block/bfq-sq-iosched.c - create mode 100644 block/bfq.h - -diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched -index a4a8914bf7a4..9e3f4c2f7390 100644 ---- a/block/Kconfig.iosched -+++ b/block/Kconfig.iosched -@@ -40,6 +40,26 @@ config CFQ_GROUP_IOSCHED - ---help--- - Enable group IO scheduling in CFQ. - -+config IOSCHED_BFQ_SQ -+ tristate "BFQ-SQ I/O scheduler" -+ default n -+ ---help--- -+ The BFQ-SQ I/O scheduler (for legacy blk: SQ stands for -+ SingleQueue) distributes bandwidth among all processes -+ according to their weights, regardless of the device -+ parameters and with any workload. It also guarantees a low -+ latency to interactive and soft real-time applications. -+ Details in Documentation/block/bfq-iosched.txt -+ -+config BFQ_SQ_GROUP_IOSCHED -+ bool "BFQ-SQ hierarchical scheduling support" -+ depends on IOSCHED_BFQ_SQ && BLK_CGROUP -+ default n -+ ---help--- -+ -+ Enable hierarchical scheduling in BFQ-SQ, using the blkio -+ (cgroups-v1) or io (cgroups-v2) controller. 
-+ - choice - - prompt "Default I/O scheduler" -@@ -54,6 +74,16 @@ choice - config DEFAULT_CFQ - bool "CFQ" if IOSCHED_CFQ=y - -+ config DEFAULT_BFQ_SQ -+ bool "BFQ-SQ" if IOSCHED_BFQ_SQ=y -+ help -+ Selects BFQ-SQ as the default I/O scheduler which will be -+ used by default for all block devices. -+ The BFQ-SQ I/O scheduler aims at distributing the bandwidth -+ as desired, independently of the disk parameters and with -+ any workload. It also tries to guarantee low latency to -+ interactive and soft real-time applications. -+ - config DEFAULT_NOOP - bool "No-op" - -@@ -63,6 +93,7 @@ config DEFAULT_IOSCHED - string - default "deadline" if DEFAULT_DEADLINE - default "cfq" if DEFAULT_CFQ -+ default "bfq-sq" if DEFAULT_BFQ_SQ - default "noop" if DEFAULT_NOOP - - config MQ_IOSCHED_DEADLINE -diff --git a/block/Makefile b/block/Makefile -index 6a56303b9925..59026b425791 100644 ---- a/block/Makefile -+++ b/block/Makefile -@@ -24,6 +24,7 @@ obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o - obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o - bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o - obj-$(CONFIG_IOSCHED_BFQ) += bfq.o -+obj-$(CONFIG_IOSCHED_BFQ_SQ) += bfq-sq-iosched.o - - obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o - obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o -diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c -new file mode 100644 -index 000000000000..af7c216a3540 ---- /dev/null -+++ b/block/bfq-cgroup-included.c -@@ -0,0 +1,1190 @@ -+/* -+ * BFQ: CGROUPS support. -+ * -+ * Based on ideas and code from CFQ: -+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> -+ * -+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> -+ * Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2016 Paolo Valente <paolo.valente@linaro.org> -+ * -+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ -+ * file. 
-+ */ -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ -+/* bfqg stats flags */ -+enum bfqg_stats_flags { -+ BFQG_stats_waiting = 0, -+ BFQG_stats_idling, -+ BFQG_stats_empty, -+}; -+ -+#define BFQG_FLAG_FNS(name) \ -+static void bfqg_stats_mark_##name(struct bfqg_stats *stats) \ -+{ \ -+ stats->flags |= (1 << BFQG_stats_##name); \ -+} \ -+static void bfqg_stats_clear_##name(struct bfqg_stats *stats) \ -+{ \ -+ stats->flags &= ~(1 << BFQG_stats_##name); \ -+} \ -+static int bfqg_stats_##name(struct bfqg_stats *stats) \ -+{ \ -+ return (stats->flags & (1 << BFQG_stats_##name)) != 0; \ -+} \ -+ -+BFQG_FLAG_FNS(waiting) -+BFQG_FLAG_FNS(idling) -+BFQG_FLAG_FNS(empty) -+#undef BFQG_FLAG_FNS -+ -+/* This should be called with the queue_lock held. */ -+static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) -+{ -+ unsigned long long now; -+ -+ if (!bfqg_stats_waiting(stats)) -+ return; -+ -+ now = sched_clock(); -+ if (time_after64(now, stats->start_group_wait_time)) -+ blkg_stat_add(&stats->group_wait_time, -+ now - stats->start_group_wait_time); -+ bfqg_stats_clear_waiting(stats); -+} -+ -+/* This should be called with the queue_lock held. */ -+static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, -+ struct bfq_group *curr_bfqg) -+{ -+ struct bfqg_stats *stats = &bfqg->stats; -+ -+ if (bfqg_stats_waiting(stats)) -+ return; -+ if (bfqg == curr_bfqg) -+ return; -+ stats->start_group_wait_time = sched_clock(); -+ bfqg_stats_mark_waiting(stats); -+} -+ -+/* This should be called with the queue_lock held. 
*/ -+static void bfqg_stats_end_empty_time(struct bfqg_stats *stats) -+{ -+ unsigned long long now; -+ -+ if (!bfqg_stats_empty(stats)) -+ return; -+ -+ now = sched_clock(); -+ if (time_after64(now, stats->start_empty_time)) -+ blkg_stat_add(&stats->empty_time, -+ now - stats->start_empty_time); -+ bfqg_stats_clear_empty(stats); -+} -+ -+static void bfqg_stats_update_dequeue(struct bfq_group *bfqg) -+{ -+ blkg_stat_add(&bfqg->stats.dequeue, 1); -+} -+ -+static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) -+{ -+ struct bfqg_stats *stats = &bfqg->stats; -+ -+ if (blkg_rwstat_total(&stats->queued)) -+ return; -+ -+ /* -+ * group is already marked empty. This can happen if bfqq got new -+ * request in parent group and moved to this group while being added -+ * to service tree. Just ignore the event and move on. -+ */ -+ if (bfqg_stats_empty(stats)) -+ return; -+ -+ stats->start_empty_time = sched_clock(); -+ bfqg_stats_mark_empty(stats); -+} -+ -+static void bfqg_stats_update_idle_time(struct bfq_group *bfqg) -+{ -+ struct bfqg_stats *stats = &bfqg->stats; -+ -+ if (bfqg_stats_idling(stats)) { -+ unsigned long long now = sched_clock(); -+ -+ if (time_after64(now, stats->start_idle_time)) -+ blkg_stat_add(&stats->idle_time, -+ now - stats->start_idle_time); -+ bfqg_stats_clear_idling(stats); -+ } -+} -+ -+static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) -+{ -+ struct bfqg_stats *stats = &bfqg->stats; -+ -+ stats->start_idle_time = sched_clock(); -+ bfqg_stats_mark_idling(stats); -+} -+ -+static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) -+{ -+ struct bfqg_stats *stats = &bfqg->stats; -+ -+ blkg_stat_add(&stats->avg_queue_size_sum, -+ blkg_rwstat_total(&stats->queued)); -+ blkg_stat_add(&stats->avg_queue_size_samples, 1); -+ bfqg_stats_update_group_wait_time(stats); -+} -+ -+static struct blkcg_policy blkcg_policy_bfq; -+ -+/* -+ * blk-cgroup policy-related handlers -+ * The following functions help in converting 
between blk-cgroup -+ * internal structures and BFQ-specific structures. -+ */ -+ -+static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd) -+{ -+ return pd ? container_of(pd, struct bfq_group, pd) : NULL; -+} -+ -+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg) -+{ -+ return pd_to_blkg(&bfqg->pd); -+} -+ -+static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg) -+{ -+ struct blkg_policy_data *pd = blkg_to_pd(blkg, &blkcg_policy_bfq); -+ -+ return pd_to_bfqg(pd); -+} -+ -+/* -+ * bfq_group handlers -+ * The following functions help in navigating the bfq_group hierarchy -+ * by allowing to find the parent of a bfq_group or the bfq_group -+ * associated to a bfq_queue. -+ */ -+ -+static struct bfq_group *bfqg_parent(struct bfq_group *bfqg) -+{ -+ struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent; -+ -+ return pblkg ? blkg_to_bfqg(pblkg) : NULL; -+} -+ -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *group_entity = bfqq->entity.parent; -+ -+ return group_entity ? container_of(group_entity, struct bfq_group, -+ entity) : -+ bfqq->bfqd->root_group; -+} -+ -+/* -+ * The following two functions handle get and put of a bfq_group by -+ * wrapping the related blk-cgroup hooks. 
-+ */ -+ -+static void bfqg_get(struct bfq_group *bfqg) -+{ -+ return blkg_get(bfqg_to_blkg(bfqg)); -+} -+ -+static void bfqg_put(struct bfq_group *bfqg) -+{ -+ return blkg_put(bfqg_to_blkg(bfqg)); -+} -+ -+static void bfqg_stats_update_io_add(struct bfq_group *bfqg, -+ struct bfq_queue *bfqq, -+ unsigned int op) -+{ -+ blkg_rwstat_add(&bfqg->stats.queued, op, 1); -+ bfqg_stats_end_empty_time(&bfqg->stats); -+ if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue)) -+ bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); -+} -+ -+static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) -+{ -+ blkg_rwstat_add(&bfqg->stats.queued, op, -1); -+} -+ -+static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) -+{ -+ blkg_rwstat_add(&bfqg->stats.merged, op, 1); -+} -+ -+static void bfqg_stats_update_completion(struct bfq_group *bfqg, -+ uint64_t start_time, uint64_t io_start_time, -+ unsigned int op) -+{ -+ struct bfqg_stats *stats = &bfqg->stats; -+ unsigned long long now = sched_clock(); -+ -+ if (time_after64(now, io_start_time)) -+ blkg_rwstat_add(&stats->service_time, op, -+ now - io_start_time); -+ if (time_after64(io_start_time, start_time)) -+ blkg_rwstat_add(&stats->wait_time, op, -+ io_start_time - start_time); -+} -+ -+/* @stats = 0 */ -+static void bfqg_stats_reset(struct bfqg_stats *stats) -+{ -+ /* queued stats shouldn't be cleared */ -+ blkg_rwstat_reset(&stats->merged); -+ blkg_rwstat_reset(&stats->service_time); -+ blkg_rwstat_reset(&stats->wait_time); -+ blkg_stat_reset(&stats->time); -+ blkg_stat_reset(&stats->avg_queue_size_sum); -+ blkg_stat_reset(&stats->avg_queue_size_samples); -+ blkg_stat_reset(&stats->dequeue); -+ blkg_stat_reset(&stats->group_wait_time); -+ blkg_stat_reset(&stats->idle_time); -+ blkg_stat_reset(&stats->empty_time); -+} -+ -+/* @to += @from */ -+static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from) -+{ -+ if (!to || !from) -+ return; -+ -+ 
/* queued stats shouldn't be cleared */ -+ blkg_rwstat_add_aux(&to->merged, &from->merged); -+ blkg_rwstat_add_aux(&to->service_time, &from->service_time); -+ blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); -+ blkg_stat_add_aux(&from->time, &from->time); -+ blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); -+ blkg_stat_add_aux(&to->avg_queue_size_samples, -+ &from->avg_queue_size_samples); -+ blkg_stat_add_aux(&to->dequeue, &from->dequeue); -+ blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time); -+ blkg_stat_add_aux(&to->idle_time, &from->idle_time); -+ blkg_stat_add_aux(&to->empty_time, &from->empty_time); -+} -+ -+/* -+ * Transfer @bfqg's stats to its parent's dead_stats so that the ancestors' -+ * recursive stats can still account for the amount used by this bfqg after -+ * it's gone. -+ */ -+static void bfqg_stats_xfer_dead(struct bfq_group *bfqg) -+{ -+ struct bfq_group *parent; -+ -+ if (!bfqg) /* root_group */ -+ return; -+ -+ parent = bfqg_parent(bfqg); -+ -+ lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock); -+ -+ if (unlikely(!parent)) -+ return; -+ -+ bfqg_stats_add_aux(&parent->stats, &bfqg->stats); -+ bfqg_stats_reset(&bfqg->stats); -+} -+ -+static void bfq_init_entity(struct bfq_entity *entity, -+ struct bfq_group *bfqg) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ entity->weight = entity->new_weight; -+ entity->orig_weight = entity->new_weight; -+ if (bfqq) { -+ bfqq->ioprio = bfqq->new_ioprio; -+ bfqq->ioprio_class = bfqq->new_ioprio_class; -+ bfqg_get(bfqg); -+ } -+ entity->parent = bfqg->my_entity; /* NULL for root group */ -+ entity->sched_data = &bfqg->sched_data; -+} -+ -+static void bfqg_stats_exit(struct bfqg_stats *stats) -+{ -+ blkg_rwstat_exit(&stats->merged); -+ blkg_rwstat_exit(&stats->service_time); -+ blkg_rwstat_exit(&stats->wait_time); -+ blkg_rwstat_exit(&stats->queued); -+ blkg_stat_exit(&stats->time); -+ blkg_stat_exit(&stats->avg_queue_size_sum); -+ 
blkg_stat_exit(&stats->avg_queue_size_samples); -+ blkg_stat_exit(&stats->dequeue); -+ blkg_stat_exit(&stats->group_wait_time); -+ blkg_stat_exit(&stats->idle_time); -+ blkg_stat_exit(&stats->empty_time); -+} -+ -+static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) -+{ -+ if (blkg_rwstat_init(&stats->merged, gfp) || -+ blkg_rwstat_init(&stats->service_time, gfp) || -+ blkg_rwstat_init(&stats->wait_time, gfp) || -+ blkg_rwstat_init(&stats->queued, gfp) || -+ blkg_stat_init(&stats->time, gfp) || -+ blkg_stat_init(&stats->avg_queue_size_sum, gfp) || -+ blkg_stat_init(&stats->avg_queue_size_samples, gfp) || -+ blkg_stat_init(&stats->dequeue, gfp) || -+ blkg_stat_init(&stats->group_wait_time, gfp) || -+ blkg_stat_init(&stats->idle_time, gfp) || -+ blkg_stat_init(&stats->empty_time, gfp)) { -+ bfqg_stats_exit(stats); -+ return -ENOMEM; -+ } -+ -+ return 0; -+} -+ -+static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd) -+{ -+ return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL; -+} -+ -+static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg) -+{ -+ return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq)); -+} -+ -+static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp) -+{ -+ struct bfq_group_data *bgd; -+ -+ bgd = kzalloc(sizeof(*bgd), gfp); -+ if (!bgd) -+ return NULL; -+ return &bgd->pd; -+} -+ -+static void bfq_cpd_init(struct blkcg_policy_data *cpd) -+{ -+ struct bfq_group_data *d = cpd_to_bfqgd(cpd); -+ -+ d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ? 
-+ CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL; -+} -+ -+static void bfq_cpd_free(struct blkcg_policy_data *cpd) -+{ -+ kfree(cpd_to_bfqgd(cpd)); -+} -+ -+static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) -+{ -+ struct bfq_group *bfqg; -+ -+ bfqg = kzalloc_node(sizeof(*bfqg), gfp, node); -+ if (!bfqg) -+ return NULL; -+ -+ if (bfqg_stats_init(&bfqg->stats, gfp)) { -+ kfree(bfqg); -+ return NULL; -+ } -+ -+ return &bfqg->pd; -+} -+ -+static void bfq_pd_init(struct blkg_policy_data *pd) -+{ -+ struct blkcg_gq *blkg; -+ struct bfq_group *bfqg; -+ struct bfq_data *bfqd; -+ struct bfq_entity *entity; -+ struct bfq_group_data *d; -+ -+ blkg = pd_to_blkg(pd); -+ BUG_ON(!blkg); -+ bfqg = blkg_to_bfqg(blkg); -+ bfqd = blkg->q->elevator->elevator_data; -+ entity = &bfqg->entity; -+ d = blkcg_to_bfqgd(blkg->blkcg); -+ -+ entity->orig_weight = entity->weight = entity->new_weight = d->weight; -+ entity->my_sched_data = &bfqg->sched_data; -+ bfqg->my_entity = entity; /* -+ * the root_group's will be set to NULL -+ * in bfq_init_queue() -+ */ -+ bfqg->bfqd = bfqd; -+ bfqg->active_entities = 0; -+ bfqg->rq_pos_tree = RB_ROOT; -+} -+ -+static void bfq_pd_free(struct blkg_policy_data *pd) -+{ -+ struct bfq_group *bfqg = pd_to_bfqg(pd); -+ -+ bfqg_stats_exit(&bfqg->stats); -+ return kfree(bfqg); -+} -+ -+static void bfq_pd_reset_stats(struct blkg_policy_data *pd) -+{ -+ struct bfq_group *bfqg = pd_to_bfqg(pd); -+ -+ bfqg_stats_reset(&bfqg->stats); -+} -+ -+static void bfq_group_set_parent(struct bfq_group *bfqg, -+ struct bfq_group *parent) -+{ -+ struct bfq_entity *entity; -+ -+ BUG_ON(!parent); -+ BUG_ON(!bfqg); -+ BUG_ON(bfqg == parent); -+ -+ entity = &bfqg->entity; -+ entity->parent = parent->my_entity; -+ entity->sched_data = &parent->sched_data; -+} -+ -+static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd, -+ struct blkcg *blkcg) -+{ -+ struct blkcg_gq *blkg; -+ -+ blkg = blkg_lookup(blkcg, bfqd->queue); -+ if (likely(blkg)) -+ return 
blkg_to_bfqg(blkg); -+ return NULL; -+} -+ -+static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, -+ struct blkcg *blkcg) -+{ -+ struct bfq_group *bfqg, *parent; -+ struct bfq_entity *entity; -+ -+ assert_spin_locked(bfqd->queue->queue_lock); -+ -+ bfqg = bfq_lookup_bfqg(bfqd, blkcg); -+ -+ if (unlikely(!bfqg)) -+ return NULL; -+ -+ /* -+ * Update chain of bfq_groups as we might be handling a leaf group -+ * which, along with some of its relatives, has not been hooked yet -+ * to the private hierarchy of BFQ. -+ */ -+ entity = &bfqg->entity; -+ for_each_entity(entity) { -+ bfqg = container_of(entity, struct bfq_group, entity); -+ BUG_ON(!bfqg); -+ if (bfqg != bfqd->root_group) { -+ parent = bfqg_parent(bfqg); -+ if (!parent) -+ parent = bfqd->root_group; -+ BUG_ON(!parent); -+ bfq_group_set_parent(bfqg, parent); -+ } -+ } -+ -+ return bfqg; -+} -+ -+static void bfq_pos_tree_add_move(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq); -+ -+static void bfq_bfqq_expire(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ bool compensate, -+ enum bfqq_expiration reason); -+ -+/** -+ * bfq_bfqq_move - migrate @bfqq to @bfqg. -+ * @bfqd: queue descriptor. -+ * @bfqq: the queue to move. -+ * @bfqg: the group to move to. -+ * -+ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating -+ * it on the new one. Avoid putting the entity on the old group idle tree. -+ * -+ * Must be called under the queue lock; the cgroup owning @bfqg must -+ * not disappear (by now this just means that we are called under -+ * rcu_read_lock()). 
-+ */ -+static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct bfq_group *bfqg) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ BUG_ON(!bfq_bfqq_busy(bfqq) && !RB_EMPTY_ROOT(&bfqq->sort_list)); -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list) && !entity->on_st); -+ BUG_ON(bfq_bfqq_busy(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) -+ && entity->on_st && -+ bfqq != bfqd->in_service_queue); -+ BUG_ON(!bfq_bfqq_busy(bfqq) && bfqq == bfqd->in_service_queue); -+ -+ /* If bfqq is empty, then bfq_bfqq_expire also invokes -+ * bfq_del_bfqq_busy, thereby removing bfqq and its entity -+ * from data structures related to current group. Otherwise we -+ * need to remove bfqq explicitly with bfq_deactivate_bfqq, as -+ * we do below. -+ */ -+ if (bfqq == bfqd->in_service_queue) -+ bfq_bfqq_expire(bfqd, bfqd->in_service_queue, -+ false, BFQ_BFQQ_PREEMPTED); -+ -+ BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq) -+ && &bfq_entity_service_tree(entity)->idle != -+ entity->tree); -+ -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq)); -+ -+ if (bfq_bfqq_busy(bfqq)) -+ bfq_deactivate_bfqq(bfqd, bfqq, false, false); -+ else if (entity->on_st) { -+ BUG_ON(&bfq_entity_service_tree(entity)->idle != -+ entity->tree); -+ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); -+ } -+ bfqg_put(bfqq_group(bfqq)); -+ -+ /* -+ * Here we use a reference to bfqg. We don't need a refcounter -+ * as the cgroup reference will not be dropped, so that its -+ * destroy() callback will not be invoked. 
-+ */ -+ entity->parent = bfqg->my_entity; -+ entity->sched_data = &bfqg->sched_data; -+ bfqg_get(bfqg); -+ -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq)); -+ if (bfq_bfqq_busy(bfqq)) { -+ bfq_pos_tree_add_move(bfqd, bfqq); -+ bfq_activate_bfqq(bfqd, bfqq); -+ } -+ -+ if (!bfqd->in_service_queue && !bfqd->rq_in_driver) -+ bfq_schedule_dispatch(bfqd); -+ BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq) -+ && &bfq_entity_service_tree(entity)->idle != -+ entity->tree); -+} -+ -+/** -+ * __bfq_bic_change_cgroup - move @bic to @cgroup. -+ * @bfqd: the queue descriptor. -+ * @bic: the bic to move. -+ * @blkcg: the blk-cgroup to move to. -+ * -+ * Move bic to blkcg, assuming that bfqd->queue is locked; the caller -+ * has to make sure that the reference to cgroup is valid across the call. -+ * -+ * NOTE: an alternative approach might have been to store the current -+ * cgroup in bfqq and getting a reference to it, reducing the lookup -+ * time here, at the price of slightly more complex code. 
-+ */ -+static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, -+ struct bfq_io_cq *bic, -+ struct blkcg *blkcg) -+{ -+ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0); -+ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1); -+ struct bfq_group *bfqg; -+ struct bfq_entity *entity; -+ -+ lockdep_assert_held(bfqd->queue->queue_lock); -+ -+ bfqg = bfq_find_set_group(bfqd, blkcg); -+ -+ if (unlikely(!bfqg)) -+ bfqg = bfqd->root_group; -+ -+ if (async_bfqq) { -+ entity = &async_bfqq->entity; -+ -+ if (entity->sched_data != &bfqg->sched_data) { -+ bic_set_bfqq(bic, NULL, 0); -+ bfq_log_bfqq(bfqd, async_bfqq, -+ "bic_change_group: %p %d", -+ async_bfqq, -+ async_bfqq->ref); -+ bfq_put_queue(async_bfqq); -+ } -+ } -+ -+ if (sync_bfqq) { -+ entity = &sync_bfqq->entity; -+ if (entity->sched_data != &bfqg->sched_data) -+ bfq_bfqq_move(bfqd, sync_bfqq, bfqg); -+ } -+ -+ return bfqg; -+} -+ -+static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) -+{ -+ struct bfq_data *bfqd = bic_to_bfqd(bic); -+ struct bfq_group *bfqg = NULL; -+ uint64_t serial_nr; -+ -+ rcu_read_lock(); -+ serial_nr = bio_blkcg(bio)->css.serial_nr; -+ -+ /* -+ * Check whether blkcg has changed. The condition may trigger -+ * spuriously on a newly created cic but there's no harm. -+ */ -+ if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr)) -+ goto out; -+ -+ bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio)); -+ bic->blkcg_serial_nr = serial_nr; -+out: -+ rcu_read_unlock(); -+} -+ -+/** -+ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st. -+ * @st: the service tree being flushed. -+ */ -+static void bfq_flush_idle_tree(struct bfq_service_tree *st) -+{ -+ struct bfq_entity *entity = st->first_idle; -+ -+ for (; entity ; entity = st->first_idle) -+ __bfq_deactivate_entity(entity, false); -+} -+ -+/** -+ * bfq_reparent_leaf_entity - move leaf entity to the root_group. -+ * @bfqd: the device data structure with the root group. 
-+ * @entity: the entity to move. -+ */ -+static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, -+ struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ BUG_ON(!bfqq); -+ bfq_bfqq_move(bfqd, bfqq, bfqd->root_group); -+} -+ -+/** -+ * bfq_reparent_active_entities - move to the root group all active -+ * entities. -+ * @bfqd: the device data structure with the root group. -+ * @bfqg: the group to move from. -+ * @st: the service tree with the entities. -+ * -+ * Needs queue_lock to be taken and reference to be valid over the call. -+ */ -+static void bfq_reparent_active_entities(struct bfq_data *bfqd, -+ struct bfq_group *bfqg, -+ struct bfq_service_tree *st) -+{ -+ struct rb_root *active = &st->active; -+ struct bfq_entity *entity = NULL; -+ -+ if (!RB_EMPTY_ROOT(&st->active)) -+ entity = bfq_entity_of(rb_first(active)); -+ -+ for (; entity ; entity = bfq_entity_of(rb_first(active))) -+ bfq_reparent_leaf_entity(bfqd, entity); -+ -+ if (bfqg->sched_data.in_service_entity) -+ bfq_reparent_leaf_entity(bfqd, -+ bfqg->sched_data.in_service_entity); -+} -+ -+/** -+ * bfq_pd_offline - deactivate the entity associated with @pd, -+ * and reparent its children entities. -+ * @pd: descriptor of the policy going offline. -+ * -+ * blkio already grabs the queue_lock for us, so no need to use -+ * RCU-based magic -+ */ -+static void bfq_pd_offline(struct blkg_policy_data *pd) -+{ -+ struct bfq_service_tree *st; -+ struct bfq_group *bfqg; -+ struct bfq_data *bfqd; -+ struct bfq_entity *entity; -+ int i; -+ -+ BUG_ON(!pd); -+ bfqg = pd_to_bfqg(pd); -+ BUG_ON(!bfqg); -+ bfqd = bfqg->bfqd; -+ BUG_ON(bfqd && !bfqd->root_group); -+ -+ entity = bfqg->my_entity; -+ -+ if (!entity) /* root group */ -+ return; -+ -+ /* -+ * Empty all service_trees belonging to this group before -+ * deactivating the group itself. 
-+ */ -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) { -+ BUG_ON(!bfqg->sched_data.service_tree); -+ st = bfqg->sched_data.service_tree + i; -+ /* -+ * The idle tree may still contain bfq_queues belonging -+ * to exited task because they never migrated to a different -+ * cgroup from the one being destroyed now. No one else -+ * can access them so it's safe to act without any lock. -+ */ -+ bfq_flush_idle_tree(st); -+ -+ /* -+ * It may happen that some queues are still active -+ * (busy) upon group destruction (if the corresponding -+ * processes have been forced to terminate). We move -+ * all the leaf entities corresponding to these queues -+ * to the root_group. -+ * Also, it may happen that the group has an entity -+ * in service, which is disconnected from the active -+ * tree: it must be moved, too. -+ * There is no need to put the sync queues, as the -+ * scheduler has taken no reference. -+ */ -+ bfq_reparent_active_entities(bfqd, bfqg, st); -+ BUG_ON(!RB_EMPTY_ROOT(&st->active)); -+ BUG_ON(!RB_EMPTY_ROOT(&st->idle)); -+ } -+ BUG_ON(bfqg->sched_data.next_in_service); -+ BUG_ON(bfqg->sched_data.in_service_entity); -+ -+ __bfq_deactivate_entity(entity, false); -+ bfq_put_async_queues(bfqd, bfqg); -+ -+ /* -+ * @blkg is going offline and will be ignored by -+ * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so -+ * that they don't get lost. If IOs complete after this point, the -+ * stats for them will be lost. Oh well... 
-+ */ -+ bfqg_stats_xfer_dead(bfqg); -+} -+ -+static void bfq_end_wr_async(struct bfq_data *bfqd) -+{ -+ struct blkcg_gq *blkg; -+ -+ list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) { -+ struct bfq_group *bfqg = blkg_to_bfqg(blkg); -+ BUG_ON(!bfqg); -+ -+ bfq_end_wr_async_queues(bfqd, bfqg); -+ } -+ bfq_end_wr_async_queues(bfqd, bfqd->root_group); -+} -+ -+static int bfq_io_show_weight(struct seq_file *sf, void *v) -+{ -+ struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); -+ struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); -+ unsigned int val = 0; -+ -+ if (bfqgd) -+ val = bfqgd->weight; -+ -+ seq_printf(sf, "%u\n", val); -+ -+ return 0; -+} -+ -+static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css, -+ struct cftype *cftype, -+ u64 val) -+{ -+ struct blkcg *blkcg = css_to_blkcg(css); -+ struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); -+ struct blkcg_gq *blkg; -+ int ret = -ERANGE; -+ -+ if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT) -+ return ret; -+ -+ ret = 0; -+ spin_lock_irq(&blkcg->lock); -+ bfqgd->weight = (unsigned short)val; -+ hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { -+ struct bfq_group *bfqg = blkg_to_bfqg(blkg); -+ -+ if (!bfqg) -+ continue; -+ /* -+ * Setting the prio_changed flag of the entity -+ * to 1 with new_weight == weight would re-set -+ * the value of the weight to its ioprio mapping. -+ * Set the flag only if necessary. -+ */ -+ if ((unsigned short)val != bfqg->entity.new_weight) { -+ bfqg->entity.new_weight = (unsigned short)val; -+ /* -+ * Make sure that the above new value has been -+ * stored in bfqg->entity.new_weight before -+ * setting the prio_changed flag. 
In fact, -+ * this flag may be read asynchronously (in -+ * critical sections protected by a different -+ * lock than that held here), and finding this -+ * flag set may cause the execution of the code -+ * for updating parameters whose value may -+ * depend also on bfqg->entity.new_weight (in -+ * __bfq_entity_update_weight_prio). -+ * This barrier makes sure that the new value -+ * of bfqg->entity.new_weight is correctly -+ * seen in that code. -+ */ -+ smp_wmb(); -+ bfqg->entity.prio_changed = 1; -+ } -+ } -+ spin_unlock_irq(&blkcg->lock); -+ -+ return ret; -+} -+ -+static ssize_t bfq_io_set_weight(struct kernfs_open_file *of, -+ char *buf, size_t nbytes, -+ loff_t off) -+{ -+ u64 weight; -+ /* First unsigned long found in the file is used */ -+ int ret = kstrtoull(strim(buf), 0, &weight); -+ -+ if (ret) -+ return ret; -+ -+ return bfq_io_set_weight_legacy(of_css(of), NULL, weight); -+} -+ -+static int bfqg_print_stat(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, -+ &blkcg_policy_bfq, seq_cft(sf)->private, false); -+ return 0; -+} -+ -+static int bfqg_print_rwstat(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, -+ &blkcg_policy_bfq, seq_cft(sf)->private, true); -+ return 0; -+} -+ -+static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, -+ struct blkg_policy_data *pd, int off) -+{ -+ u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd), -+ &blkcg_policy_bfq, off); -+ return __blkg_prfill_u64(sf, pd, sum); -+} -+ -+static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, -+ struct blkg_policy_data *pd, int off) -+{ -+ struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd), -+ &blkcg_policy_bfq, -+ off); -+ return __blkg_prfill_rwstat(sf, pd, &sum); -+} -+ -+static int bfqg_print_stat_recursive(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), -+ bfqg_prfill_stat_recursive, &blkcg_policy_bfq, -+ 
seq_cft(sf)->private, false); -+ return 0; -+} -+ -+static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), -+ bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq, -+ seq_cft(sf)->private, true); -+ return 0; -+} -+ -+static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd, -+ int off) -+{ -+ u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes); -+ -+ return __blkg_prfill_u64(sf, pd, sum >> 9); -+} -+ -+static int bfqg_print_stat_sectors(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), -+ bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false); -+ return 0; -+} -+ -+static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf, -+ struct blkg_policy_data *pd, int off) -+{ -+ struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL, -+ offsetof(struct blkcg_gq, stat_bytes)); -+ u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + -+ atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); -+ -+ return __blkg_prfill_u64(sf, pd, sum >> 9); -+} -+ -+static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), -+ bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0, -+ false); -+ return 0; -+} -+ -+ -+static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf, -+ struct blkg_policy_data *pd, int off) -+{ -+ struct bfq_group *bfqg = pd_to_bfqg(pd); -+ u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples); -+ u64 v = 0; -+ -+ if (samples) { -+ v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum); -+ v = div64_u64(v, samples); -+ } -+ __blkg_prfill_u64(sf, pd, v); -+ return 0; -+} -+ -+/* print avg_queue_size */ -+static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), -+ bfqg_prfill_avg_queue_size, &blkcg_policy_bfq, -+ 0, false); -+ return 0; -+} -+ -+static struct bfq_group * 
-+bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) -+{ -+ int ret; -+ -+ ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq); -+ if (ret) -+ return NULL; -+ -+ return blkg_to_bfqg(bfqd->queue->root_blkg); -+} -+ -+static struct cftype bfq_blkcg_legacy_files[] = { -+ { -+ .name = "bfq.weight", -+ .flags = CFTYPE_NOT_ON_ROOT, -+ .seq_show = bfq_io_show_weight, -+ .write_u64 = bfq_io_set_weight_legacy, -+ }, -+ -+ /* statistics, covers only the tasks in the bfqg */ -+ { -+ .name = "bfq.time", -+ .private = offsetof(struct bfq_group, stats.time), -+ .seq_show = bfqg_print_stat, -+ }, -+ { -+ .name = "bfq.sectors", -+ .seq_show = bfqg_print_stat_sectors, -+ }, -+ { -+ .name = "bfq.io_service_bytes", -+ .private = (unsigned long)&blkcg_policy_bfq, -+ .seq_show = blkg_print_stat_bytes, -+ }, -+ { -+ .name = "bfq.io_serviced", -+ .private = (unsigned long)&blkcg_policy_bfq, -+ .seq_show = blkg_print_stat_ios, -+ }, -+ { -+ .name = "bfq.io_service_time", -+ .private = offsetof(struct bfq_group, stats.service_time), -+ .seq_show = bfqg_print_rwstat, -+ }, -+ { -+ .name = "bfq.io_wait_time", -+ .private = offsetof(struct bfq_group, stats.wait_time), -+ .seq_show = bfqg_print_rwstat, -+ }, -+ { -+ .name = "bfq.io_merged", -+ .private = offsetof(struct bfq_group, stats.merged), -+ .seq_show = bfqg_print_rwstat, -+ }, -+ { -+ .name = "bfq.io_queued", -+ .private = offsetof(struct bfq_group, stats.queued), -+ .seq_show = bfqg_print_rwstat, -+ }, -+ -+ /* the same statictics which cover the bfqg and its descendants */ -+ { -+ .name = "bfq.time_recursive", -+ .private = offsetof(struct bfq_group, stats.time), -+ .seq_show = bfqg_print_stat_recursive, -+ }, -+ { -+ .name = "bfq.sectors_recursive", -+ .seq_show = bfqg_print_stat_sectors_recursive, -+ }, -+ { -+ .name = "bfq.io_service_bytes_recursive", -+ .private = (unsigned long)&blkcg_policy_bfq, -+ .seq_show = blkg_print_stat_bytes_recursive, -+ }, -+ { -+ .name = "bfq.io_serviced_recursive", -+ .private = 
(unsigned long)&blkcg_policy_bfq, -+ .seq_show = blkg_print_stat_ios_recursive, -+ }, -+ { -+ .name = "bfq.io_service_time_recursive", -+ .private = offsetof(struct bfq_group, stats.service_time), -+ .seq_show = bfqg_print_rwstat_recursive, -+ }, -+ { -+ .name = "bfq.io_wait_time_recursive", -+ .private = offsetof(struct bfq_group, stats.wait_time), -+ .seq_show = bfqg_print_rwstat_recursive, -+ }, -+ { -+ .name = "bfq.io_merged_recursive", -+ .private = offsetof(struct bfq_group, stats.merged), -+ .seq_show = bfqg_print_rwstat_recursive, -+ }, -+ { -+ .name = "bfq.io_queued_recursive", -+ .private = offsetof(struct bfq_group, stats.queued), -+ .seq_show = bfqg_print_rwstat_recursive, -+ }, -+ { -+ .name = "bfq.avg_queue_size", -+ .seq_show = bfqg_print_avg_queue_size, -+ }, -+ { -+ .name = "bfq.group_wait_time", -+ .private = offsetof(struct bfq_group, stats.group_wait_time), -+ .seq_show = bfqg_print_stat, -+ }, -+ { -+ .name = "bfq.idle_time", -+ .private = offsetof(struct bfq_group, stats.idle_time), -+ .seq_show = bfqg_print_stat, -+ }, -+ { -+ .name = "bfq.empty_time", -+ .private = offsetof(struct bfq_group, stats.empty_time), -+ .seq_show = bfqg_print_stat, -+ }, -+ { -+ .name = "bfq.dequeue", -+ .private = offsetof(struct bfq_group, stats.dequeue), -+ .seq_show = bfqg_print_stat, -+ }, -+ { } /* terminate */ -+}; -+ -+static struct cftype bfq_blkg_files[] = { -+ { -+ .name = "bfq.weight", -+ .flags = CFTYPE_NOT_ON_ROOT, -+ .seq_show = bfq_io_show_weight, -+ .write = bfq_io_set_weight, -+ }, -+ {} /* terminate */ -+}; -+ -+#else /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ -+ -+static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg, -+ struct bfq_queue *bfqq, unsigned int op) { } -+static inline void -+bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { } -+static inline void -+bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { } -+static inline void bfqg_stats_update_completion(struct bfq_group *bfqg, -+ uint64_t 
start_time, uint64_t io_start_time, -+ unsigned int op) { } -+static inline void -+bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, -+ struct bfq_group *curr_bfqg) { } -+static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { } -+static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } -+static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { } -+static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { } -+static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } -+static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { } -+ -+static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct bfq_group *bfqg) {} -+ -+static void bfq_init_entity(struct bfq_entity *entity, -+ struct bfq_group *bfqg) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ entity->weight = entity->new_weight; -+ entity->orig_weight = entity->new_weight; -+ if (bfqq) { -+ bfqq->ioprio = bfqq->new_ioprio; -+ bfqq->ioprio_class = bfqq->new_ioprio_class; -+ } -+ entity->sched_data = &bfqg->sched_data; -+} -+ -+static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) {} -+ -+static void bfq_end_wr_async(struct bfq_data *bfqd) -+{ -+ bfq_end_wr_async_queues(bfqd, bfqd->root_group); -+} -+ -+static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, -+ struct blkcg *blkcg) -+{ -+ return bfqd->root_group; -+} -+ -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) -+{ -+ return bfqq->bfqd->root_group; -+} -+ -+static struct bfq_group * -+bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) -+{ -+ struct bfq_group *bfqg; -+ int i; -+ -+ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); -+ if (!bfqg) -+ return NULL; -+ -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) -+ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; -+ -+ return bfqg; -+} -+#endif -diff --git a/block/bfq-ioc.c 
b/block/bfq-ioc.c -new file mode 100644 -index 000000000000..fb7bb8f08b75 ---- /dev/null -+++ b/block/bfq-ioc.c -@@ -0,0 +1,36 @@ -+/* -+ * BFQ: I/O context handling. -+ * -+ * Based on ideas and code from CFQ: -+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> -+ * -+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> -+ * Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it> -+ */ -+ -+/** -+ * icq_to_bic - convert iocontext queue structure to bfq_io_cq. -+ * @icq: the iocontext queue. -+ */ -+static struct bfq_io_cq *icq_to_bic(struct io_cq *icq) -+{ -+ /* bic->icq is the first member, %NULL will convert to %NULL */ -+ return container_of(icq, struct bfq_io_cq, icq); -+} -+ -+/** -+ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd. -+ * @bfqd: the lookup key. -+ * @ioc: the io_context of the process doing I/O. -+ * -+ * Queue lock must be held. -+ */ -+static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, -+ struct io_context *ioc) -+{ -+ if (ioc) -+ return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue)); -+ return NULL; -+} -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -new file mode 100644 -index 000000000000..ac8991bca9fa ---- /dev/null -+++ b/block/bfq-sched.c -@@ -0,0 +1,2002 @@ -+/* -+ * BFQ: Hierarchical B-WF2Q+ scheduler. -+ * -+ * Based on ideas and code from CFQ: -+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> -+ * -+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> -+ * Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2016 Paolo Valente <paolo.valente@linaro.org> -+ */ -+ -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); -+ -+/** -+ * bfq_gt - compare two timestamps. -+ * @a: first ts. -+ * @b: second ts. -+ * -+ * Return @a > @b, dealing with wrapping correctly. 
-+ */ -+static int bfq_gt(u64 a, u64 b) -+{ -+ return (s64)(a - b) > 0; -+} -+ -+static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree) -+{ -+ struct rb_node *node = tree->rb_node; -+ -+ return rb_entry(node, struct bfq_entity, rb_node); -+} -+ -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd); -+ -+static bool bfq_update_parent_budget(struct bfq_entity *next_in_service); -+ -+/** -+ * bfq_update_next_in_service - update sd->next_in_service -+ * @sd: sched_data for which to perform the update. -+ * @new_entity: if not NULL, pointer to the entity whose activation, -+ * requeueing or repositionig triggered the invocation of -+ * this function. -+ * -+ * This function is called to update sd->next_in_service, which, in -+ * its turn, may change as a consequence of the insertion or -+ * extraction of an entity into/from one of the active trees of -+ * sd. These insertions/extractions occur as a consequence of -+ * activations/deactivations of entities, with some activations being -+ * 'true' activations, and other activations being requeueings (i.e., -+ * implementing the second, requeueing phase of the mechanism used to -+ * reposition an entity in its active tree; see comments on -+ * __bfq_activate_entity and __bfq_requeue_entity for details). In -+ * both the last two activation sub-cases, new_entity points to the -+ * just activated or requeued entity. -+ * -+ * Returns true if sd->next_in_service changes in such a way that -+ * entity->parent may become the next_in_service for its parent -+ * entity. 
-+ */ -+static bool bfq_update_next_in_service(struct bfq_sched_data *sd, -+ struct bfq_entity *new_entity) -+{ -+ struct bfq_entity *next_in_service = sd->next_in_service; -+ struct bfq_queue *bfqq; -+ bool parent_sched_may_change = false; -+ -+ /* -+ * If this update is triggered by the activation, requeueing -+ * or repositiong of an entity that does not coincide with -+ * sd->next_in_service, then a full lookup in the active tree -+ * can be avoided. In fact, it is enough to check whether the -+ * just-modified entity has a higher priority than -+ * sd->next_in_service, or, even if it has the same priority -+ * as sd->next_in_service, is eligible and has a lower virtual -+ * finish time than sd->next_in_service. If this compound -+ * condition holds, then the new entity becomes the new -+ * next_in_service. Otherwise no change is needed. -+ */ -+ if (new_entity && new_entity != sd->next_in_service) { -+ /* -+ * Flag used to decide whether to replace -+ * sd->next_in_service with new_entity. Tentatively -+ * set to true, and left as true if -+ * sd->next_in_service is NULL. -+ */ -+ bool replace_next = true; -+ -+ /* -+ * If there is already a next_in_service candidate -+ * entity, then compare class priorities or timestamps -+ * to decide whether to replace sd->service_tree with -+ * new_entity. -+ */ -+ if (next_in_service) { -+ unsigned int new_entity_class_idx = -+ bfq_class_idx(new_entity); -+ struct bfq_service_tree *st = -+ sd->service_tree + new_entity_class_idx; -+ -+ /* -+ * For efficiency, evaluate the most likely -+ * sub-condition first. 
-+ */ -+ replace_next = -+ (new_entity_class_idx == -+ bfq_class_idx(next_in_service) -+ && -+ !bfq_gt(new_entity->start, st->vtime) -+ && -+ bfq_gt(next_in_service->finish, -+ new_entity->finish)) -+ || -+ new_entity_class_idx < -+ bfq_class_idx(next_in_service); -+ } -+ -+ if (replace_next) -+ next_in_service = new_entity; -+ } else /* invoked because of a deactivation: lookup needed */ -+ next_in_service = bfq_lookup_next_entity(sd); -+ -+ if (next_in_service) { -+ parent_sched_may_change = !sd->next_in_service || -+ bfq_update_parent_budget(next_in_service); -+ } -+ -+ sd->next_in_service = next_in_service; -+ -+ if (!next_in_service) -+ return parent_sched_may_change; -+ -+ bfqq = bfq_entity_to_bfqq(next_in_service); -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "update_next_in_service: chosen this queue"); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ else { -+ struct bfq_group *bfqg = -+ container_of(next_in_service, -+ struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "update_next_in_service: chosen this entity"); -+ } -+#endif -+ return parent_sched_may_change; -+} -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+/* both next loops stop at one of the child entities of the root group */ -+#define for_each_entity(entity) \ -+ for (; entity ; entity = entity->parent) -+ -+/* -+ * For each iteration, compute parent in advance, so as to be safe if -+ * entity is deallocated during the iteration. Such a deallocation may -+ * happen as a consequence of a bfq_put_queue that frees the bfq_queue -+ * containing entity. -+ */ -+#define for_each_entity_safe(entity, parent) \ -+ for (; entity && ({ parent = entity->parent; 1; }); entity = parent) -+ -+/* -+ * Returns true if this budget changes may let next_in_service->parent -+ * become the next_in_service entity for its parent entity. 
-+ */ -+static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) -+{ -+ struct bfq_entity *bfqg_entity; -+ struct bfq_group *bfqg; -+ struct bfq_sched_data *group_sd; -+ bool ret = false; -+ -+ BUG_ON(!next_in_service); -+ -+ group_sd = next_in_service->sched_data; -+ -+ bfqg = container_of(group_sd, struct bfq_group, sched_data); -+ /* -+ * bfq_group's my_entity field is not NULL only if the group -+ * is not the root group. We must not touch the root entity -+ * as it must never become an in-service entity. -+ */ -+ bfqg_entity = bfqg->my_entity; -+ if (bfqg_entity) { -+ if (bfqg_entity->budget > next_in_service->budget) -+ ret = true; -+ bfqg_entity->budget = next_in_service->budget; -+ } -+ -+ return ret; -+} -+ -+/* -+ * This function tells whether entity stops being a candidate for next -+ * service, according to the following logic. -+ * -+ * This function is invoked for an entity that is about to be set in -+ * service. If such an entity is a queue, then the entity is no longer -+ * a candidate for next service (i.e, a candidate entity to serve -+ * after the in-service entity is expired). The function then returns -+ * true. -+ * -+ * In contrast, the entity could stil be a candidate for next service -+ * if it is not a queue, and has more than one child. In fact, even if -+ * one of its children is about to be set in service, other children -+ * may still be the next to serve. As a consequence, a non-queue -+ * entity is not a candidate for next-service only if it has only one -+ * child. And only if this condition holds, then the function returns -+ * true for a non-queue entity. 
-+ */ -+static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) -+{ -+ struct bfq_group *bfqg; -+ -+ if (bfq_entity_to_bfqq(entity)) -+ return true; -+ -+ bfqg = container_of(entity, struct bfq_group, entity); -+ -+ BUG_ON(bfqg == ((struct bfq_data *)(bfqg->bfqd))->root_group); -+ BUG_ON(bfqg->active_entities == 0); -+ if (bfqg->active_entities == 1) -+ return true; -+ -+ return false; -+} -+ -+#else /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ -+#define for_each_entity(entity) \ -+ for (; entity ; entity = NULL) -+ -+#define for_each_entity_safe(entity, parent) \ -+ for (parent = NULL; entity ; entity = parent) -+ -+static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) -+{ -+ return false; -+} -+ -+static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) -+{ -+ return true; -+} -+ -+#endif /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ -+ -+/* -+ * Shift for timestamp calculations. This actually limits the maximum -+ * service allowed in one timestamp delta (small shift values increase it), -+ * the maximum total weight that can be used for the queues in the system -+ * (big shift values increase it), and the period of virtual time -+ * wraparounds. -+ */ -+#define WFQ_SERVICE_SHIFT 22 -+ -+static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = NULL; -+ -+ BUG_ON(!entity); -+ -+ if (!entity->my_sched_data) -+ bfqq = container_of(entity, struct bfq_queue, entity); -+ -+ return bfqq; -+} -+ -+ -+/** -+ * bfq_delta - map service into the virtual time domain. -+ * @service: amount of service. -+ * @weight: scale factor (weight of an entity or weight sum). -+ */ -+static u64 bfq_delta(unsigned long service, unsigned long weight) -+{ -+ u64 d = (u64)service << WFQ_SERVICE_SHIFT; -+ -+ do_div(d, weight); -+ return d; -+} -+ -+/** -+ * bfq_calc_finish - assign the finish time to an entity. -+ * @entity: the entity to act upon. -+ * @service: the service to be charged to the entity. 
-+ */ -+static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ unsigned long long start, finish, delta; -+ -+ BUG_ON(entity->weight == 0); -+ -+ entity->finish = entity->start + -+ bfq_delta(service, entity->weight); -+ -+ start = ((entity->start>>10)*1000)>>12; -+ finish = ((entity->finish>>10)*1000)>>12; -+ delta = ((bfq_delta(service, entity->weight)>>10)*1000)>>12; -+ -+ if (bfqq) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "calc_finish: serv %lu, w %d", -+ service, entity->weight); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "calc_finish: start %llu, finish %llu, delta %llu", -+ start, finish, delta); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ } else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "calc_finish group: serv %lu, w %d", -+ service, entity->weight); -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "calc_finish group: start %llu, finish %llu, delta %llu", -+ start, finish, delta); -+#endif -+ } -+} -+ -+/** -+ * bfq_entity_of - get an entity from a node. -+ * @node: the node field of the entity. -+ * -+ * Convert a node pointer to the relative entity. This is used only -+ * to simplify the logic of some functions and not as the generic -+ * conversion mechanism because, e.g., in the tree walking functions, -+ * the check for a %NULL value would be redundant. -+ */ -+static struct bfq_entity *bfq_entity_of(struct rb_node *node) -+{ -+ struct bfq_entity *entity = NULL; -+ -+ if (node) -+ entity = rb_entry(node, struct bfq_entity, rb_node); -+ -+ return entity; -+} -+ -+/** -+ * bfq_extract - remove an entity from a tree. -+ * @root: the tree root. -+ * @entity: the entity to remove. 
-+ */ -+static void bfq_extract(struct rb_root *root, struct bfq_entity *entity) -+{ -+ BUG_ON(entity->tree != root); -+ -+ entity->tree = NULL; -+ rb_erase(&entity->rb_node, root); -+} -+ -+/** -+ * bfq_idle_extract - extract an entity from the idle tree. -+ * @st: the service tree of the owning @entity. -+ * @entity: the entity being removed. -+ */ -+static void bfq_idle_extract(struct bfq_service_tree *st, -+ struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ struct rb_node *next; -+ -+ BUG_ON(entity->tree != &st->idle); -+ -+ if (entity == st->first_idle) { -+ next = rb_next(&entity->rb_node); -+ st->first_idle = bfq_entity_of(next); -+ } -+ -+ if (entity == st->last_idle) { -+ next = rb_prev(&entity->rb_node); -+ st->last_idle = bfq_entity_of(next); -+ } -+ -+ bfq_extract(&st->idle, entity); -+ -+ if (bfqq) -+ list_del(&bfqq->bfqq_list); -+} -+ -+/** -+ * bfq_insert - generic tree insertion. -+ * @root: tree root. -+ * @entity: entity to insert. -+ * -+ * This is used for the idle and the active tree, since they are both -+ * ordered by finish time. -+ */ -+static void bfq_insert(struct rb_root *root, struct bfq_entity *entity) -+{ -+ struct bfq_entity *entry; -+ struct rb_node **node = &root->rb_node; -+ struct rb_node *parent = NULL; -+ -+ BUG_ON(entity->tree); -+ -+ while (*node) { -+ parent = *node; -+ entry = rb_entry(parent, struct bfq_entity, rb_node); -+ -+ if (bfq_gt(entry->finish, entity->finish)) -+ node = &parent->rb_left; -+ else -+ node = &parent->rb_right; -+ } -+ -+ rb_link_node(&entity->rb_node, parent, node); -+ rb_insert_color(&entity->rb_node, root); -+ -+ entity->tree = root; -+} -+ -+/** -+ * bfq_update_min - update the min_start field of a entity. -+ * @entity: the entity to update. -+ * @node: one of its children. -+ * -+ * This function is called when @entity may store an invalid value for -+ * min_start due to updates to the active tree. 
The function assumes -+ * that the subtree rooted at @node (which may be its left or its right -+ * child) has a valid min_start value. -+ */ -+static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node) -+{ -+ struct bfq_entity *child; -+ -+ if (node) { -+ child = rb_entry(node, struct bfq_entity, rb_node); -+ if (bfq_gt(entity->min_start, child->min_start)) -+ entity->min_start = child->min_start; -+ } -+} -+ -+/** -+ * bfq_update_active_node - recalculate min_start. -+ * @node: the node to update. -+ * -+ * @node may have changed position or one of its children may have moved, -+ * this function updates its min_start value. The left and right subtrees -+ * are assumed to hold a correct min_start value. -+ */ -+static void bfq_update_active_node(struct rb_node *node) -+{ -+ struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node); -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ entity->min_start = entity->start; -+ bfq_update_min(entity, node->rb_right); -+ bfq_update_min(entity, node->rb_left); -+ -+ if (bfqq) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "update_active_node: new min_start %llu", -+ ((entity->min_start>>10)*1000)>>12); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ } else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "update_active_node: new min_start %llu", -+ ((entity->min_start>>10)*1000)>>12); -+#endif -+ } -+} -+ -+/** -+ * bfq_update_active_tree - update min_start for the whole active tree. -+ * @node: the starting node. -+ * -+ * @node must be the deepest modified node after an update. This function -+ * updates its min_start using the values held by its children, assuming -+ * that they did not change, and then updates all the nodes that may have -+ * changed in the path to the root. The only nodes that may have changed -+ * are the ones in the path or their siblings. 
-+ */ -+static void bfq_update_active_tree(struct rb_node *node) -+{ -+ struct rb_node *parent; -+ -+up: -+ bfq_update_active_node(node); -+ -+ parent = rb_parent(node); -+ if (!parent) -+ return; -+ -+ if (node == parent->rb_left && parent->rb_right) -+ bfq_update_active_node(parent->rb_right); -+ else if (parent->rb_left) -+ bfq_update_active_node(parent->rb_left); -+ -+ node = parent; -+ goto up; -+} -+ -+static void bfq_weights_tree_add(struct bfq_data *bfqd, -+ struct bfq_entity *entity, -+ struct rb_root *root); -+ -+static void bfq_weights_tree_remove(struct bfq_data *bfqd, -+ struct bfq_entity *entity, -+ struct rb_root *root); -+ -+ -+/** -+ * bfq_active_insert - insert an entity in the active tree of its -+ * group/device. -+ * @st: the service tree of the entity. -+ * @entity: the entity being inserted. -+ * -+ * The active tree is ordered by finish time, but an extra key is kept -+ * per each node, containing the minimum value for the start times of -+ * its children (and the node itself), so it's possible to search for -+ * the eligible node with the lowest finish time in logarithmic time. 
-+ */ -+static void bfq_active_insert(struct bfq_service_tree *st, -+ struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ struct rb_node *node = &entity->rb_node; -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ struct bfq_sched_data *sd = NULL; -+ struct bfq_group *bfqg = NULL; -+ struct bfq_data *bfqd = NULL; -+#endif -+ -+ bfq_insert(&st->active, entity); -+ -+ if (node->rb_left) -+ node = node->rb_left; -+ else if (node->rb_right) -+ node = node->rb_right; -+ -+ bfq_update_active_tree(node); -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ sd = entity->sched_data; -+ bfqg = container_of(sd, struct bfq_group, sched_data); -+ BUG_ON(!bfqg); -+ bfqd = (struct bfq_data *)bfqg->bfqd; -+#endif -+ if (bfqq) -+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ else { /* bfq_group */ -+ BUG_ON(!bfqd); -+ bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree); -+ } -+ if (bfqg != bfqd->root_group) { -+ BUG_ON(!bfqg); -+ BUG_ON(!bfqd); -+ bfqg->active_entities++; -+ } -+#endif -+} -+ -+/** -+ * bfq_ioprio_to_weight - calc a weight from an ioprio. -+ * @ioprio: the ioprio value to convert. -+ */ -+static unsigned short bfq_ioprio_to_weight(int ioprio) -+{ -+ BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); -+ return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF; -+} -+ -+/** -+ * bfq_weight_to_ioprio - calc an ioprio from a weight. -+ * @weight: the weight value to convert. -+ * -+ * To preserve as much as possible the old only-ioprio user interface, -+ * 0 is used as an escape ioprio value for weights (numerically) equal or -+ * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF. -+ */ -+static unsigned short bfq_weight_to_ioprio(int weight) -+{ -+ BUG_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT); -+ return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight < 0 ? 
-+ 0 : IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight; -+} -+ -+static void bfq_get_entity(struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ if (bfqq) { -+ bfqq->ref++; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", -+ bfqq, bfqq->ref); -+ } -+} -+ -+/** -+ * bfq_find_deepest - find the deepest node that an extraction can modify. -+ * @node: the node being removed. -+ * -+ * Do the first step of an extraction in an rb tree, looking for the -+ * node that will replace @node, and returning the deepest node that -+ * the following modifications to the tree can touch. If @node is the -+ * last node in the tree return %NULL. -+ */ -+static struct rb_node *bfq_find_deepest(struct rb_node *node) -+{ -+ struct rb_node *deepest; -+ -+ if (!node->rb_right && !node->rb_left) -+ deepest = rb_parent(node); -+ else if (!node->rb_right) -+ deepest = node->rb_left; -+ else if (!node->rb_left) -+ deepest = node->rb_right; -+ else { -+ deepest = rb_next(node); -+ if (deepest->rb_right) -+ deepest = deepest->rb_right; -+ else if (rb_parent(deepest) != node) -+ deepest = rb_parent(deepest); -+ } -+ -+ return deepest; -+} -+ -+/** -+ * bfq_active_extract - remove an entity from the active tree. -+ * @st: the service_tree containing the tree. -+ * @entity: the entity being removed. 
-+ */ -+static void bfq_active_extract(struct bfq_service_tree *st, -+ struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ struct rb_node *node; -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ struct bfq_sched_data *sd = NULL; -+ struct bfq_group *bfqg = NULL; -+ struct bfq_data *bfqd = NULL; -+#endif -+ -+ node = bfq_find_deepest(&entity->rb_node); -+ bfq_extract(&st->active, entity); -+ -+ if (node) -+ bfq_update_active_tree(node); -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ sd = entity->sched_data; -+ bfqg = container_of(sd, struct bfq_group, sched_data); -+ BUG_ON(!bfqg); -+ bfqd = (struct bfq_data *)bfqg->bfqd; -+#endif -+ if (bfqq) -+ list_del(&bfqq->bfqq_list); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ else { /* bfq_group */ -+ BUG_ON(!bfqd); -+ bfq_weights_tree_remove(bfqd, entity, -+ &bfqd->group_weights_tree); -+ } -+ if (bfqg != bfqd->root_group) { -+ BUG_ON(!bfqg); -+ BUG_ON(!bfqd); -+ BUG_ON(!bfqg->active_entities); -+ bfqg->active_entities--; -+ } -+#endif -+} -+ -+/** -+ * bfq_idle_insert - insert an entity into the idle tree. -+ * @st: the service tree containing the tree. -+ * @entity: the entity to insert. -+ */ -+static void bfq_idle_insert(struct bfq_service_tree *st, -+ struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ struct bfq_entity *first_idle = st->first_idle; -+ struct bfq_entity *last_idle = st->last_idle; -+ -+ if (!first_idle || bfq_gt(first_idle->finish, entity->finish)) -+ st->first_idle = entity; -+ if (!last_idle || bfq_gt(entity->finish, last_idle->finish)) -+ st->last_idle = entity; -+ -+ bfq_insert(&st->idle, entity); -+ -+ if (bfqq) -+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list); -+} -+ -+/** -+ * bfq_forget_entity - do not consider entity any longer for scheduling -+ * @st: the service tree. -+ * @entity: the entity being removed. -+ * @is_in_service: true if entity is currently the in-service entity. -+ * -+ * Forget everything about @entity. 
In addition, if entity represents -+ * a queue, and the latter is not in service, then release the service -+ * reference to the queue (the one taken through bfq_get_entity). In -+ * fact, in this case, there is really no more service reference to -+ * the queue, as the latter is also outside any service tree. If, -+ * instead, the queue is in service, then __bfq_bfqd_reset_in_service -+ * will take care of putting the reference when the queue finally -+ * stops being served. -+ */ -+static void bfq_forget_entity(struct bfq_service_tree *st, -+ struct bfq_entity *entity, -+ bool is_in_service) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ BUG_ON(!entity->on_st); -+ -+ entity->on_st = false; -+ st->wsum -= entity->weight; -+ if (bfqq && !is_in_service) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity (before): %p %d", -+ bfqq, bfqq->ref); -+ bfq_put_queue(bfqq); -+ } -+} -+ -+/** -+ * bfq_put_idle_entity - release the idle tree ref of an entity. -+ * @st: service tree for the entity. -+ * @entity: the entity being released. -+ */ -+static void bfq_put_idle_entity(struct bfq_service_tree *st, -+ struct bfq_entity *entity) -+{ -+ bfq_idle_extract(st, entity); -+ bfq_forget_entity(st, entity, -+ entity == entity->sched_data->in_service_entity); -+} -+ -+/** -+ * bfq_forget_idle - update the idle tree if necessary. -+ * @st: the service tree to act upon. -+ * -+ * To preserve the global O(log N) complexity we only remove one entry here; -+ * as the idle tree will not grow indefinitely this can be done safely. -+ */ -+static void bfq_forget_idle(struct bfq_service_tree *st) -+{ -+ struct bfq_entity *first_idle = st->first_idle; -+ struct bfq_entity *last_idle = st->last_idle; -+ -+ if (RB_EMPTY_ROOT(&st->active) && last_idle && -+ !bfq_gt(last_idle->finish, st->vtime)) { -+ /* -+ * Forget the whole idle tree, increasing the vtime past -+ * the last finish time of idle entities. 
-+ */ -+ st->vtime = last_idle->finish; -+ } -+ -+ if (first_idle && !bfq_gt(first_idle->finish, st->vtime)) -+ bfq_put_idle_entity(st, first_idle); -+} -+ -+/* -+ * Update weight and priority of entity. If update_class_too is true, -+ * then update the ioprio_class of entity too. -+ * -+ * The reason why the update of ioprio_class is controlled through the -+ * last parameter is as follows. Changing the ioprio class of an -+ * entity implies changing the destination service trees for that -+ * entity. If such a change occurred when the entity is already on one -+ * of the service trees for its previous class, then the state of the -+ * entity would become more complex: none of the new possible service -+ * trees for the entity, according to bfq_entity_service_tree(), would -+ * match any of the possible service trees on which the entity -+ * is. Complex operations involving these trees, such as entity -+ * activations and deactivations, should take into account this -+ * additional complexity. To avoid this issue, this function is -+ * invoked with update_class_too unset in the points in the code where -+ * entity may happen to be on some tree. 
-+ */ -+static struct bfq_service_tree * -+__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, -+ struct bfq_entity *entity, -+ bool update_class_too) -+{ -+ struct bfq_service_tree *new_st = old_st; -+ -+ if (entity->prio_changed) { -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ unsigned int prev_weight, new_weight; -+ struct bfq_data *bfqd = NULL; -+ struct rb_root *root; -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ struct bfq_sched_data *sd; -+ struct bfq_group *bfqg; -+#endif -+ -+ if (bfqq) -+ bfqd = bfqq->bfqd; -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ else { -+ sd = entity->my_sched_data; -+ bfqg = container_of(sd, struct bfq_group, sched_data); -+ BUG_ON(!bfqg); -+ bfqd = (struct bfq_data *)bfqg->bfqd; -+ BUG_ON(!bfqd); -+ } -+#endif -+ -+ BUG_ON(old_st->wsum < entity->weight); -+ old_st->wsum -= entity->weight; -+ -+ if (entity->new_weight != entity->orig_weight) { -+ if (entity->new_weight < BFQ_MIN_WEIGHT || -+ entity->new_weight > BFQ_MAX_WEIGHT) { -+ pr_crit("update_weight_prio: new_weight %d\n", -+ entity->new_weight); -+ if (entity->new_weight < BFQ_MIN_WEIGHT) -+ entity->new_weight = BFQ_MIN_WEIGHT; -+ else -+ entity->new_weight = BFQ_MAX_WEIGHT; -+ } -+ entity->orig_weight = entity->new_weight; -+ if (bfqq) -+ bfqq->ioprio = -+ bfq_weight_to_ioprio(entity->orig_weight); -+ } -+ -+ if (bfqq && update_class_too) -+ bfqq->ioprio_class = bfqq->new_ioprio_class; -+ -+ /* -+ * Reset prio_changed only if the ioprio_class change -+ * is not pending any longer. -+ */ -+ if (!bfqq || bfqq->ioprio_class == bfqq->new_ioprio_class) -+ entity->prio_changed = 0; -+ -+ /* -+ * NOTE: here we may be changing the weight too early, -+ * this will cause unfairness. The correct approach -+ * would have required additional complexity to defer -+ * weight changes to the proper time instants (i.e., -+ * when entity->finish <= old_st->vtime). 
-+ */ -+ new_st = bfq_entity_service_tree(entity); -+ -+ prev_weight = entity->weight; -+ new_weight = entity->orig_weight * -+ (bfqq ? bfqq->wr_coeff : 1); -+ /* -+ * If the weight of the entity changes, remove the entity -+ * from its old weight counter (if there is a counter -+ * associated with the entity), and add it to the counter -+ * associated with its new weight. -+ */ -+ if (prev_weight != new_weight) { -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "weight changed %d %d(%d %d)", -+ prev_weight, new_weight, -+ entity->orig_weight, -+ bfqq->wr_coeff); -+ -+ root = bfqq ? &bfqd->queue_weights_tree : -+ &bfqd->group_weights_tree; -+ bfq_weights_tree_remove(bfqd, entity, root); -+ } -+ entity->weight = new_weight; -+ /* -+ * Add the entity to its weights tree only if it is -+ * not associated with a weight-raised queue. -+ */ -+ if (prev_weight != new_weight && -+ (bfqq ? bfqq->wr_coeff == 1 : 1)) -+ /* If we get here, root has been initialized. */ -+ bfq_weights_tree_add(bfqd, entity, root); -+ -+ new_st->wsum += entity->weight; -+ -+ if (new_st != old_st) -+ entity->start = new_st->vtime; -+ } -+ -+ return new_st; -+} -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg); -+#endif -+ -+/** -+ * bfq_bfqq_served - update the scheduler status after selection for -+ * service. -+ * @bfqq: the queue being served. -+ * @served: bytes to transfer. -+ * -+ * NOTE: this can be optimized, as the timestamps of upper level entities -+ * are synchronized every time a new bfqq is selected for service. By now, -+ * we keep it to better check consistency. 
-+ */ -+static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ struct bfq_service_tree *st; -+ -+ for_each_entity(entity) { -+ st = bfq_entity_service_tree(entity); -+ -+ entity->service += served; -+ -+ BUG_ON(st->wsum == 0); -+ -+ st->vtime += bfq_delta(served, st->wsum); -+ bfq_forget_idle(st); -+ } -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ bfqg_stats_set_start_empty_time(bfqq_group(bfqq)); -+#endif -+ st = bfq_entity_service_tree(&bfqq->entity); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs, vtime %llu on %p", -+ served, ((st->vtime>>10)*1000)>>12, st); -+} -+ -+/** -+ * bfq_bfqq_charge_time - charge an amount of service equivalent to the length -+ * of the time interval during which bfqq has been in -+ * service. -+ * @bfqd: the device -+ * @bfqq: the queue that needs a service update. -+ * @time_ms: the amount of time during which the queue has received service -+ * -+ * If a queue does not consume its budget fast enough, then providing -+ * the queue with service fairness may impair throughput, more or less -+ * severely. For this reason, queues that consume their budget slowly -+ * are provided with time fairness instead of service fairness. This -+ * goal is achieved through the BFQ scheduling engine, even if such an -+ * engine works in the service, and not in the time domain. The trick -+ * is charging these queues with an inflated amount of service, equal -+ * to the amount of service that they would have received during their -+ * service slot if they had been fast, i.e., if their requests had -+ * been dispatched at a rate equal to the estimated peak rate. -+ * -+ * It is worth noting that time fairness can cause important -+ * distortions in terms of bandwidth distribution, on devices with -+ * internal queueing. 
The reason is that I/O requests dispatched -+ * during the service slot of a queue may be served after that service -+ * slot is finished, and may have a total processing time loosely -+ * correlated with the duration of the service slot. This is -+ * especially true for short service slots. -+ */ -+static void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ unsigned long time_ms) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ int tot_serv_to_charge = entity->service; -+ unsigned int timeout_ms = jiffies_to_msecs(bfq_timeout); -+ -+ if (time_ms > 0 && time_ms < timeout_ms) -+ tot_serv_to_charge = -+ (bfqd->bfq_max_budget * time_ms) / timeout_ms; -+ -+ if (tot_serv_to_charge < entity->service) -+ tot_serv_to_charge = entity->service; -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "charge_time: %lu/%u ms, %d/%d/%d sectors", -+ time_ms, timeout_ms, entity->service, -+ tot_serv_to_charge, entity->budget); -+ -+ /* Increase budget to avoid inconsistencies */ -+ if (tot_serv_to_charge > entity->budget) -+ entity->budget = tot_serv_to_charge; -+ -+ bfq_bfqq_served(bfqq, -+ max_t(int, 0, tot_serv_to_charge - entity->service)); -+} -+ -+static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, -+ struct bfq_service_tree *st, -+ bool backshifted) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ struct bfq_sched_data *sd = entity->sched_data; -+ -+ /* -+ * When this function is invoked, entity is not in any service -+ * tree, then it is safe to invoke next function with the last -+ * parameter set (see the comments on the function). -+ */ -+ st = __bfq_entity_update_weight_prio(st, entity, true); -+ bfq_calc_finish(entity, entity->budget); -+ -+ /* -+ * If some queues enjoy backshifting for a while, then their -+ * (virtual) finish timestamps may happen to become lower and -+ * lower than the system virtual time. 
In particular, if -+ * these queues often happen to be idle for short time -+ * periods, and during such time periods other queues with -+ * higher timestamps happen to be busy, then the backshifted -+ * timestamps of the former queues can become much lower than -+ * the system virtual time. In fact, to serve the queues with -+ * higher timestamps while the ones with lower timestamps are -+ * idle, the system virtual time may be pushed-up to much -+ * higher values than the finish timestamps of the idle -+ * queues. As a consequence, the finish timestamps of all new -+ * or newly activated queues may end up being much larger than -+ * those of lucky queues with backshifted timestamps. The -+ * latter queues may then monopolize the device for a lot of -+ * time. This would simply break service guarantees. -+ * -+ * To reduce this problem, push up a little bit the -+ * backshifted timestamps of the queue associated with this -+ * entity (only a queue can happen to have the backshifted -+ * flag set): just enough to let the finish timestamp of the -+ * queue be equal to the current value of the system virtual -+ * time. This may introduce a little unfairness among queues -+ * with backshifted timestamps, but it does not break -+ * worst-case fairness guarantees. -+ * -+ * As a special case, if bfqq is weight-raised, push up -+ * timestamps much less, to keep very low the probability that -+ * this push up causes the backshifted finish timestamps of -+ * weight-raised queues to become higher than the backshifted -+ * finish timestamps of non weight-raised queues. 
-+ */ -+ if (backshifted && bfq_gt(st->vtime, entity->finish)) { -+ unsigned long delta = st->vtime - entity->finish; -+ -+ if (bfqq) -+ delta /= bfqq->wr_coeff; -+ -+ entity->start += delta; -+ entity->finish += delta; -+ -+ if (bfqq) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "__activate_entity: new queue finish %llu", -+ ((entity->finish>>10)*1000)>>12); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ } else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "__activate_entity: new group finish %llu", -+ ((entity->finish>>10)*1000)>>12); -+#endif -+ } -+ } -+ -+ bfq_active_insert(st, entity); -+ -+ if (bfqq) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "__activate_entity: queue %seligible in st %p", -+ entity->start <= st->vtime ? "" : "non ", st); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ } else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "__activate_entity: group %seligible in st %p", -+ entity->start <= st->vtime ? "" : "non ", st); -+#endif -+ } -+ BUG_ON(RB_EMPTY_ROOT(&st->active)); -+ BUG_ON(&st->active != &sd->service_tree->active && -+ &st->active != &(sd->service_tree+1)->active && -+ &st->active != &(sd->service_tree+2)->active); -+} -+ -+/** -+ * __bfq_activate_entity - handle activation of entity. -+ * @entity: the entity being activated. -+ * @non_blocking_wait_rq: true if entity was waiting for a request -+ * -+ * Called for a 'true' activation, i.e., if entity is not active and -+ * one of its children receives a new request. -+ * -+ * Basically, this function updates the timestamps of entity and -+ * inserts entity into its active tree, ater possible extracting it -+ * from its idle tree. 
-+ */ -+static void __bfq_activate_entity(struct bfq_entity *entity, -+ bool non_blocking_wait_rq) -+{ -+ struct bfq_sched_data *sd = entity->sched_data; -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity); -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ bool backshifted = false; -+ unsigned long long min_vstart; -+ -+ BUG_ON(!sd); -+ BUG_ON(!st); -+ -+ /* See comments on bfq_fqq_update_budg_for_activation */ -+ if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) { -+ backshifted = true; -+ min_vstart = entity->finish; -+ } else -+ min_vstart = st->vtime; -+ -+ if (entity->tree == &st->idle) { -+ /* -+ * Must be on the idle tree, bfq_idle_extract() will -+ * check for that. -+ */ -+ bfq_idle_extract(st, entity); -+ entity->start = bfq_gt(min_vstart, entity->finish) ? -+ min_vstart : entity->finish; -+ } else { -+ /* -+ * The finish time of the entity may be invalid, and -+ * it is in the past for sure, otherwise the queue -+ * would have been on the idle tree. -+ */ -+ entity->start = min_vstart; -+ st->wsum += entity->weight; -+ /* -+ * entity is about to be inserted into a service tree, -+ * and then set in service: get a reference to make -+ * sure entity does not disappear until it is no -+ * longer in service or scheduled for service. -+ */ -+ bfq_get_entity(entity); -+ -+ BUG_ON(entity->on_st && bfqq); -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ if (entity->on_st && !bfqq) { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, -+ entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, -+ bfqg, -+ "activate bug, class %d in_service %p", -+ bfq_class_idx(entity), sd->in_service_entity); -+ } -+#endif -+ BUG_ON(entity->on_st && !bfqq); -+ entity->on_st = true; -+ } -+ -+ bfq_update_fin_time_enqueue(entity, st, backshifted); -+} -+ -+/** -+ * __bfq_requeue_entity - handle requeueing or repositioning of an entity. -+ * @entity: the entity being requeued or repositioned. 
-+ * -+ * Requeueing is needed if this entity stops being served, which -+ * happens if a leaf descendant entity has expired. On the other hand, -+ * repositioning is needed if the next_inservice_entity for the child -+ * entity has changed. See the comments inside the function for -+ * details. -+ * -+ * Basically, this function: 1) removes entity from its active tree if -+ * present there, 2) updates the timestamps of entity and 3) inserts -+ * entity back into its active tree (in the new, right position for -+ * the new values of the timestamps). -+ */ -+static void __bfq_requeue_entity(struct bfq_entity *entity) -+{ -+ struct bfq_sched_data *sd = entity->sched_data; -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity); -+ -+ BUG_ON(!sd); -+ BUG_ON(!st); -+ -+ BUG_ON(entity != sd->in_service_entity && -+ entity->tree != &st->active); -+ -+ if (entity == sd->in_service_entity) { -+ /* -+ * We are requeueing the current in-service entity, -+ * which may have to be done for one of the following -+ * reasons: -+ * - entity represents the in-service queue, and the -+ * in-service queue is being requeued after an -+ * expiration; -+ * - entity represents a group, and its budget has -+ * changed because one of its child entities has -+ * just been either activated or requeued for some -+ * reason; the timestamps of the entity need then to -+ * be updated, and the entity needs to be enqueued -+ * or repositioned accordingly. -+ * -+ * In particular, before requeueing, the start time of -+ * the entity must be moved forward to account for the -+ * service that the entity has received while in -+ * service. This is done by the next instructions. The -+ * finish time will then be updated according to this -+ * new value of the start time, and to the budget of -+ * the entity. 
-+ */ -+ bfq_calc_finish(entity, entity->service); -+ entity->start = entity->finish; -+ BUG_ON(entity->tree && entity->tree != &st->active); -+ /* -+ * In addition, if the entity had more than one child -+ * when set in service, then was not extracted from -+ * the active tree. This implies that the position of -+ * the entity in the active tree may need to be -+ * changed now, because we have just updated the start -+ * time of the entity, and we will update its finish -+ * time in a moment (the requeueing is then, more -+ * precisely, a repositioning in this case). To -+ * implement this repositioning, we: 1) dequeue the -+ * entity here, 2) update the finish time and -+ * requeue the entity according to the new -+ * timestamps below. -+ */ -+ if (entity->tree) -+ bfq_active_extract(st, entity); -+ } else { /* The entity is already active, and not in service */ -+ /* -+ * In this case, this function gets called only if the -+ * next_in_service entity below this entity has -+ * changed, and this change has caused the budget of -+ * this entity to change, which, finally implies that -+ * the finish time of this entity must be -+ * updated. Such an update may cause the scheduling, -+ * i.e., the position in the active tree, of this -+ * entity to change. We handle this change by: 1) -+ * dequeueing the entity here, 2) updating the finish -+ * time and requeueing the entity according to the new -+ * timestamps below. This is the same approach as the -+ * non-extracted-entity sub-case above. 
-+ */ -+ bfq_active_extract(st, entity); -+ } -+ -+ bfq_update_fin_time_enqueue(entity, st, false); -+} -+ -+static void __bfq_activate_requeue_entity(struct bfq_entity *entity, -+ struct bfq_sched_data *sd, -+ bool non_blocking_wait_rq) -+{ -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity); -+ -+ if (sd->in_service_entity == entity || entity->tree == &st->active) -+ /* -+ * in service or already queued on the active tree, -+ * requeue or reposition -+ */ -+ __bfq_requeue_entity(entity); -+ else -+ /* -+ * Not in service and not queued on its active tree: -+ * the activity is idle and this is a true activation. -+ */ -+ __bfq_activate_entity(entity, non_blocking_wait_rq); -+} -+ -+ -+/** -+ * bfq_activate_entity - activate or requeue an entity representing a bfq_queue, -+ * and activate, requeue or reposition all ancestors -+ * for which such an update becomes necessary. -+ * @entity: the entity to activate. -+ * @non_blocking_wait_rq: true if this entity was waiting for a request -+ * @requeue: true if this is a requeue, which implies that bfqq is -+ * being expired; thus ALL its ancestors stop being served and must -+ * therefore be requeued -+ */ -+static void bfq_activate_requeue_entity(struct bfq_entity *entity, -+ bool non_blocking_wait_rq, -+ bool requeue) -+{ -+ struct bfq_sched_data *sd; -+ -+ for_each_entity(entity) { -+ BUG_ON(!entity); -+ sd = entity->sched_data; -+ __bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq); -+ -+ BUG_ON(RB_EMPTY_ROOT(&sd->service_tree->active) && -+ RB_EMPTY_ROOT(&(sd->service_tree+1)->active) && -+ RB_EMPTY_ROOT(&(sd->service_tree+2)->active)); -+ -+ if (!bfq_update_next_in_service(sd, entity) && !requeue) { -+ BUG_ON(!sd->next_in_service); -+ break; -+ } -+ BUG_ON(!sd->next_in_service); -+ } -+} -+ -+/** -+ * __bfq_deactivate_entity - deactivate an entity from its service tree. -+ * @entity: the entity to deactivate. 
-+ * @ins_into_idle_tree: if false, the entity will not be put into the -+ * idle tree. -+ * -+ * Deactivates an entity, independently from its previous state. Must -+ * be invoked only if entity is on a service tree. Extracts the entity -+ * from that tree, and if necessary and allowed, puts it on the idle -+ * tree. -+ */ -+static bool __bfq_deactivate_entity(struct bfq_entity *entity, -+ bool ins_into_idle_tree) -+{ -+ struct bfq_sched_data *sd = entity->sched_data; -+ struct bfq_service_tree *st; -+ bool is_in_service; -+ -+ if (!entity->on_st) { /* entity never activated, or already inactive */ -+ BUG_ON(sd && entity == sd->in_service_entity); -+ return false; -+ } -+ -+ /* -+ * If we get here, then entity is active, which implies that -+ * bfq_group_set_parent has already been invoked for the group -+ * represented by entity. Therefore, the field -+ * entity->sched_data has been set, and we can safely use it. -+ */ -+ st = bfq_entity_service_tree(entity); -+ is_in_service = entity == sd->in_service_entity; -+ -+ BUG_ON(is_in_service && entity->tree && entity->tree != &st->active); -+ -+ if (is_in_service) -+ bfq_calc_finish(entity, entity->service); -+ -+ if (entity->tree == &st->active) -+ bfq_active_extract(st, entity); -+ else if (!is_in_service && entity->tree == &st->idle) -+ bfq_idle_extract(st, entity); -+ else if (entity->tree) -+ BUG(); -+ -+ if (!ins_into_idle_tree || !bfq_gt(entity->finish, st->vtime)) -+ bfq_forget_entity(st, entity, is_in_service); -+ else -+ bfq_idle_insert(st, entity); -+ -+ return true; -+} -+ -+/** -+ * bfq_deactivate_entity - deactivate an entity representing a bfq_queue. -+ * @entity: the entity to deactivate. 
-+ * @ins_into_idle_tree: true if the entity can be put on the idle tree -+ */ -+static void bfq_deactivate_entity(struct bfq_entity *entity, -+ bool ins_into_idle_tree, -+ bool expiration) -+{ -+ struct bfq_sched_data *sd; -+ struct bfq_entity *parent = NULL; -+ -+ for_each_entity_safe(entity, parent) { -+ sd = entity->sched_data; -+ -+ BUG_ON(sd == NULL); /* -+ * It would mean that this is the -+ * root group. -+ */ -+ -+ BUG_ON(expiration && entity != sd->in_service_entity); -+ -+ BUG_ON(entity != sd->in_service_entity && -+ entity->tree == -+ &bfq_entity_service_tree(entity)->active && -+ !sd->next_in_service); -+ -+ if (!__bfq_deactivate_entity(entity, ins_into_idle_tree)) { -+ /* -+ * entity is not in any tree any more, so -+ * this deactivation is a no-op, and there is -+ * nothing to change for upper-level entities -+ * (in case of expiration, this can never -+ * happen). -+ */ -+ BUG_ON(expiration); /* -+ * entity cannot be already out of -+ * any tree -+ */ -+ return; -+ } -+ -+ if (sd->next_in_service == entity) -+ /* -+ * entity was the next_in_service entity, -+ * then, since entity has just been -+ * deactivated, a new one must be found. -+ */ -+ bfq_update_next_in_service(sd, NULL); -+ -+ if (sd->next_in_service) { -+ /* -+ * The parent entity is still backlogged, -+ * because next_in_service is not NULL. So, no -+ * further upwards deactivation must be -+ * performed. Yet, next_in_service has -+ * changed. Then the schedule does need to be -+ * updated upwards. -+ */ -+ BUG_ON(sd->next_in_service == entity); -+ break; -+ } -+ -+ /* -+ * If we get here, then the parent is no more -+ * backlogged and we need to propagate the -+ * deactivation upwards. Thus let the loop go on. -+ */ -+ -+ /* -+ * Also let parent be queued into the idle tree on -+ * deactivation, to preserve service guarantees, and -+ * assuming that who invoked this function does not -+ * need parent entities too to be removed completely. 
-+ */ -+ ins_into_idle_tree = true; -+ } -+ -+ /* -+ * If the deactivation loop is fully executed, then there are -+ * no more entities to touch and next loop is not executed at -+ * all. Otherwise, requeue remaining entities if they are -+ * about to stop receiving service, or reposition them if this -+ * is not the case. -+ */ -+ entity = parent; -+ for_each_entity(entity) { -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ /* -+ * Invoke __bfq_requeue_entity on entity, even if -+ * already active, to requeue/reposition it in the -+ * active tree (because sd->next_in_service has -+ * changed) -+ */ -+ __bfq_requeue_entity(entity); -+ -+ sd = entity->sched_data; -+ BUG_ON(expiration && sd->in_service_entity != entity); -+ -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "invoking udpdate_next for this queue"); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ else { -+ struct bfq_group *bfqg = -+ container_of(entity, -+ struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "invoking udpdate_next for this entity"); -+ } -+#endif -+ if (!bfq_update_next_in_service(sd, entity) && -+ !expiration) -+ /* -+ * next_in_service unchanged or not causing -+ * any change in entity->parent->sd, and no -+ * requeueing needed for expiration: stop -+ * here. -+ */ -+ break; -+ } -+} -+ -+/** -+ * bfq_calc_vtime_jump - compute the value to which the vtime should jump, -+ * if needed, to have at least one entity eligible. -+ * @st: the service tree to act upon. -+ * -+ * Assumes that st is not empty. 
-+ */ -+static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st) -+{ -+ struct bfq_entity *root_entity = bfq_root_active_entity(&st->active); -+ -+ if (bfq_gt(root_entity->min_start, st->vtime)) { -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(root_entity); -+ -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "calc_vtime_jump: new value %llu", -+ root_entity->min_start); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ else { -+ struct bfq_group *bfqg = -+ container_of(root_entity, struct bfq_group, -+ entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "calc_vtime_jump: new value %llu", -+ root_entity->min_start); -+ } -+#endif -+ return root_entity->min_start; -+ } -+ return st->vtime; -+} -+ -+static void bfq_update_vtime(struct bfq_service_tree *st, u64 new_value) -+{ -+ if (new_value > st->vtime) { -+ st->vtime = new_value; -+ bfq_forget_idle(st); -+ } -+} -+ -+/** -+ * bfq_first_active_entity - find the eligible entity with -+ * the smallest finish time -+ * @st: the service tree to select from. -+ * @vtime: the system virtual to use as a reference for eligibility -+ * -+ * This function searches the first schedulable entity, starting from the -+ * root of the tree and going on the left every time on this side there is -+ * a subtree with at least one eligible (start >= vtime) entity. The path on -+ * the right is followed only if a) the left subtree contains no eligible -+ * entities and b) no eligible entity has been found yet. 
-+ */ -+static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st, -+ u64 vtime) -+{ -+ struct bfq_entity *entry, *first = NULL; -+ struct rb_node *node = st->active.rb_node; -+ -+ while (node) { -+ entry = rb_entry(node, struct bfq_entity, rb_node); -+left: -+ if (!bfq_gt(entry->start, vtime)) -+ first = entry; -+ -+ BUG_ON(bfq_gt(entry->min_start, vtime)); -+ -+ if (node->rb_left) { -+ entry = rb_entry(node->rb_left, -+ struct bfq_entity, rb_node); -+ if (!bfq_gt(entry->min_start, vtime)) { -+ node = node->rb_left; -+ goto left; -+ } -+ } -+ if (first) -+ break; -+ node = node->rb_right; -+ } -+ -+ BUG_ON(!first && !RB_EMPTY_ROOT(&st->active)); -+ return first; -+} -+ -+/** -+ * __bfq_lookup_next_entity - return the first eligible entity in @st. -+ * @st: the service tree. -+ * -+ * If there is no in-service entity for the sched_data st belongs to, -+ * then return the entity that will be set in service if: -+ * 1) the parent entity this st belongs to is set in service; -+ * 2) no entity belonging to such parent entity undergoes a state change -+ * that would influence the timestamps of the entity (e.g., becomes idle, -+ * becomes backlogged, changes its budget, ...). -+ * -+ * In this first case, update the virtual time in @st too (see the -+ * comments on this update inside the function). -+ * -+ * In constrast, if there is an in-service entity, then return the -+ * entity that would be set in service if not only the above -+ * conditions, but also the next one held true: the currently -+ * in-service entity, on expiration, -+ * 1) gets a finish time equal to the current one, or -+ * 2) is not eligible any more, or -+ * 3) is idle. 
-+ */ -+static struct bfq_entity * -+__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service -+#if 0 -+ , bool force -+#endif -+ ) -+{ -+ struct bfq_entity *entity -+#if 0 -+ , *new_next_in_service = NULL -+#endif -+ ; -+ u64 new_vtime; -+ struct bfq_queue *bfqq; -+ -+ if (RB_EMPTY_ROOT(&st->active)) -+ return NULL; -+ -+ /* -+ * Get the value of the system virtual time for which at -+ * least one entity is eligible. -+ */ -+ new_vtime = bfq_calc_vtime_jump(st); -+ -+ /* -+ * If there is no in-service entity for the sched_data this -+ * active tree belongs to, then push the system virtual time -+ * up to the value that guarantees that at least one entity is -+ * eligible. If, instead, there is an in-service entity, then -+ * do not make any such update, because there is already an -+ * eligible entity, namely the in-service one (even if the -+ * entity is not on st, because it was extracted when set in -+ * service). -+ */ -+ if (!in_service) -+ bfq_update_vtime(st, new_vtime); -+ -+ entity = bfq_first_active_entity(st, new_vtime); -+ BUG_ON(bfq_gt(entity->start, new_vtime)); -+ -+ /* Log some information */ -+ bfqq = bfq_entity_to_bfqq(entity); -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "__lookup_next: start %llu vtime %llu st %p", -+ ((entity->start>>10)*1000)>>12, -+ ((new_vtime>>10)*1000)>>12, st); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "__lookup_next: start %llu vtime %llu st %p", -+ ((entity->start>>10)*1000)>>12, -+ ((new_vtime>>10)*1000)>>12, st); -+ } -+#endif -+ -+ BUG_ON(!entity); -+ -+ return entity; -+} -+ -+/** -+ * bfq_lookup_next_entity - return the first eligible entity in @sd. -+ * @sd: the sched_data. -+ * -+ * This function is invoked when there has been a change in the trees -+ * for sd, and we need know what is the new next entity after this -+ * change. 
-+ */ -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd) -+{ -+ struct bfq_service_tree *st = sd->service_tree; -+ struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1); -+ struct bfq_entity *entity = NULL; -+ struct bfq_queue *bfqq; -+ int class_idx = 0; -+ -+ BUG_ON(!sd); -+ BUG_ON(!st); -+ /* -+ * Choose from idle class, if needed to guarantee a minimum -+ * bandwidth to this class (and if there is some active entity -+ * in idle class). This should also mitigate -+ * priority-inversion problems in case a low priority task is -+ * holding file system resources. -+ */ -+ if (time_is_before_jiffies(sd->bfq_class_idle_last_service + -+ BFQ_CL_IDLE_TIMEOUT)) { -+ if (!RB_EMPTY_ROOT(&idle_class_st->active)) -+ class_idx = BFQ_IOPRIO_CLASSES - 1; -+ /* About to be served if backlogged, or not yet backlogged */ -+ sd->bfq_class_idle_last_service = jiffies; -+ } -+ -+ /* -+ * Find the next entity to serve for the highest-priority -+ * class, unless the idle class needs to be served. 
-+ */ -+ for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) { -+ entity = __bfq_lookup_next_entity(st + class_idx, -+ sd->in_service_entity); -+ -+ if (entity) -+ break; -+ } -+ -+ BUG_ON(!entity && -+ (!RB_EMPTY_ROOT(&st->active) || !RB_EMPTY_ROOT(&(st+1)->active) || -+ !RB_EMPTY_ROOT(&(st+2)->active))); -+ -+ if (!entity) -+ return NULL; -+ -+ /* Log some information */ -+ bfqq = bfq_entity_to_bfqq(entity); -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "chosen from st %p %d", -+ st + class_idx, class_idx); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "chosen from st %p %d", -+ st + class_idx, class_idx); -+ } -+#endif -+ -+ return entity; -+} -+ -+static bool next_queue_may_preempt(struct bfq_data *bfqd) -+{ -+ struct bfq_sched_data *sd = &bfqd->root_group->sched_data; -+ -+ return sd->next_in_service != sd->in_service_entity; -+} -+ -+/* -+ * Get next queue for service. -+ */ -+static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) -+{ -+ struct bfq_entity *entity = NULL; -+ struct bfq_sched_data *sd; -+ struct bfq_queue *bfqq; -+ -+ BUG_ON(bfqd->in_service_queue); -+ -+ if (bfqd->busy_queues == 0) -+ return NULL; -+ -+ /* -+ * Traverse the path from the root to the leaf entity to -+ * serve. Set in service all the entities visited along the -+ * way. 
-+ */ -+ sd = &bfqd->root_group->sched_data; -+ for (; sd ; sd = entity->my_sched_data) { -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ if (entity) { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg(bfqd, bfqg, -+ "get_next_queue: lookup in this group"); -+ if (!sd->next_in_service) -+ pr_crit("get_next_queue: lookup in this group"); -+ } else { -+ bfq_log_bfqg(bfqd, bfqd->root_group, -+ "get_next_queue: lookup in root group"); -+ if (!sd->next_in_service) -+ pr_crit("get_next_queue: lookup in root group"); -+ } -+#endif -+ -+ BUG_ON(!sd->next_in_service); -+ -+ /* -+ * WARNING. We are about to set the in-service entity -+ * to sd->next_in_service, i.e., to the (cached) value -+ * returned by bfq_lookup_next_entity(sd) the last -+ * time it was invoked, i.e., the last time when the -+ * service order in sd changed as a consequence of the -+ * activation or deactivation of an entity. In this -+ * respect, if we execute bfq_lookup_next_entity(sd) -+ * in this very moment, it may, although with low -+ * probability, yield a different entity than that -+ * pointed to by sd->next_in_service. This rare event -+ * happens in case there was no CLASS_IDLE entity to -+ * serve for sd when bfq_lookup_next_entity(sd) was -+ * invoked for the last time, while there is now one -+ * such entity. -+ * -+ * If the above event happens, then the scheduling of -+ * such entity in CLASS_IDLE is postponed until the -+ * service of the sd->next_in_service entity -+ * finishes. In fact, when the latter is expired, -+ * bfq_lookup_next_entity(sd) gets called again, -+ * exactly to update sd->next_in_service. -+ */ -+ -+ /* Make next_in_service entity become in_service_entity */ -+ entity = sd->next_in_service; -+ sd->in_service_entity = entity; -+ -+ /* -+ * Reset the accumulator of the amount of service that -+ * the entity is about to receive. 
-+ */ -+ entity->service = 0; -+ -+ /* -+ * If entity is no longer a candidate for next -+ * service, then we extract it from its active tree, -+ * for the following reason. To further boost the -+ * throughput in some special case, BFQ needs to know -+ * which is the next candidate entity to serve, while -+ * there is already an entity in service. In this -+ * respect, to make it easy to compute/update the next -+ * candidate entity to serve after the current -+ * candidate has been set in service, there is a case -+ * where it is necessary to extract the current -+ * candidate from its service tree. Such a case is -+ * when the entity just set in service cannot be also -+ * a candidate for next service. Details about when -+ * this conditions holds are reported in the comments -+ * on the function bfq_no_longer_next_in_service() -+ * invoked below. -+ */ -+ if (bfq_no_longer_next_in_service(entity)) -+ bfq_active_extract(bfq_entity_service_tree(entity), -+ entity); -+ -+ /* -+ * For the same reason why we may have just extracted -+ * entity from its active tree, we may need to update -+ * next_in_service for the sched_data of entity too, -+ * regardless of whether entity has been extracted. -+ * In fact, even if entity has not been extracted, a -+ * descendant entity may get extracted. Such an event -+ * would cause a change in next_in_service for the -+ * level of the descendant entity, and thus possibly -+ * back to upper levels. -+ * -+ * We cannot perform the resulting needed update -+ * before the end of this loop, because, to know which -+ * is the correct next-to-serve candidate entity for -+ * each level, we need first to find the leaf entity -+ * to set in service. In fact, only after we know -+ * which is the next-to-serve leaf entity, we can -+ * discover whether the parent entity of the leaf -+ * entity becomes the next-to-serve, and so on. 
-+ */ -+ -+ /* Log some information */ -+ bfqq = bfq_entity_to_bfqq(entity); -+ if (bfqq) -+ bfq_log_bfqq(bfqd, bfqq, -+ "get_next_queue: this queue, finish %llu", -+ (((entity->finish>>10)*1000)>>10)>>2); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg(bfqd, bfqg, -+ "get_next_queue: this entity, finish %llu", -+ (((entity->finish>>10)*1000)>>10)>>2); -+ } -+#endif -+ -+ } -+ -+ BUG_ON(!entity); -+ bfqq = bfq_entity_to_bfqq(entity); -+ BUG_ON(!bfqq); -+ -+ /* -+ * We can finally update all next-to-serve entities along the -+ * path from the leaf entity just set in service to the root. -+ */ -+ for_each_entity(entity) { -+ struct bfq_sched_data *sd = entity->sched_data; -+ -+ if(!bfq_update_next_in_service(sd, NULL)) -+ break; -+ } -+ -+ return bfqq; -+} -+ -+/* Leave the in-service state of @bfqd: put the io context of the in-service bic (if any), clear the wait_request flag of the in-service queue, try to cancel the idle-slice timer, and reset bfqd->in_service_queue together with every in_service_entity pointer on the path from the queue's entity to the root. Finally, put the queue if its entity is on no service tree. */ static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) -+{ -+ /* Capture the queue and its entity first: bfqd->in_service_queue is set to NULL below. */ struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue; -+ struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity; -+ struct bfq_entity *entity = in_serv_entity; -+ -+ /* Put the io context reached through the in-service bic, if one is set. */ if (bfqd->in_service_bic) { -+ put_io_context(bfqd->in_service_bic->icq.ioc); -+ bfqd->in_service_bic = NULL; -+ } -+ -+ bfq_clear_bfqq_wait_request(in_serv_bfqq); -+ hrtimer_try_to_cancel(&bfqd->idle_slice_timer); -+ bfqd->in_service_queue = NULL; -+ -+ /* -+ * When this function is called, all in-service entities have -+ * been properly deactivated or requeued, so we can safely -+ * execute the final step: reset in_service_entity along the -+ * path from entity to the root. -+ */ -+ for_each_entity(entity) -+ entity->sched_data->in_service_entity = NULL; -+ -+ /* -+ * in_serv_entity is no longer in service, so, if it is in no -+ * service tree either, then release the service reference to -+ * the queue it represents (taken with bfq_get_entity).
-+ */ -+ if (!in_serv_entity->on_st) -+ bfq_put_queue(in_serv_bfqq); -+} -+ -+/* Deactivate the entity embedded in @bfqq, forwarding @ins_into_idle_tree and @expiration unchanged to bfq_deactivate_entity(). @bfqd is not used by this wrapper. */ static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ bool ins_into_idle_tree, bool expiration) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ bfq_deactivate_entity(entity, ins_into_idle_tree, expiration); -+} -+ -+/* Activate the entity embedded in @bfqq through bfq_activate_requeue_entity(), passing the current value of the queue's non_blocking_wait_rq flag as second argument and false as third; the flag is cleared afterwards. Must not be invoked on the in-service queue (enforced with BUG_ON). */ static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity); -+ -+ BUG_ON(bfqq == bfqd->in_service_queue); -+ /* An entity flagged on_st must sit on either the active or the idle tree of its service tree. */ BUG_ON(entity->tree != &st->active && entity->tree != &st->idle && -+ entity->on_st); -+ -+ bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq), -+ false); -+ bfq_clear_bfqq_non_blocking_wait_rq(bfqq); -+} -+ -+/* Requeue the entity embedded in @bfqq through bfq_activate_requeue_entity(): the second argument is always false, the third tells whether @bfqq is the in-service queue. */ static void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ bfq_activate_requeue_entity(entity, false, -+ bfqq == bfqd->in_service_queue); -+} -+ -+/* Forward declaration: called by bfq_del_bfqq_busy() below. */ static void bfqg_stats_update_dequeue(struct bfq_group *bfqg); -+ -+/* -+ * Called when the bfqq no longer has requests pending, remove it from -+ * the service tree. As a special case, it can be invoked during an -+ * expiration.
-+ */ -+static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ bool expiration) -+{ -+ /* The queue must be flagged busy and its sort_list must already be empty. */ BUG_ON(!bfq_bfqq_busy(bfqq)); -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); -+ -+ bfq_log_bfqq(bfqd, bfqq, "del from busy"); -+ -+ bfq_clear_bfqq_busy(bfqq); -+ -+ BUG_ON(bfqd->busy_queues == 0); -+ bfqd->busy_queues--; -+ -+ /* The queue is removed from the queue weights tree here only if it has no dispatched requests. */ if (!bfqq->dispatched) -+ bfq_weights_tree_remove(bfqd, &bfqq->entity, -+ &bfqd->queue_weights_tree); -+ -+ /* Keep the count of busy weight-raised queues (wr_coeff > 1) in sync. */ if (bfqq->wr_coeff > 1) { -+ bfqd->wr_busy_queues--; -+ BUG_ON(bfqd->wr_busy_queues < 0); -+ } -+ -+ bfqg_stats_update_dequeue(bfqq_group(bfqq)); -+ -+ BUG_ON(bfqq->entity.budget < 0); -+ -+ /* Deactivate the queue's entity, always requesting insertion into the idle tree (third argument true). */ bfq_deactivate_bfqq(bfqd, bfqq, true, expiration); -+} -+ -+/* -+ * Called when an inactive queue receives a new request. -+ */ -+static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ /* The queue must be neither already busy nor the in-service queue. */ BUG_ON(bfq_bfqq_busy(bfqq)); -+ BUG_ON(bfqq == bfqd->in_service_queue); -+ -+ bfq_log_bfqq(bfqd, bfqq, "add to busy"); -+ -+ bfq_activate_bfqq(bfqd, bfqq); -+ -+ bfq_mark_bfqq_busy(bfqq); -+ bfqd->busy_queues++; -+ -+ /* Only a queue with no dispatched requests and not weight-raised (wr_coeff == 1) is added to the queue weights tree. */ if (!bfqq->dispatched) -+ if (bfqq->wr_coeff == 1) -+ bfq_weights_tree_add(bfqd, &bfqq->entity, -+ &bfqd->queue_weights_tree); -+ -+ /* Keep the count of busy weight-raised queues in sync; it can never exceed the number of busy queues. */ if (bfqq->wr_coeff > 1) { -+ bfqd->wr_busy_queues++; -+ BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues); -+ } -+ -+} -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -new file mode 100644 -index 000000000000..65e7c7e77f3c ---- /dev/null -+++ b/block/bfq-sq-iosched.c -@@ -0,0 +1,5379 @@ -+/* -+ * Budget Fair Queueing (BFQ) I/O scheduler. -+ * -+ * Based on ideas and code from CFQ: -+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> -+ * -+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> -+ * Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org> -+ * -+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ -+ * file.
-+ * -+ * BFQ is a proportional-share I/O scheduler, with some extra -+ * low-latency capabilities. BFQ also supports full hierarchical -+ * scheduling through cgroups. Next paragraphs provide an introduction -+ * on BFQ inner workings. Details on BFQ benefits and usage can be -+ * found in Documentation/block/bfq-iosched.txt. -+ * -+ * BFQ is a proportional-share storage-I/O scheduling algorithm based -+ * on the slice-by-slice service scheme of CFQ. But BFQ assigns -+ * budgets, measured in number of sectors, to processes instead of -+ * time slices. The device is not granted to the in-service process -+ * for a given time slice, but until it has exhausted its assigned -+ * budget. This change from the time to the service domain enables BFQ -+ * to distribute the device throughput among processes as desired, -+ * without any distortion due to throughput fluctuations, or to device -+ * internal queueing. BFQ uses an ad hoc internal scheduler, called -+ * B-WF2Q+, to schedule processes according to their budgets. More -+ * precisely, BFQ schedules queues associated with processes. Thanks to -+ * the accurate policy of B-WF2Q+, BFQ can afford to assign high -+ * budgets to I/O-bound processes issuing sequential requests (to -+ * boost the throughput), and yet guarantee a low latency to -+ * interactive and soft real-time applications. -+ * -+ * NOTE: if the main or only goal, with a given device, is to achieve -+ * the maximum-possible throughput at all times, then do switch off -+ * all low-latency heuristics for that device, by setting low_latency -+ * to 0. -+ * -+ * BFQ is described in [1], where also a reference to the initial, more -+ * theoretical paper on BFQ can be found. The interested reader can find -+ * in the latter paper full details on the main algorithm, as well as -+ * formulas of the guarantees and formal proofs of all the properties. 
-+ * With respect to the version of BFQ presented in these papers, this -+ * implementation adds a few more heuristics, such as the one that -+ * guarantees a low latency to soft real-time applications, and a -+ * hierarchical extension based on H-WF2Q+. -+ * -+ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with -+ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N) -+ * complexity derives from the one introduced with EEVDF in [3]. -+ * -+ * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O -+ * Scheduler", Proceedings of the First Workshop on Mobile System -+ * Technologies (MST-2015), May 2015. -+ * http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf -+ * -+ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf -+ * -+ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing -+ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689, -+ * Oct 1997. -+ * -+ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz -+ * -+ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline -+ * First: A Flexible and Accurate Mechanism for Proportional Share -+ * Resource Allocation,'' technical report. -+ * -+ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf -+ */ -+#include <linux/module.h> -+#include <linux/slab.h> -+#include <linux/blkdev.h> -+#include <linux/cgroup.h> -+#include <linux/elevator.h> -+#include <linux/jiffies.h> -+#include <linux/rbtree.h> -+#include <linux/ioprio.h> -+#include "blk.h" -+#include "bfq.h" -+ -+/* Expiration time of sync (0) and async (1) requests, in ns (a quarter and an eighth of a second, respectively). */ -+static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; -+ -+/* Maximum backwards seek, in KiB. */ -+static const int bfq_back_max = (16 * 1024); -+ -+/* Penalty of a backwards seek, in number of sectors. NOTE(review): bfq_choose_req() multiplies the backward distance by bfqd->bfq_back_penalty, so this looks like a multiplicative factor rather than a sector count - confirm. */ -+static const int bfq_back_penalty = 2; -+ -+/* Idling period duration, in ns.
*/ -+static u32 bfq_slice_idle = (NSEC_PER_SEC / 125); -+ -+/* Minimum number of assigned budgets for which stats are safe to compute. */ -+static const int bfq_stats_min_budgets = 194; -+ -+/* Default maximum budget values, in sectors and number of requests. NOTE(review): only one value is defined here; the mention of a number of requests may be stale - confirm. */ -+static const int bfq_default_max_budget = (16 * 1024); -+ -+/* -+ * Async to sync throughput distribution is controlled as follows: -+ * when an async request is served, the entity is charged the number -+ * of sectors of the request, multiplied by the factor below -+ */ -+static const int bfq_async_charge_factor = 10; -+ -+/* Default timeout values, in jiffies, approximating CFQ defaults. */ -+static const int bfq_timeout = (HZ / 8); -+ -+/* Slab cache handle. NOTE(review): presumably the cache bfq_queue objects are allocated from - confirm against its users. */ static struct kmem_cache *bfq_pool; -+ -+/* Below this threshold (in ns), we consider thinktime immediate. */ -+#define BFQ_MIN_TT (2 * NSEC_PER_MSEC) -+ -+/* hw_tag detection: parallel requests threshold and min samples needed. */ -+#define BFQ_HW_QUEUE_THRESHOLD 4 -+#define BFQ_HW_QUEUE_SAMPLES 32 -+ -+/* Thresholds cast to sector_t. NOTE(review): presumably distances in sectors used by the seekiness and closeness heuristics - confirm against their users. */ #define BFQQ_SEEK_THR (sector_t)(8 * 100) -+#define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32) -+#define BFQQ_CLOSE_THR (sector_t)(8 * 1024) -+/* True when more than 32/8 = 4 bits are set in the seek_history of the queue. */ #define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8) -+ -+/* Min number of samples required to perform peak-rate update */ -+#define BFQ_RATE_MIN_SAMPLES 32 -+/* Min observation time interval required to perform a peak-rate update (ns) */ -+#define BFQ_RATE_MIN_INTERVAL (300*NSEC_PER_MSEC) -+/* Target observation time interval for a peak-rate update (ns) */ -+#define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC -+ -+/* Shift used for peak rate fixed precision calculations. */ -+#define BFQ_RATE_SHIFT 16 -+ -+/* -+ * By default, BFQ computes the duration of the weight raising for -+ * interactive applications automatically, using the following formula: -+ * duration = (R / r) * T, where r is the peak rate of the device, and -+ * R and T are two reference parameters.
-+ * In particular, R is the peak rate of the reference device (see below), -+ * and T is a reference time: given the systems that are likely to be -+ * installed on the reference device according to its speed class, T is -+ * about the maximum time needed, under BFQ and while reading two files in -+ * parallel, to load typical large applications on these systems. -+ * In practice, the slower/faster the device at hand is, the more/less it -+ * takes to load applications with respect to the reference device. -+ * Accordingly, the longer/shorter BFQ grants weight raising to interactive -+ * applications. -+ * -+ * BFQ uses four different reference pairs (R, T), depending on: -+ * . whether the device is rotational or non-rotational; -+ * . whether the device is slow, such as old or portable HDDs, as well as -+ * SD cards, or fast, such as newer HDDs and SSDs. -+ * -+ * The device's speed class is dynamically (re)detected in -+ * bfq_update_peak_rate() every time the estimated peak rate is updated. -+ * -+ * In the following definitions, R_slow[0]/R_fast[0] and -+ * T_slow[0]/T_fast[0] are the reference values for a slow/fast -+ * rotational device, whereas R_slow[1]/R_fast[1] and -+ * T_slow[1]/T_fast[1] are the reference values for a slow/fast -+ * non-rotational device. Finally, device_speed_thresh are the -+ * thresholds used to switch between speed classes. The reference -+ * rates are not the actual peak rates of the devices used as a -+ * reference, but slightly lower values. The reason for using these -+ * slightly lower values is that the peak-rate estimator tends to -+ * yield slightly lower values than the actual peak rate (it can yield -+ * the actual peak rate only if there is only one process doing I/O, -+ * and the process does sequential I/O). -+ * -+ * Both the reference peak rates and the thresholds are measured in -+ * sectors/usec, left-shifted by BFQ_RATE_SHIFT. 
-+ */ -+static int R_slow[2] = {1000, 10700}; -+static int R_fast[2] = {14000, 33000}; -+/* -+ * To improve readability, a conversion function is used to initialize the -+ * following arrays, which entails that they can be initialized only in a -+ * function. -+ */ -+static int T_slow[2]; -+static int T_fast[2]; -+static int device_speed_thresh[2]; -+ -+/* Compound-literal initializer for a struct bfq_service_tree: two empty rbtree roots followed by NULL/zero fields. */ #define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ -+ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) -+ -+/* Accessors for the two elevator-private pointers of a request: priv[0] is read as a struct bfq_io_cq pointer, priv[1] is returned uncast (presumably the bfq_queue, as the macro name suggests). */ #define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0]) -+#define RQ_BFQQ(rq) ((rq)->elv.priv[1]) -+ -+static void bfq_schedule_dispatch(struct bfq_data *bfqd); -+ -+/* The following .c files are textually included into this translation unit. */ #include "bfq-ioc.c" -+#include "bfq-sched.c" -+#include "bfq-cgroup-included.c" -+ -+#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE) -+#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT) -+ -+/* A sample count is deemed valid only above 80 samples. */ #define bfq_sample_valid(samples) ((samples) > 80) -+ -+/* -+ * Scheduler run of queue, if there are requests pending and no one in the -+ * driver that will restart queueing. -+ */ -+static void bfq_schedule_dispatch(struct bfq_data *bfqd) -+{ -+ if (bfqd->queued != 0) { -+ bfq_log(bfqd, "schedule dispatch"); -+ kblockd_schedule_work(&bfqd->unplug_work); -+ } -+} -+ -+/* -+ * Lifted from AS - choose which of rq1 and rq2 is best served now. -+ * We choose the request that is closer to the head right now. Distance -+ * behind the head is penalized and only allowed to a certain extent. -+ * -+ * @last is the sector that distances are measured from. If rq1 is NULL -+ * or the two requests coincide, rq2 is returned; if rq2 is NULL, rq1 is -+ * returned. -+ */ -+static struct request *bfq_choose_req(struct bfq_data *bfqd, -+ struct request *rq1, -+ struct request *rq2, -+ sector_t last) -+{ -+ sector_t s1, s2, d1 = 0, d2 = 0; -+ unsigned long back_max; -+#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */ -+#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */ -+ unsigned int wrap = 0; /* bit mask: requests behind the disk head?
*/ -+ -+ if (!rq1 || rq1 == rq2) -+ return rq2; -+ if (!rq2) -+ return rq1; -+ -+ /* First criterion: a sync request wins over an async one. */ if (rq_is_sync(rq1) && !rq_is_sync(rq2)) -+ return rq1; -+ else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) -+ return rq2; -+ /* Second criterion: a request flagged REQ_META wins over one that is not. */ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) -+ return rq1; -+ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META)) -+ return rq2; -+ -+ s1 = blk_rq_pos(rq1); -+ s2 = blk_rq_pos(rq2); -+ -+ /* -+ * By definition, 1KiB is 2 sectors. -+ */ -+ back_max = bfqd->bfq_back_max * 2; -+ -+ /* -+ * Strict one way elevator _except_ in the case where we allow -+ * short backward seeks which are biased as twice the cost of a -+ * similar forward seek. -+ */ -+ if (s1 >= last) -+ d1 = s1 - last; -+ else if (s1 + back_max >= last) -+ d1 = (last - s1) * bfqd->bfq_back_penalty; -+ else -+ wrap |= BFQ_RQ1_WRAP; -+ -+ if (s2 >= last) -+ d2 = s2 - last; -+ else if (s2 + back_max >= last) -+ d2 = (last - s2) * bfqd->bfq_back_penalty; -+ else -+ wrap |= BFQ_RQ2_WRAP; -+ -+ /* Found required data */ -+ -+ /* -+ * By doing switch() on the bit mask "wrap" we avoid having to -+ * check two variables for all permutations: --> faster! -+ */ -+ switch (wrap) { -+ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ -+ if (d1 < d2) -+ return rq1; -+ else if (d2 < d1) -+ return rq2; -+ -+ /* Equal distances: the request at the higher (or same) sector is chosen. */ if (s1 >= s2) -+ return rq1; -+ else -+ return rq2; -+ -+ case BFQ_RQ2_WRAP: -+ return rq1; -+ case BFQ_RQ1_WRAP: -+ return rq2; -+ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */ -+ default: -+ /* -+ * Since both rqs are wrapped, -+ * start with the one that's further behind head -+ * (--> only *one* back seek required), -+ * since back seek takes more time than forward.
-+ */ -+ if (s1 <= s2) -+ return rq1; -+ else -+ return rq2; -+ } -+} -+ -+/* Search the rbtree @root, whose nodes are queues ordered by the position of their next_rq, for a queue whose next request starts exactly at @sector. Returns that queue, or NULL if there is none. In both cases *ret_parent receives the last node visited (the matching node on a hit, NULL for an empty tree) and, if @rb_link is not NULL, *rb_link receives the link at which the descent stopped; on a miss that pair is what bfq_pos_tree_add_move() hands to rb_link_node() for insertion. */ static struct bfq_queue * -+bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, -+ sector_t sector, struct rb_node **ret_parent, -+ struct rb_node ***rb_link) -+{ -+ struct rb_node **p, *parent; -+ struct bfq_queue *bfqq = NULL; -+ -+ parent = NULL; -+ p = &root->rb_node; -+ while (*p) { -+ struct rb_node **n; -+ -+ parent = *p; -+ bfqq = rb_entry(parent, struct bfq_queue, pos_node); -+ -+ /* -+ * Sort strictly based on sector. Smallest to the left, -+ * largest to the right. -+ */ -+ if (sector > blk_rq_pos(bfqq->next_rq)) -+ n = &(*p)->rb_right; -+ else if (sector < blk_rq_pos(bfqq->next_rq)) -+ n = &(*p)->rb_left; -+ else -+ break; -+ p = n; -+ /* No match at this node: forget it, so that NULL is returned if the descent ends here. */ bfqq = NULL; -+ } -+ -+ *ret_parent = parent; -+ if (rb_link) -+ *rb_link = p; -+ -+ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d", -+ (unsigned long long) sector, -+ bfqq ? bfqq->pid : 0); -+ -+ return bfqq; -+} -+ -+/* (Re)position @bfqq in the rq_pos_tree of its group according to the sector of its next_rq. The queue is first erased from the position tree it is in, if any. It is then not re-inserted if it belongs to the idle class, has no next_rq, or another queue already occupies the same sector; in all these cases bfqq->pos_root is left NULL. */ static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct rb_node **p, *parent; -+ struct bfq_queue *__bfqq; -+ -+ if (bfqq->pos_root) { -+ rb_erase(&bfqq->pos_node, bfqq->pos_root); -+ bfqq->pos_root = NULL; -+ } -+ -+ if (bfq_class_idle(bfqq)) -+ return; -+ if (!bfqq->next_rq) -+ return; -+ -+ bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, -+ blk_rq_pos(bfqq->next_rq), &parent, &p); -+ if (!__bfqq) { -+ rb_link_node(&bfqq->pos_node, parent, p); -+ rb_insert_color(&bfqq->pos_node, bfqq->pos_root); -+ } else -+ bfqq->pos_root = NULL; -+} -+ -+/* -+ * Tell whether there are active queues or groups with differentiated weights. -+ */ -+static bool bfq_differentiated_weights(struct bfq_data *bfqd) -+{ -+ /* -+ * For weights to differ, at least one of the trees must contain -+ * at least two nodes.
-+ */ -+ /* A tree has at least two nodes exactly when it is non-empty and its root has a child; the group tree is checked only with CONFIG_BFQ_SQ_GROUP_IOSCHED. */ return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && -+ (bfqd->queue_weights_tree.rb_node->rb_left || -+ bfqd->queue_weights_tree.rb_node->rb_right) -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ ) || -+ (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) && -+ (bfqd->group_weights_tree.rb_node->rb_left || -+ bfqd->group_weights_tree.rb_node->rb_right) -+#endif -+ ); -+} -+ -+/* -+ * The following function returns true if every queue must receive the -+ * same share of the throughput (this condition is used when deciding -+ * whether idling may be disabled, see the comments in the function -+ * bfq_bfqq_may_idle()). -+ * -+ * Such a scenario occurs when: -+ * 1) all active queues have the same weight, -+ * 2) all active groups at the same level in the groups tree have the same -+ * weight, -+ * 3) all active groups at the same level in the groups tree have the same -+ * number of children. -+ * -+ * Unfortunately, keeping the necessary state for evaluating exactly the -+ * above symmetry conditions would be quite complex and time-consuming. -+ * Therefore this function evaluates, instead, the following stronger -+ * sub-conditions, for which it is much easier to maintain the needed -+ * state: -+ * 1) all active queues have the same weight, -+ * 2) all active groups have the same weight, -+ * 3) all active groups have at most one active child each. -+ * In particular, the last two conditions are always true if hierarchical -+ * support and the cgroups interface are not enabled, thus no state needs -+ * to be maintained in this case. -+ */ -+static bool bfq_symmetric_scenario(struct bfq_data *bfqd) -+{ -+ return !bfq_differentiated_weights(bfqd); -+} -+ -+/* -+ * If the weight-counter tree passed as input contains no counter for -+ * the weight of the input entity, then add that counter; otherwise just -+ * increment the existing counter. -+ * -+ * Note that weight-counter trees contain few nodes in mostly symmetric -+ * scenarios.
For example, if all queues have the same weight, then the -+ * weight-counter tree for the queues may contain at most one node. -+ * This holds even if low_latency is on, because weight-raised queues -+ * are not inserted in the tree. -+ * In most scenarios, the rate at which nodes are created/destroyed -+ * should be low too. -+ */ -+static void bfq_weights_tree_add(struct bfq_data *bfqd, -+ struct bfq_entity *entity, -+ struct rb_root *root) -+{ -+ struct rb_node **new = &(root->rb_node), *parent = NULL; -+ -+ /* -+ * Do not insert if the entity is already associated with a -+ * counter, which happens if: -+ * 1) the entity is associated with a queue, -+ * 2) a request arrival has caused the queue to become both -+ * non-weight-raised, and hence change its weight, and -+ * backlogged; in this respect, each of the two events -+ * causes an invocation of this function, -+ * 3) this is the invocation of this function caused by the -+ * second event. This second invocation is actually useless, -+ * and we handle this fact by exiting immediately. More -+ * efficient or clearer solutions might possibly be adopted. -+ */ -+ if (entity->weight_counter) -+ return; -+ -+ /* Descend the tree, which is ordered by weight: smaller weights to the left, larger to the right. */ while (*new) { -+ struct bfq_weight_counter *__counter = container_of(*new, -+ struct bfq_weight_counter, -+ weights_node); -+ parent = *new; -+ -+ /* A counter for this weight already exists: share it and just bump its count. */ if (entity->weight == __counter->weight) { -+ entity->weight_counter = __counter; -+ goto inc_counter; -+ } -+ if (entity->weight < __counter->weight) -+ new = &((*new)->rb_left); -+ else -+ new = &((*new)->rb_right); -+ } -+ -+ /* No counter for this weight yet: allocate a zeroed one, to be linked at the position found above. */ entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter), -+ GFP_ATOMIC); -+ -+ /* -+ * In the unlucky event of an allocation failure, we just -+ * exit. This will cause the weight of entity to not be -+ * considered in bfq_differentiated_weights, which, in its -+ * turn, causes the scenario to be deemed wrongly symmetric in -+ * case entity's weight would have been the only weight making -+ * the scenario asymmetric.
On the bright side, no imbalance -+ * will however occur when entity becomes inactive again (the -+ * invocation of this function is triggered by an activation -+ * of entity). In fact, bfq_weights_tree_remove does nothing -+ * if !entity->weight_counter. -+ */ -+ if (unlikely(!entity->weight_counter)) -+ return; -+ -+ entity->weight_counter->weight = entity->weight; -+ rb_link_node(&entity->weight_counter->weights_node, parent, new); -+ rb_insert_color(&entity->weight_counter->weights_node, root); -+ -+inc_counter: -+ entity->weight_counter->num_active++; -+} -+ -+/* -+ * Decrement the weight counter associated with the entity, and, if the -+ * counter reaches 0, remove the counter from the tree. -+ * See the comments to the function bfq_weights_tree_add() for considerations -+ * about overhead. -+ */ -+static void bfq_weights_tree_remove(struct bfq_data *bfqd, -+ struct bfq_entity *entity, -+ struct rb_root *root) -+{ -+ /* Nothing to do for an entity that is associated with no counter. */ if (!entity->weight_counter) -+ return; -+ -+ BUG_ON(RB_EMPTY_ROOT(root)); -+ BUG_ON(entity->weight_counter->weight != entity->weight); -+ -+ BUG_ON(!entity->weight_counter->num_active); -+ entity->weight_counter->num_active--; -+ /* The counter is still in use by other entities: keep it in the tree and only detach this entity from it. */ if (entity->weight_counter->num_active > 0) -+ goto reset_entity_pointer; -+ -+ /* This entity was the last user: unlink the counter from the tree and free it. */ rb_erase(&entity->weight_counter->weights_node, root); -+ kfree(entity->weight_counter); -+ -+reset_entity_pointer: -+ entity->weight_counter = NULL; -+} -+ -+/* -+ * Return expired entry, or NULL to just start from scratch in rbtree.
-+ */ -+static struct request *bfq_check_fifo(struct bfq_queue *bfqq, -+ struct request *last) -+{ -+ struct request *rq; -+ -+ /* The fifo is examined only while the fifo_expire flag is clear; the flag is set right below. */ if (bfq_bfqq_fifo_expire(bfqq)) -+ return NULL; -+ -+ bfq_mark_bfqq_fifo_expire(bfqq); -+ -+ /* Only the request at the head of the fifo is considered. */ rq = rq_entry_fifo(bfqq->fifo.next); -+ -+ /* Nothing to return if that request is @last itself or its fifo_time has not been reached yet. */ if (rq == last || ktime_get_ns() < rq->fifo_time) -+ return NULL; -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq); -+ BUG_ON(RB_EMPTY_NODE(&rq->rb_node)); -+ return rq; -+} -+ -+/* Choose the request of @bfqq to serve after @last. If bfq_check_fifo() returns an expired request, that one is chosen. Otherwise the candidates are the neighbours of @last in bfqq->sort_list: its predecessor, and its successor or, when there is no successor, the first request of the tree unless that is @last itself; bfq_choose_req() picks between them, measuring distances from the position of @last. The fifo of @bfqq must not be empty (enforced with BUG_ON). */ static struct request *bfq_find_next_rq(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ struct request *last) -+{ -+ struct rb_node *rbnext = rb_next(&last->rb_node); -+ struct rb_node *rbprev = rb_prev(&last->rb_node); -+ struct request *next, *prev = NULL; -+ -+ BUG_ON(list_empty(&bfqq->fifo)); -+ -+ /* Follow expired path, else get first next available. */ -+ next = bfq_check_fifo(bfqq, last); -+ if (next) { -+ BUG_ON(next == last); -+ return next; -+ } -+ -+ BUG_ON(RB_EMPTY_NODE(&last->rb_node)); -+ -+ if (rbprev) -+ prev = rb_entry_rq(rbprev); -+ -+ if (rbnext) -+ next = rb_entry_rq(rbnext); -+ else { -+ /* No successor: wrap around to the first request of the tree, unless it is @last itself (next then stays NULL). */ rbnext = rb_first(&bfqq->sort_list); -+ if (rbnext && rbnext != &last->rb_node) -+ next = rb_entry_rq(rbnext); -+ } -+ -+ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last)); -+} -+ -+/* Return the amount of service to charge for @rq: its size in sectors if @bfqq is sync or weight-raised, an amplified size otherwise; see the definition of bfq_async_charge_factor for details */ -+static unsigned long bfq_serv_to_charge(struct request *rq, -+ struct bfq_queue *bfqq) -+{ -+ if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1) -+ return blk_rq_sectors(rq); -+ -+ /* -+ * If there are no weight-raised queues, then amplify service -+ * by just the async charge factor; otherwise amplify service -+ * by twice the async charge factor, to further reduce latency -+ * for weight-raised queues. -+ */ -+ if (bfqq->bfqd->wr_busy_queues == 0) -+ return blk_rq_sectors(rq) * bfq_async_charge_factor; -+ -+ return blk_rq_sectors(rq) * 2 * bfq_async_charge_factor; -+} -+ -+/** -+ * bfq_updated_next_req - update the queue after a new next_rq selection.
-+ * @bfqd: the device data the queue belongs to. -+ * @bfqq: the queue to update. -+ * -+ * If the first request of a queue changes we make sure that the queue -+ * has enough budget to serve at least its first request (if the -+ * request has grown). We do this because if the queue has not enough -+ * budget for its first request, it has to go through two dispatch -+ * rounds to actually get it dispatched. -+ */ -+static void bfq_updated_next_req(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity); -+ struct request *next_rq = bfqq->next_rq; -+ unsigned long new_budget; -+ -+ if (!next_rq) -+ return; -+ -+ if (bfqq == bfqd->in_service_queue) -+ /* -+ * In order not to break guarantees, budgets cannot be -+ * changed after an entity has been selected. -+ */ -+ return; -+ -+ BUG_ON(entity->tree != &st->active); -+ BUG_ON(entity == entity->sched_data->in_service_entity); -+ -+ /* The new budget is the larger of the queue's max_budget and the service to charge for next_rq. */ new_budget = max_t(unsigned long, bfqq->max_budget, -+ bfq_serv_to_charge(next_rq, bfqq)); -+ /* Requeue only if the budget actually changed. */ if (entity->budget != new_budget) { -+ entity->budget = new_budget; -+ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", -+ new_budget); -+ bfq_requeue_bfqq(bfqd, bfqq); -+ } -+} -+ -+/* Return the duration of weight raising: bfqd->bfq_wr_max_time if it is positive, otherwise bfqd->RT_prod divided by bfqd->peak_rate, clamped between the jiffies equivalents of 3 and 13 seconds. */ static unsigned int bfq_wr_duration(struct bfq_data *bfqd) -+{ -+ u64 dur; -+ -+ if (bfqd->bfq_wr_max_time > 0) -+ return bfqd->bfq_wr_max_time; -+ -+ dur = bfqd->RT_prod; -+ /* NOTE(review): the divisor is bfqd->peak_rate and nothing here guards against zero - confirm that it is always non-zero when this function runs. */ do_div(dur, bfqd->peak_rate); -+ -+ /* -+ * Limit duration between 3 and 13 seconds. Tests show that -+ * higher values than 13 seconds often yield the opposite of -+ * the desired result, i.e., worsen responsiveness by letting -+ * non-interactive and non-soft-real-time applications -+ * preserve weight raising for a too long time interval. -+ * -+ * On the other end, lower values than 3 seconds make it -+ * difficult for most interactive tasks to complete their jobs -+ * before weight-raising finishes.
-+ */ -+ if (dur > msecs_to_jiffies(13000)) -+ dur = msecs_to_jiffies(13000); -+ else if (dur < msecs_to_jiffies(3000)) -+ dur = msecs_to_jiffies(3000); -+ -+ return dur; -+} -+ -+/* Restore into @bfqq the state saved in @bic: the idle_window and IO_bound flags, the weight-raising coefficient and the weight-raising timestamps. Weight raising is then switched off (wr_coeff = 1) if the queue is in a large burst or its weight-raising period has already elapsed. entity.prio_changed is always set. If @bfq_already_existing is true and the queue is busy, bfqd->wr_busy_queues is adjusted to reflect a transition of wr_coeff from 1 to more than 1 or vice versa. */ static void -+bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, -+ struct bfq_io_cq *bic, bool bfq_already_existing) -+{ -+ unsigned int old_wr_coeff; -+ bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq); -+ -+ if (bic->saved_idle_window) -+ bfq_mark_bfqq_idle_window(bfqq); -+ else -+ bfq_clear_bfqq_idle_window(bfqq); -+ -+ if (bic->saved_IO_bound) -+ bfq_mark_bfqq_IO_bound(bfqq); -+ else -+ bfq_clear_bfqq_IO_bound(bfqq); -+ -+ /* old_wr_coeff is assigned here, and read at the end of the function, only when busy is true. */ if (unlikely(busy)) -+ old_wr_coeff = bfqq->wr_coeff; -+ -+ bfqq->wr_coeff = bic->saved_wr_coeff; -+ bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; -+ BUG_ON(time_is_after_jiffies(bfqq->wr_start_at_switch_to_srt)); -+ bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish; -+ bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time; -+ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); -+ -+ if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || -+ time_is_before_jiffies(bfqq->last_wr_start_finish + -+ bfqq->wr_cur_max_time))) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "resume state: switching off wr (%lu + %lu < %lu)", -+ bfqq->last_wr_start_finish, bfqq->wr_cur_max_time, -+ jiffies); -+ -+ bfqq->wr_coeff = 1; -+ } -+ -+ /* make sure weight will be updated, however we got here */ -+ bfqq->entity.prio_changed = 1; -+ -+ if (likely(!busy)) -+ return; -+ -+ if (old_wr_coeff == 1 && bfqq->wr_coeff > 1) { -+ bfqd->wr_busy_queues++; -+ BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues); -+ } else if (old_wr_coeff > 1 && bfqq->wr_coeff == 1) { -+ bfqd->wr_busy_queues--; -+ BUG_ON(bfqd->wr_busy_queues < 0); -+ } -+} -+ -+/* Return the references to @bfqq that are accounted for neither by allocated requests (allocated[READ] + allocated[WRITE]) nor by entity.on_st; judging by the name, these are presumably the references held by processes. The queue lock must be held (checked with lockdep). */ static int bfqq_process_refs(struct bfq_queue *bfqq) -+{ -+ int process_refs, io_refs; -+ -+ lockdep_assert_held(bfqq->bfqd->queue->queue_lock); -+ -+ io_refs = bfqq->allocated[READ] +
bfqq->allocated[WRITE]; -+ process_refs = bfqq->ref - io_refs - bfqq->entity.on_st; -+ BUG_ON(process_refs < 0); -+ return process_refs; -+} -+ -+/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */ -+static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct bfq_queue *item; -+ struct hlist_node *n; -+ -+ /* The _safe iterator is needed because entries are unlinked while walking the list. */ hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node) -+ hlist_del_init(&item->burst_list_node); -+ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); -+ bfqd->burst_size = 1; -+ /* Record the parent entity of the queue that starts the new burst. */ bfqd->burst_parent_entity = bfqq->entity.parent; -+} -+ -+/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */ -+static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ /* Increment burst size to take into account also bfqq */ -+ bfqd->burst_size++; -+ -+ bfq_log_bfqq(bfqd, bfqq, "add_to_burst %d", bfqd->burst_size); -+ -+ BUG_ON(bfqd->burst_size > bfqd->bfq_large_burst_thresh); -+ -+ if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) { -+ struct bfq_queue *pos, *bfqq_item; -+ struct hlist_node *n; -+ -+ /* -+ * Enough queues have been activated shortly after each -+ * other to consider this burst as large. -+ */ -+ bfqd->large_burst = true; -+ bfq_log_bfqq(bfqd, bfqq, "add_to_burst: large burst started"); -+ -+ /* -+ * We can now mark all queues in the burst list as -+ * belonging to a large burst. -+ */ -+ hlist_for_each_entry(bfqq_item, &bfqd->burst_list, -+ burst_list_node) { -+ bfq_mark_bfqq_in_large_burst(bfqq_item); -+ bfq_log_bfqq(bfqd, bfqq_item, "marked in large burst"); -+ } -+ /* bfqq itself has not been added to the burst list on this path, so it is marked separately. */ bfq_mark_bfqq_in_large_burst(bfqq); -+ bfq_log_bfqq(bfqd, bfqq, "marked in large burst"); -+ -+ /* -+ * From now on, and until the current burst finishes, any -+ * new queue being activated shortly after the last queue -+ * was inserted in the burst can be immediately marked as -+ * belonging to a large burst. So the burst list is not -+ * needed any more. Remove it.
-+ */ -+ hlist_for_each_entry_safe(pos, n, &bfqd->burst_list, -+ burst_list_node) -+ hlist_del_init(&pos->burst_list_node); -+ } else /* -+ * Burst not yet large: add bfqq to the burst list. Do -+ * not increment the ref counter for bfqq, because bfqq -+ * is removed from the burst list before freeing bfqq -+ * in put_queue. -+ */ -+ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); -+} -+ -+/* -+ * If many queues belonging to the same group happen to be created -+ * shortly after each other, then the processes associated with these -+ * queues have typically a common goal. In particular, bursts of queue -+ * creations are usually caused by services or applications that spawn -+ * many parallel threads/processes. Examples are systemd during boot, -+ * or git grep. To help these processes get their job done as soon as -+ * possible, it is usually better to not grant either weight-raising -+ * or device idling to their queues. -+ * -+ * In this comment we describe, firstly, the reasons why this fact -+ * holds, and, secondly, the next function, which implements the main -+ * steps needed to properly mark these queues so that they can then be -+ * treated in a different way. -+ * -+ * The above services or applications benefit mostly from a high -+ * throughput: the quicker the requests of the activated queues are -+ * cumulatively served, the sooner the target job of these queues gets -+ * completed. As a consequence, weight-raising any of these queues, -+ * which also implies idling the device for it, is almost always -+ * counterproductive. In most cases it just lowers throughput. -+ * -+ * On the other hand, a burst of queue creations may be caused also by -+ * the start of an application that does not consist of a lot of -+ * parallel I/O-bound threads. In fact, with a complex application, -+ * several short processes may need to be executed to start-up the -+ * application.
In this respect, to start an application as quickly as -+ * possible, the best thing to do is in any case to privilege the I/O -+ * related to the application with respect to all other -+ * I/O. Therefore, the best strategy to start as quickly as possible -+ * an application that causes a burst of queue creations is to -+ * weight-raise all the queues created during the burst. This is the -+ * exact opposite of the best strategy for the other type of bursts. -+ * -+ * In the end, to take the best action for each of the two cases, the -+ * two types of bursts need to be distinguished. Fortunately, this -+ * seems relatively easy, by looking at the sizes of the bursts. In -+ * particular, we found a threshold such that only bursts with a -+ * larger size than that threshold are apparently caused by -+ * services or commands such as systemd or git grep. For brevity, -+ * hereafter we call just 'large' these bursts. BFQ *does not* -+ * weight-raise queues whose creation occurs in a large burst. In -+ * addition, for each of these queues BFQ performs or does not perform -+ * idling depending on which choice boosts the throughput more. The -+ * exact choice depends on the device and request pattern at -+ * hand. -+ * -+ * Unfortunately, false positives may occur while an interactive task -+ * is starting (e.g., an application is being started). The -+ * consequence is that the queues associated with the task do not -+ * enjoy weight raising as expected. Fortunately these false positives -+ * are very rare. They typically occur if some service happens to -+ * start doing I/O exactly when the interactive task starts. -+ * -+ * Turning back to the next function, it implements all the steps -+ * needed to detect the occurrence of a large burst and to properly -+ * mark all the queues belonging to it (so that they can then be -+ * treated in a different way). 
This goal is achieved by maintaining a -+ * "burst list" that holds, temporarily, the queues that belong to the -+ * burst in progress. The list is then used to mark these queues as -+ * belonging to a large burst if the burst does become large. The main -+ * steps are the following. -+ * -+ * . when the very first queue is created, the queue is inserted into the -+ * list (as it could be the first queue in a possible burst) -+ * -+ * . if the current burst has not yet become large, and a queue Q that does -+ * not yet belong to the burst is activated shortly after the last time -+ * at which a new queue entered the burst list, then the function appends -+ * Q to the burst list -+ * -+ * . if, as a consequence of the previous step, the burst size reaches -+ * the large-burst threshold, then -+ * -+ * . all the queues in the burst list are marked as belonging to a -+ * large burst -+ * -+ * . the burst list is deleted; in fact, the burst list already served -+ * its purpose (keeping temporarily track of the queues in a burst, -+ * so as to be able to mark them as belonging to a large burst in the -+ * previous sub-step), and now is not needed any more -+ * -+ * . the device enters a large-burst mode -+ * -+ * . if a queue Q that does not belong to the burst is created while -+ * the device is in large-burst mode and shortly after the last time -+ * at which a queue either entered the burst list or was marked as -+ * belonging to the current large burst, then Q is immediately marked -+ * as belonging to a large burst. -+ * -+ * . if a queue Q that does not belong to the burst is created a while -+ * later, i.e., not shortly after, than the last time at which a queue -+ * either entered the burst list or was marked as belonging to the -+ * current large burst, then the current burst is deemed as finished and: -+ * -+ * . the large-burst mode is reset if set -+ * -+ * . the burst list is emptied -+ * -+ * . 
Q is inserted in the burst list, as Q may be the first queue -+ * in a possible new burst (then the burst list contains just Q -+ * after this step). -+ */ -+static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ /* -+ * If bfqq is already in the burst list or is part of a large -+ * burst, or finally has just been split, then there is -+ * nothing else to do. -+ */ -+ if (!hlist_unhashed(&bfqq->burst_list_node) || -+ bfq_bfqq_in_large_burst(bfqq) || -+ time_is_after_eq_jiffies(bfqq->split_time + -+ msecs_to_jiffies(10))) -+ return; -+ -+ /* -+ * If bfqq's creation happens late enough, or bfqq belongs to -+ * a different group than the burst group, then the current -+ * burst is finished, and related data structures must be -+ * reset. -+ * -+ * In this respect, consider the special case where bfqq is -+ * the very first queue created after BFQ is selected for this -+ * device. In this case, last_ins_in_burst and -+ * burst_parent_entity are not yet significant when we get -+ * here. But it is easy to verify that, whether or not the -+ * following condition is true, bfqq will end up being -+ * inserted into the burst list. In particular the list will -+ * happen to contain only bfqq. And this is exactly what has -+ * to happen, as bfqq may be the first queue of the first -+ * burst. -+ */ -+ if (time_is_before_jiffies(bfqd->last_ins_in_burst + -+ bfqd->bfq_burst_interval) || -+ bfqq->entity.parent != bfqd->burst_parent_entity) { -+ bfqd->large_burst = false; -+ bfq_reset_burst_list(bfqd, bfqq); -+ bfq_log_bfqq(bfqd, bfqq, -+ "handle_burst: late activation or different group"); -+ goto end; -+ } -+ -+ /* -+ * If we get here, then bfqq is being activated shortly after the -+ * last queue. So, if the current burst is also large, we can mark -+ * bfqq as belonging to this large burst immediately. 
-+ */ -+ if (bfqd->large_burst) { -+ bfq_log_bfqq(bfqd, bfqq, "handle_burst: marked in burst"); -+ bfq_mark_bfqq_in_large_burst(bfqq); -+ goto end; -+ } -+ -+ /* -+ * If we get here, then a large-burst state has not yet been -+ * reached, but bfqq is being activated shortly after the last -+ * queue. Then we add bfqq to the burst. -+ */ -+ bfq_add_to_burst(bfqd, bfqq); -+end: -+ /* -+ * At this point, bfqq either has been added to the current -+ * burst or has caused the current burst to terminate and a -+ * possible new burst to start. In particular, in the second -+ * case, bfqq has become the first queue in the possible new -+ * burst. In both cases last_ins_in_burst needs to be moved -+ * forward. -+ */ -+ bfqd->last_ins_in_burst = jiffies; -+ -+} -+ -+static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ return entity->budget - entity->service; -+} -+ -+/* -+ * If enough samples have been computed, return the current max budget -+ * stored in bfqd, which is dynamically updated according to the -+ * estimated disk peak rate; otherwise return the default max budget -+ */ -+static int bfq_max_budget(struct bfq_data *bfqd) -+{ -+ if (bfqd->budgets_assigned < bfq_stats_min_budgets) -+ return bfq_default_max_budget; -+ else -+ return bfqd->bfq_max_budget; -+} -+ -+/* -+ * Return min budget, which is a fraction of the current or default -+ * max budget (trying with 1/32) -+ */ -+static int bfq_min_budget(struct bfq_data *bfqd) -+{ -+ if (bfqd->budgets_assigned < bfq_stats_min_budgets) -+ return bfq_default_max_budget / 32; -+ else -+ return bfqd->bfq_max_budget / 32; -+} -+ -+static void bfq_bfqq_expire(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ bool compensate, -+ enum bfqq_expiration reason); -+ -+/* -+ * The next function, invoked after the input queue bfqq switches from -+ * idle to busy, updates the budget of bfqq. 
The function also tells -+ * whether the in-service queue should be expired, by returning -+ * true. The purpose of expiring the in-service queue is to give bfqq -+ * the chance to possibly preempt the in-service queue, and the reason -+ * for preempting the in-service queue is to achieve one of the two -+ * goals below. -+ * -+ * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has -+ * expired because it has remained idle. In particular, bfqq may have -+ * expired for one of the following two reasons: -+ * -+ * - BFQ_BFQQ_NO_MORE_REQUEST bfqq did not enjoy any device idling and -+ * did not make it to issue a new request before its last request -+ * was served; -+ * -+ * - BFQ_BFQQ_TOO_IDLE bfqq did enjoy device idling, but did not issue -+ * a new request before the expiration of the idling-time. -+ * -+ * Even if bfqq has expired for one of the above reasons, the process -+ * associated with the queue may be however issuing requests greedily, -+ * and thus be sensitive to the bandwidth it receives (bfqq may have -+ * remained idle for other reasons: CPU high load, bfqq not enjoying -+ * idling, I/O throttling somewhere in the path from the process to -+ * the I/O scheduler, ...). But if, after every expiration for one of -+ * the above two reasons, bfqq has to wait for the service of at least -+ * one full budget of another queue before being served again, then -+ * bfqq is likely to get a much lower bandwidth or resource time than -+ * its reserved ones. To address this issue, two countermeasures need -+ * to be taken. -+ * -+ * First, the budget and the timestamps of bfqq need to be updated in -+ * a special way on bfqq reactivation: they need to be updated as if -+ * bfqq did not remain idle and did not expire. 
In fact, if they are -+ * computed as if bfqq expired and remained idle until reactivation, -+ * then the process associated with bfqq is treated as if, instead of -+ * being greedy, it stopped issuing requests when bfqq remained idle, -+ * and restarts issuing requests only on this reactivation. In other -+ * words, the scheduler does not help the process recover the "service -+ * hole" between bfqq expiration and reactivation. As a consequence, -+ * the process receives a lower bandwidth than its reserved one. In -+ * contrast, to recover this hole, the budget must be updated as if -+ * bfqq was not expired at all before this reactivation, i.e., it must -+ * be set to the value of the remaining budget when bfqq was -+ * expired. Along the same line, timestamps need to be assigned the -+ * value they had the last time bfqq was selected for service, i.e., -+ * before last expiration. Thus timestamps need to be back-shifted -+ * with respect to their normal computation (see [1] for more details -+ * on this tricky aspect). -+ * -+ * Secondly, to allow the process to recover the hole, the in-service -+ * queue must be expired too, to give bfqq the chance to preempt it -+ * immediately. In fact, if bfqq has to wait for a full budget of the -+ * in-service queue to be completed, then it may become impossible to -+ * let the process recover the hole, even if the back-shifted -+ * timestamps of bfqq are lower than those of the in-service queue. If -+ * this happens for most or all of the holes, then the process may not -+ * receive its reserved bandwidth. In this respect, it is worth noting -+ * that, being the service of outstanding requests unpreemptible, a -+ * little fraction of the holes may however be unrecoverable, thereby -+ * causing a little loss of bandwidth. -+ * -+ * The last important point is detecting whether bfqq does need this -+ * bandwidth recovery. 
In this respect, the next function deems the -+ * process associated with bfqq greedy, and thus allows it to recover -+ * the hole, if: 1) the process is waiting for the arrival of a new -+ * request (which implies that bfqq expired for one of the above two -+ * reasons), and 2) such a request has arrived soon. The first -+ * condition is controlled through the flag non_blocking_wait_rq, -+ * while the second through the flag arrived_in_time. If both -+ * conditions hold, then the function computes the budget in the -+ * above-described special way, and signals that the in-service queue -+ * should be expired. Timestamp back-shifting is done later in -+ * __bfq_activate_entity. -+ * -+ * 2. Reduce latency. Even if timestamps are not backshifted to let -+ * the process associated with bfqq recover a service hole, bfqq may -+ * however happen to have, after being (re)activated, a lower finish -+ * timestamp than the in-service queue. That is, the next budget of -+ * bfqq may have to be completed before the one of the in-service -+ * queue. If this is the case, then preempting the in-service queue -+ * allows this goal to be achieved, apart from the unpreemptible, -+ * outstanding requests mentioned above. -+ * -+ * Unfortunately, regardless of which of the above two goals one wants -+ * to achieve, service trees need first to be updated to know whether -+ * the in-service queue must be preempted. To have service trees -+ * correctly updated, the in-service queue must be expired and -+ * rescheduled, and bfqq must be scheduled too. This is one of the -+ * most costly operations (in future versions, the scheduling -+ * mechanism may be re-designed in such a way to make it possible to -+ * know whether preemption is needed without needing to update service -+ * trees). In addition, queue preemptions almost always cause random -+ * I/O, and thus loss of throughput. 
Because of these facts, the next -+ * function adopts the following simple scheme to avoid both costly -+ * operations and too frequent preemptions: it requests the expiration -+ * of the in-service queue (unconditionally) only for queues that need -+ * to recover a hole, or that either are weight-raised or deserve to -+ * be weight-raised. -+ */ -+static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ bool arrived_in_time, -+ bool wr_or_deserves_wr) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) { -+ /* -+ * We do not clear the flag non_blocking_wait_rq here, as -+ * the latter is used in bfq_activate_bfqq to signal -+ * that timestamps need to be back-shifted (and is -+ * cleared right after). -+ */ -+ -+ /* -+ * In next assignment we rely on that either -+ * entity->service or entity->budget are not updated -+ * on expiration if bfqq is empty (see -+ * __bfq_bfqq_recalc_budget). Thus both quantities -+ * remain unchanged after such an expiration, and the -+ * following statement therefore assigns to -+ * entity->budget the remaining budget on such an -+ * expiration. For clarity, entity->service is not -+ * updated on expiration in any case, and, in normal -+ * operation, is reset only when bfqq is selected for -+ * service (see bfq_get_next_queue). 
-+ */ -+ BUG_ON(bfqq->max_budget < 0); -+ entity->budget = min_t(unsigned long, -+ bfq_bfqq_budget_left(bfqq), -+ bfqq->max_budget); -+ -+ BUG_ON(entity->budget < 0); -+ return true; -+ } -+ -+ BUG_ON(bfqq->max_budget < 0); -+ entity->budget = max_t(unsigned long, bfqq->max_budget, -+ bfq_serv_to_charge(bfqq->next_rq, bfqq)); -+ BUG_ON(entity->budget < 0); -+ -+ bfq_clear_bfqq_non_blocking_wait_rq(bfqq); -+ return wr_or_deserves_wr; -+} -+ -+static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ unsigned int old_wr_coeff, -+ bool wr_or_deserves_wr, -+ bool interactive, -+ bool in_burst, -+ bool soft_rt) -+{ -+ if (old_wr_coeff == 1 && wr_or_deserves_wr) { -+ /* start a weight-raising period */ -+ if (interactive) { -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ } else { -+ bfqq->wr_start_at_switch_to_srt = jiffies; -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff * -+ BFQ_SOFTRT_WEIGHT_FACTOR; -+ bfqq->wr_cur_max_time = -+ bfqd->bfq_wr_rt_max_time; -+ } -+ /* -+ * If needed, further reduce budget to make sure it is -+ * close to bfqq's backlog, so as to reduce the -+ * scheduling-error component due to a too large -+ * budget. Do not care about throughput consequences, -+ * but only about latency. Finally, do not assign a -+ * too small budget either, to avoid increasing -+ * latency by causing too frequent expirations. 
-+ */ -+ bfqq->entity.budget = min_t(unsigned long, -+ bfqq->entity.budget, -+ 2 * bfq_min_budget(bfqd)); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "wrais starting at %lu, rais_max_time %u", -+ jiffies, -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } else if (old_wr_coeff > 1) { -+ if (interactive) { /* update wr coeff and duration */ -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ } else if (in_burst) { -+ bfqq->wr_coeff = 1; -+ bfq_log_bfqq(bfqd, bfqq, -+ "wrais ending at %lu, rais_max_time %u", -+ jiffies, -+ jiffies_to_msecs(bfqq-> -+ wr_cur_max_time)); -+ } else if (soft_rt) { -+ /* -+ * The application is now or still meeting the -+ * requirements for being deemed soft rt. We -+ * can then correctly and safely (re)charge -+ * the weight-raising duration for the -+ * application with the weight-raising -+ * duration for soft rt applications. -+ * -+ * In particular, doing this recharge now, i.e., -+ * before the weight-raising period for the -+ * application finishes, reduces the probability -+ * of the following negative scenario: -+ * 1) the weight of a soft rt application is -+ * raised at startup (as for any newly -+ * created application), -+ * 2) since the application is not interactive, -+ * at a certain time weight-raising is -+ * stopped for the application, -+ * 3) at that time the application happens to -+ * still have pending requests, and hence -+ * is destined to not have a chance to be -+ * deemed soft rt before these requests are -+ * completed (see the comments to the -+ * function bfq_bfqq_softrt_next_start() -+ * for details on soft rt detection), -+ * 4) these pending requests experience a high -+ * latency because the application is not -+ * weight-raised while they are pending. 
-+ */ -+ if (bfqq->wr_cur_max_time != -+ bfqd->bfq_wr_rt_max_time) { -+ bfqq->wr_start_at_switch_to_srt = -+ bfqq->last_wr_start_finish; -+ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); -+ -+ bfqq->wr_cur_max_time = -+ bfqd->bfq_wr_rt_max_time; -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff * -+ BFQ_SOFTRT_WEIGHT_FACTOR; -+ bfq_log_bfqq(bfqd, bfqq, -+ "switching to soft_rt wr"); -+ } else -+ bfq_log_bfqq(bfqd, bfqq, -+ "moving forward soft_rt wr duration"); -+ bfqq->last_wr_start_finish = jiffies; -+ } -+ } -+} -+ -+static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ return bfqq->dispatched == 0 && -+ time_is_before_jiffies( -+ bfqq->budget_timeout + -+ bfqd->bfq_wr_min_idle_time); -+} -+ -+static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ int old_wr_coeff, -+ struct request *rq, -+ bool *interactive) -+{ -+ bool soft_rt, in_burst, wr_or_deserves_wr, -+ bfqq_wants_to_preempt, -+ idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq), -+ /* -+ * See the comments on -+ * bfq_bfqq_update_budg_for_activation for -+ * details on the usage of the next variable. 
-+ */ -+ arrived_in_time = ktime_get_ns() <= -+ RQ_BIC(rq)->ttime.last_end_request + -+ bfqd->bfq_slice_idle * 3; -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "bfq_add_request non-busy: " -+ "jiffies %lu, in_time %d, idle_long %d busyw %d " -+ "wr_coeff %u", -+ jiffies, arrived_in_time, -+ idle_for_long_time, -+ bfq_bfqq_non_blocking_wait_rq(bfqq), -+ old_wr_coeff); -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ -+ BUG_ON(bfqq == bfqd->in_service_queue); -+ bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, rq->cmd_flags); -+ -+ /* -+ * bfqq deserves to be weight-raised if: -+ * - it is sync, -+ * - it does not belong to a large burst, -+ * - it has been idle for enough time or is soft real-time, -+ * - is linked to a bfq_io_cq (it is not shared in any sense) -+ */ -+ in_burst = bfq_bfqq_in_large_burst(bfqq); -+ soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && -+ !in_burst && -+ time_is_before_jiffies(bfqq->soft_rt_next_start); -+ *interactive = -+ !in_burst && -+ idle_for_long_time; -+ wr_or_deserves_wr = bfqd->low_latency && -+ (bfqq->wr_coeff > 1 || -+ (bfq_bfqq_sync(bfqq) && -+ bfqq->bic && (*interactive || soft_rt))); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "bfq_add_request: " -+ "in_burst %d, " -+ "soft_rt %d (next %lu), inter %d, bic %p", -+ bfq_bfqq_in_large_burst(bfqq), soft_rt, -+ bfqq->soft_rt_next_start, -+ *interactive, -+ bfqq->bic); -+ -+ /* -+ * Using the last flag, update budget and check whether bfqq -+ * may want to preempt the in-service queue. -+ */ -+ bfqq_wants_to_preempt = -+ bfq_bfqq_update_budg_for_activation(bfqd, bfqq, -+ arrived_in_time, -+ wr_or_deserves_wr); -+ -+ /* -+ * If bfqq happened to be activated in a burst, but has been -+ * idle for much more than an interactive queue, then we -+ * assume that, in the overall I/O initiated in the burst, the -+ * I/O associated with bfqq is finished. So bfqq does not need -+ * to be treated as a queue belonging to a burst -+ * anymore. 
Accordingly, we reset bfqq's in_large_burst flag -+ * if set, and remove bfqq from the burst list if it's -+ * there. We do not decrement burst_size, because the fact -+ * that bfqq does not need to belong to the burst list any -+ * more does not invalidate the fact that bfqq was created in -+ * a burst. -+ */ -+ if (likely(!bfq_bfqq_just_created(bfqq)) && -+ idle_for_long_time && -+ time_is_before_jiffies( -+ bfqq->budget_timeout + -+ msecs_to_jiffies(10000))) { -+ hlist_del_init(&bfqq->burst_list_node); -+ bfq_clear_bfqq_in_large_burst(bfqq); -+ } -+ -+ bfq_clear_bfqq_just_created(bfqq); -+ -+ if (!bfq_bfqq_IO_bound(bfqq)) { -+ if (arrived_in_time) { -+ bfqq->requests_within_timer++; -+ if (bfqq->requests_within_timer >= -+ bfqd->bfq_requests_within_timer) -+ bfq_mark_bfqq_IO_bound(bfqq); -+ } else -+ bfqq->requests_within_timer = 0; -+ bfq_log_bfqq(bfqd, bfqq, "requests in time %d", -+ bfqq->requests_within_timer); -+ } -+ -+ if (bfqd->low_latency) { -+ if (unlikely(time_is_after_jiffies(bfqq->split_time))) -+ /* wraparound */ -+ bfqq->split_time = -+ jiffies - bfqd->bfq_wr_min_idle_time - 1; -+ -+ if (time_is_before_jiffies(bfqq->split_time + -+ bfqd->bfq_wr_min_idle_time)) { -+ bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq, -+ old_wr_coeff, -+ wr_or_deserves_wr, -+ *interactive, -+ in_burst, -+ soft_rt); -+ -+ if (old_wr_coeff != bfqq->wr_coeff) -+ bfqq->entity.prio_changed = 1; -+ } -+ } -+ -+ bfqq->last_idle_bklogged = jiffies; -+ bfqq->service_from_backlogged = 0; -+ bfq_clear_bfqq_softrt_update(bfqq); -+ -+ bfq_add_bfqq_busy(bfqd, bfqq); -+ -+ /* -+ * Expire in-service queue only if preemption may be needed -+ * for guarantees. In this respect, the function -+ * next_queue_may_preempt just checks a simple, necessary -+ * condition, and not a sufficient condition based on -+ * timestamps. 
In fact, for the latter condition to be -+ * evaluated, timestamps would need first to be updated, and -+ * this operation is quite costly (see the comments on the -+ * function bfq_bfqq_update_budg_for_activation). -+ */ -+ if (bfqd->in_service_queue && bfqq_wants_to_preempt && -+ bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff && -+ next_queue_may_preempt(bfqd)) { -+ struct bfq_queue *in_serv = -+ bfqd->in_service_queue; -+ BUG_ON(in_serv == bfqq); -+ -+ bfq_bfqq_expire(bfqd, bfqd->in_service_queue, -+ false, BFQ_BFQQ_PREEMPTED); -+ } -+} -+ -+static void bfq_add_request(struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ struct bfq_data *bfqd = bfqq->bfqd; -+ struct request *next_rq, *prev; -+ unsigned int old_wr_coeff = bfqq->wr_coeff; -+ bool interactive = false; -+ -+ bfq_log_bfqq(bfqd, bfqq, "add_request: size %u %s", -+ blk_rq_sectors(rq), rq_is_sync(rq) ? "S" : "A"); -+ -+ if (bfqq->wr_coeff > 1) /* queue is being weight-raised */ -+ bfq_log_bfqq(bfqd, bfqq, -+ "raising period dur %u/%u msec, old coeff %u, w %d(%d)", -+ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), -+ jiffies_to_msecs(bfqq->wr_cur_max_time), -+ bfqq->wr_coeff, -+ bfqq->entity.weight, bfqq->entity.orig_weight); -+ -+ bfqq->queued[rq_is_sync(rq)]++; -+ bfqd->queued++; -+ -+ elv_rb_add(&bfqq->sort_list, rq); -+ -+ /* -+ * Check if this request is a better next-to-serve candidate. -+ */ -+ prev = bfqq->next_rq; -+ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); -+ BUG_ON(!next_rq); -+ bfqq->next_rq = next_rq; -+ -+ /* -+ * Adjust priority tree position, if next_rq changes. -+ */ -+ if (prev != bfqq->next_rq) -+ bfq_pos_tree_add_move(bfqd, bfqq); -+ -+ if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... 
*/ -+ bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff, -+ rq, &interactive); -+ else { -+ if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) && -+ time_is_before_jiffies( -+ bfqq->last_wr_start_finish + -+ bfqd->bfq_wr_min_inter_arr_async)) { -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ -+ bfqd->wr_busy_queues++; -+ BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues); -+ bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqd, bfqq, -+ "non-idle wrais starting, " -+ "wr_max_time %u wr_busy %d", -+ jiffies_to_msecs(bfqq->wr_cur_max_time), -+ bfqd->wr_busy_queues); -+ } -+ if (prev != bfqq->next_rq) -+ bfq_updated_next_req(bfqd, bfqq); -+ } -+ -+ /* -+ * Assign jiffies to last_wr_start_finish in the following -+ * cases: -+ * -+ * . if bfqq is not going to be weight-raised, because, for -+ * non weight-raised queues, last_wr_start_finish stores the -+ * arrival time of the last request; as of now, this piece -+ * of information is used only for deciding whether to -+ * weight-raise async queues -+ * -+ * . if bfqq is not weight-raised, because, if bfqq is now -+ * switching to weight-raised, then last_wr_start_finish -+ * stores the time when weight-raising starts -+ * -+ * . if bfqq is interactive, because, regardless of whether -+ * bfqq is currently weight-raised, the weight-raising -+ * period must start or restart (this case is considered -+ * separately because it is not detected by the above -+ * conditions, if bfqq is already weight-raised) -+ * -+ * last_wr_start_finish has to be updated also if bfqq is soft -+ * real-time, because the weight-raising period is constantly -+ * restarted on idle-to-busy transitions for these queues, but -+ * this is already done in bfq_bfqq_handle_idle_busy_switch if -+ * needed. 
-+ */ -+ if (bfqd->low_latency && -+ (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive)) -+ bfqq->last_wr_start_finish = jiffies; -+} -+ -+static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, -+ struct bio *bio) -+{ -+ struct task_struct *tsk = current; -+ struct bfq_io_cq *bic; -+ struct bfq_queue *bfqq; -+ -+ bic = bfq_bic_lookup(bfqd, tsk->io_context); -+ if (!bic) -+ return NULL; -+ -+ bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); -+ if (bfqq) -+ return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); -+ -+ return NULL; -+} -+ -+static sector_t get_sdist(sector_t last_pos, struct request *rq) -+{ -+ sector_t sdist = 0; -+ -+ if (last_pos) { -+ if (last_pos < blk_rq_pos(rq)) -+ sdist = blk_rq_pos(rq) - last_pos; -+ else -+ sdist = last_pos - blk_rq_pos(rq); -+ } -+ -+ return sdist; -+} -+ -+static void bfq_activate_request(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ bfqd->rq_in_driver++; -+} -+ -+static void bfq_deactivate_request(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ -+ BUG_ON(bfqd->rq_in_driver == 0); -+ bfqd->rq_in_driver--; -+} -+ -+static void bfq_remove_request(struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ struct bfq_data *bfqd = bfqq->bfqd; -+ const int sync = rq_is_sync(rq); -+ -+ BUG_ON(bfqq->entity.service > bfqq->entity.budget && -+ bfqq == bfqd->in_service_queue); -+ -+ if (bfqq->next_rq == rq) { -+ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); -+ bfq_updated_next_req(bfqd, bfqq); -+ } -+ -+ if (rq->queuelist.prev != &rq->queuelist) -+ list_del_init(&rq->queuelist); -+ BUG_ON(bfqq->queued[sync] == 0); -+ bfqq->queued[sync]--; -+ bfqd->queued--; -+ elv_rb_del(&bfqq->sort_list, rq); -+ -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { -+ bfqq->next_rq = NULL; -+ -+ BUG_ON(bfqq->entity.budget < 0); -+ -+ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) { -+ 
BUG_ON(bfqq->ref < 2); /* referred by rq and on tree */ -+ bfq_del_bfqq_busy(bfqd, bfqq, false); -+ /* -+ * bfqq emptied. In normal operation, when -+ * bfqq is empty, bfqq->entity.service and -+ * bfqq->entity.budget must contain, -+ * respectively, the service received and the -+ * budget used last time bfqq emptied. These -+ * facts do not hold in this case, as at least -+ * this last removal occurred while bfqq is -+ * not in service. To avoid inconsistencies, -+ * reset both bfqq->entity.service and -+ * bfqq->entity.budget, if bfqq has still a -+ * process that may issue I/O requests to it. -+ */ -+ bfqq->entity.budget = bfqq->entity.service = 0; -+ } -+ -+ /* -+ * Remove queue from request-position tree as it is empty. -+ */ -+ if (bfqq->pos_root) { -+ rb_erase(&bfqq->pos_node, bfqq->pos_root); -+ bfqq->pos_root = NULL; -+ } -+ } -+ -+ if (rq->cmd_flags & REQ_META) { -+ BUG_ON(bfqq->meta_pending == 0); -+ bfqq->meta_pending--; -+ } -+ bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); -+} -+ -+static enum elv_merge bfq_merge(struct request_queue *q, struct request **req, -+ struct bio *bio) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct request *__rq; -+ -+ __rq = bfq_find_rq_fmerge(bfqd, bio); -+ if (__rq && elv_bio_merge_ok(__rq, bio)) { -+ *req = __rq; -+ return ELEVATOR_FRONT_MERGE; -+ } -+ -+ return ELEVATOR_NO_MERGE; -+} -+ -+static void bfq_merged_request(struct request_queue *q, struct request *req, -+ enum elv_merge type) -+{ -+ if (type == ELEVATOR_FRONT_MERGE && -+ rb_prev(&req->rb_node) && -+ blk_rq_pos(req) < -+ blk_rq_pos(container_of(rb_prev(&req->rb_node), -+ struct request, rb_node))) { -+ struct bfq_queue *bfqq = RQ_BFQQ(req); -+ struct bfq_data *bfqd = bfqq->bfqd; -+ struct request *prev, *next_rq; -+ -+ /* Reposition request in its sort_list */ -+ elv_rb_del(&bfqq->sort_list, req); -+ elv_rb_add(&bfqq->sort_list, req); -+ /* Choose next request to be served for bfqq */ -+ prev = bfqq->next_rq; -+ next_rq = 
bfq_choose_req(bfqd, bfqq->next_rq, req, -+ bfqd->last_position); -+ BUG_ON(!next_rq); -+ bfqq->next_rq = next_rq; -+ /* -+ * If next_rq changes, update both the queue's budget to -+ * fit the new request and the queue's position in its -+ * rq_pos_tree. -+ */ -+ if (prev != bfqq->next_rq) { -+ bfq_updated_next_req(bfqd, bfqq); -+ bfq_pos_tree_add_move(bfqd, bfqq); -+ } -+ } -+} -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+static void bfq_bio_merged(struct request_queue *q, struct request *req, -+ struct bio *bio) -+{ -+ bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio->bi_opf); -+} -+#endif -+ -+static void bfq_merged_requests(struct request_queue *q, struct request *rq, -+ struct request *next) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next); -+ -+ /* -+ * If next and rq belong to the same bfq_queue and next is older -+ * than rq, then reposition rq in the fifo (by substituting next -+ * with rq). Otherwise, if next and rq belong to different -+ * bfq_queues, never reposition rq: in fact, we would have to -+ * reposition it with respect to next's position in its own fifo, -+ * which would most certainly be too expensive with respect to -+ * the benefits. 
-+ */ -+ if (bfqq == next_bfqq && -+ !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && -+ next->fifo_time < rq->fifo_time) { -+ list_del_init(&rq->queuelist); -+ list_replace_init(&next->queuelist, &rq->queuelist); -+ rq->fifo_time = next->fifo_time; -+ } -+ -+ if (bfqq->next_rq == next) -+ bfqq->next_rq = rq; -+ -+ bfq_remove_request(next); -+ bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); -+} -+ -+/* Must be called with bfqq != NULL */ -+static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) -+{ -+ BUG_ON(!bfqq); -+ -+ if (bfq_bfqq_busy(bfqq)) { -+ bfqq->bfqd->wr_busy_queues--; -+ BUG_ON(bfqq->bfqd->wr_busy_queues < 0); -+ } -+ bfqq->wr_coeff = 1; -+ bfqq->wr_cur_max_time = 0; -+ bfqq->last_wr_start_finish = jiffies; -+ /* -+ * Trigger a weight change on the next invocation of -+ * __bfq_entity_update_weight_prio. -+ */ -+ bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "end_wr: wrais ending at %lu, rais_max_time %u", -+ bfqq->last_wr_start_finish, -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "end_wr: wr_busy %d", -+ bfqq->bfqd->wr_busy_queues); -+} -+ -+static void bfq_end_wr_async_queues(struct bfq_data *bfqd, -+ struct bfq_group *bfqg) -+{ -+ int i, j; -+ -+ for (i = 0; i < 2; i++) -+ for (j = 0; j < IOPRIO_BE_NR; j++) -+ if (bfqg->async_bfqq[i][j]) -+ bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); -+ if (bfqg->async_idle_bfqq) -+ bfq_bfqq_end_wr(bfqg->async_idle_bfqq); -+} -+ -+static void bfq_end_wr(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq; -+ -+ spin_lock_irq(bfqd->queue->queue_lock); -+ -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) -+ bfq_bfqq_end_wr(bfqq); -+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) -+ bfq_bfqq_end_wr(bfqq); -+ bfq_end_wr_async(bfqd); -+ -+ spin_unlock_irq(bfqd->queue->queue_lock); -+} -+ -+static sector_t bfq_io_struct_pos(void *io_struct, bool request) -+{ -+ if (request) -+ return blk_rq_pos(io_struct); -+ else -+ return 
((struct bio *)io_struct)->bi_iter.bi_sector; -+} -+ -+static int bfq_rq_close_to_sector(void *io_struct, bool request, -+ sector_t sector) -+{ -+ return abs(bfq_io_struct_pos(io_struct, request) - sector) <= -+ BFQQ_CLOSE_THR; -+} -+ -+static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ sector_t sector) -+{ -+ struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; -+ struct rb_node *parent, *node; -+ struct bfq_queue *__bfqq; -+ -+ if (RB_EMPTY_ROOT(root)) -+ return NULL; -+ -+ /* -+ * First, if we find a request starting at the end of the last -+ * request, choose it. -+ */ -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL); -+ if (__bfqq) -+ return __bfqq; -+ -+ /* -+ * If the exact sector wasn't found, the parent of the NULL leaf -+ * will contain the closest sector (rq_pos_tree sorted by -+ * next_request position). -+ */ -+ __bfqq = rb_entry(parent, struct bfq_queue, pos_node); -+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) -+ return __bfqq; -+ -+ if (blk_rq_pos(__bfqq->next_rq) < sector) -+ node = rb_next(&__bfqq->pos_node); -+ else -+ node = rb_prev(&__bfqq->pos_node); -+ if (!node) -+ return NULL; -+ -+ __bfqq = rb_entry(node, struct bfq_queue, pos_node); -+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) -+ return __bfqq; -+ -+ return NULL; -+} -+ -+static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd, -+ struct bfq_queue *cur_bfqq, -+ sector_t sector) -+{ -+ struct bfq_queue *bfqq; -+ -+ /* -+ * We shall notice if some of the queues are cooperating, -+ * e.g., working closely on the same area of the device. In -+ * that case, we can group them together and: 1) don't waste -+ * time idling, and 2) serve the union of their requests in -+ * the best possible order for throughput. 
-+ */ -+ bfqq = bfqq_find_close(bfqd, cur_bfqq, sector); -+ if (!bfqq || bfqq == cur_bfqq) -+ return NULL; -+ -+ return bfqq; -+} -+ -+static struct bfq_queue * -+bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) -+{ -+ int process_refs, new_process_refs; -+ struct bfq_queue *__bfqq; -+ -+ /* -+ * If there are no process references on the new_bfqq, then it is -+ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain -+ * may have dropped their last reference (not just their last process -+ * reference). -+ */ -+ if (!bfqq_process_refs(new_bfqq)) -+ return NULL; -+ -+ /* Avoid a circular list and skip interim queue merges. */ -+ while ((__bfqq = new_bfqq->new_bfqq)) { -+ if (__bfqq == bfqq) -+ return NULL; -+ new_bfqq = __bfqq; -+ } -+ -+ process_refs = bfqq_process_refs(bfqq); -+ new_process_refs = bfqq_process_refs(new_bfqq); -+ /* -+ * If the process for the bfqq has gone away, there is no -+ * sense in merging the queues. -+ */ -+ if (process_refs == 0 || new_process_refs == 0) -+ return NULL; -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", -+ new_bfqq->pid); -+ -+ /* -+ * Merging is just a redirection: the requests of the process -+ * owning one of the two queues are redirected to the other queue. -+ * The latter queue, in its turn, is set as shared if this is the -+ * first time that the requests of some process are redirected to -+ * it. -+ * -+ * We redirect bfqq to new_bfqq and not the opposite, because we -+ * are in the context of the process owning bfqq, hence we have -+ * the io_cq of this process. So we can immediately configure this -+ * io_cq to redirect the requests of the process to new_bfqq. -+ * -+ * NOTE, even if new_bfqq coincides with the in-service queue, the -+ * io_cq of new_bfqq is not available, because, if the in-service -+ * queue is shared, bfqd->in_service_bic may not point to the -+ * io_cq of the in-service queue. 
-+ * Redirecting the requests of the process owning bfqq to the -+ * currently in-service queue is in any case the best option, as -+ * we feed the in-service queue with new requests close to the -+ * last request served and, by doing so, hopefully increase the -+ * throughput. -+ */ -+ bfqq->new_bfqq = new_bfqq; -+ new_bfqq->ref += process_refs; -+ return new_bfqq; -+} -+ -+static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, -+ struct bfq_queue *new_bfqq) -+{ -+ if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) || -+ (bfqq->ioprio_class != new_bfqq->ioprio_class)) -+ return false; -+ -+ /* -+ * If either of the queues has already been detected as seeky, -+ * then merging it with the other queue is unlikely to lead to -+ * sequential I/O. -+ */ -+ if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq)) -+ return false; -+ -+ /* -+ * Interleaved I/O is known to be done by (some) applications -+ * only for reads, so it does not make sense to merge async -+ * queues. -+ */ -+ if (!bfq_bfqq_sync(bfqq) || !bfq_bfqq_sync(new_bfqq)) -+ return false; -+ -+ return true; -+} -+ -+/* -+ * If this function returns true, then bfqq cannot be merged. The idea -+ * is that true cooperation happens very early after processes start -+ * to do I/O. Usually, late cooperations are just accidental false -+ * positives. In case bfqq is weight-raised, such false positives -+ * would evidently degrade latency guarantees for bfqq. -+ */ -+static bool wr_from_too_long(struct bfq_queue *bfqq) -+{ -+ return bfqq->wr_coeff > 1 && -+ time_is_before_jiffies(bfqq->last_wr_start_finish + -+ msecs_to_jiffies(100)); -+} -+ -+/* -+ * Attempt to schedule a merge of bfqq with the currently in-service -+ * queue or with a close queue among the scheduled queues. Return -+ * NULL if no merge was scheduled, a pointer to the shared bfq_queue -+ * structure otherwise. 
-+ * -+ * The OOM queue is not allowed to participate to cooperation: in fact, since -+ * the requests temporarily redirected to the OOM queue could be redirected -+ * again to dedicated queues at any time, the state needed to correctly -+ * handle merging with the OOM queue would be quite complex and expensive -+ * to maintain. Besides, in such a critical condition as an out of memory, -+ * the benefits of queue merging may be little relevant, or even negligible. -+ * -+ * Weight-raised queues can be merged only if their weight-raising -+ * period has just started. In fact cooperating processes are usually -+ * started together. Thus, with this filter we avoid false positives -+ * that would jeopardize low-latency guarantees. -+ * -+ * WARNING: queue merging may impair fairness among non-weight raised -+ * queues, for at least two reasons: 1) the original weight of a -+ * merged queue may change during the merged state, 2) even being the -+ * weight the same, a merged queue may be bloated with many more -+ * requests than the ones produced by its originally-associated -+ * process. -+ */ -+static struct bfq_queue * -+bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ void *io_struct, bool request) -+{ -+ struct bfq_queue *in_service_bfqq, *new_bfqq; -+ -+ if (bfqq->new_bfqq) -+ return bfqq->new_bfqq; -+ -+ if (io_struct && wr_from_too_long(bfqq) && -+ likely(bfqq != &bfqd->oom_bfqq)) -+ bfq_log_bfqq(bfqd, bfqq, -+ "would have looked for coop, but bfq%d wr", -+ bfqq->pid); -+ -+ if (!io_struct || -+ wr_from_too_long(bfqq) || -+ unlikely(bfqq == &bfqd->oom_bfqq)) -+ return NULL; -+ -+ /* If there is only one backlogged queue, don't search. 
*/ -+ if (bfqd->busy_queues == 1) -+ return NULL; -+ -+ in_service_bfqq = bfqd->in_service_queue; -+ -+ if (in_service_bfqq && in_service_bfqq != bfqq && -+ bfqd->in_service_bic && wr_from_too_long(in_service_bfqq) -+ && likely(in_service_bfqq == &bfqd->oom_bfqq)) -+ bfq_log_bfqq(bfqd, bfqq, -+ "would have tried merge with in-service-queue, but wr"); -+ -+ if (!in_service_bfqq || in_service_bfqq == bfqq || -+ !bfqd->in_service_bic || wr_from_too_long(in_service_bfqq) || -+ unlikely(in_service_bfqq == &bfqd->oom_bfqq)) -+ goto check_scheduled; -+ -+ if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && -+ bfqq->entity.parent == in_service_bfqq->entity.parent && -+ bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) { -+ new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); -+ if (new_bfqq) -+ return new_bfqq; -+ } -+ /* -+ * Check whether there is a cooperator among currently scheduled -+ * queues. The only thing we need is that the bio/request is not -+ * NULL, as we need it to establish whether a cooperator exists. -+ */ -+check_scheduled: -+ new_bfqq = bfq_find_close_cooperator(bfqd, bfqq, -+ bfq_io_struct_pos(io_struct, request)); -+ -+ BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent); -+ -+ if (new_bfqq && wr_from_too_long(new_bfqq) && -+ likely(new_bfqq != &bfqd->oom_bfqq) && -+ bfq_may_be_close_cooperator(bfqq, new_bfqq)) -+ bfq_log_bfqq(bfqd, bfqq, -+ "would have merged with bfq%d, but wr", -+ new_bfqq->pid); -+ -+ if (new_bfqq && !wr_from_too_long(new_bfqq) && -+ likely(new_bfqq != &bfqd->oom_bfqq) && -+ bfq_may_be_close_cooperator(bfqq, new_bfqq)) -+ return bfq_setup_merge(bfqq, new_bfqq); -+ -+ return NULL; -+} -+ -+static void bfq_bfqq_save_state(struct bfq_queue *bfqq) -+{ -+ struct bfq_io_cq *bic = bfqq->bic; -+ -+ /* -+ * If !bfqq->bic, the queue is already shared or its requests -+ * have already been redirected to a shared queue; both idle window -+ * and weight raising state have already been saved. Do nothing. 
-+ */ -+ if (!bic) -+ return; -+ -+ bic->saved_idle_window = bfq_bfqq_idle_window(bfqq); -+ bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); -+ bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); -+ bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); -+ bic->saved_wr_coeff = bfqq->wr_coeff; -+ bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; -+ bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; -+ bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; -+ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); -+} -+ -+static void bfq_get_bic_reference(struct bfq_queue *bfqq) -+{ -+ /* -+ * If bfqq->bic has a non-NULL value, the bic to which it belongs -+ * is about to begin using a shared bfq_queue. -+ */ -+ if (bfqq->bic) -+ atomic_long_inc(&bfqq->bic->icq.ioc->refcount); -+} -+ -+static void -+bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, -+ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) -+{ -+ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", -+ (unsigned long) new_bfqq->pid); -+ /* Save weight raising and idle window of the merged queues */ -+ bfq_bfqq_save_state(bfqq); -+ bfq_bfqq_save_state(new_bfqq); -+ if (bfq_bfqq_IO_bound(bfqq)) -+ bfq_mark_bfqq_IO_bound(new_bfqq); -+ bfq_clear_bfqq_IO_bound(bfqq); -+ -+ /* -+ * If bfqq is weight-raised, then let new_bfqq inherit -+ * weight-raising. To reduce false positives, neglect the case -+ * where bfqq has just been created, but has not yet made it -+ * to be weight-raised (which may happen because EQM may merge -+ * bfqq even before bfq_add_request is executed for the first -+ * time for bfqq). Handling this case would however be very -+ * easy, thanks to the flag just_created. 
-+ */ -+ if (new_bfqq->wr_coeff == 1 && bfqq->wr_coeff > 1) { -+ new_bfqq->wr_coeff = bfqq->wr_coeff; -+ new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time; -+ new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish; -+ new_bfqq->wr_start_at_switch_to_srt = -+ bfqq->wr_start_at_switch_to_srt; -+ if (bfq_bfqq_busy(new_bfqq)) { -+ bfqd->wr_busy_queues++; -+ BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues); -+ } -+ -+ new_bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqd, new_bfqq, -+ "wr start after merge with %d, rais_max_time %u", -+ bfqq->pid, -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } -+ -+ if (bfqq->wr_coeff > 1) { /* bfqq has given its wr to new_bfqq */ -+ bfqq->wr_coeff = 1; -+ bfqq->entity.prio_changed = 1; -+ if (bfq_bfqq_busy(bfqq)) { -+ bfqd->wr_busy_queues--; -+ BUG_ON(bfqd->wr_busy_queues < 0); -+ } -+ -+ } -+ -+ bfq_log_bfqq(bfqd, new_bfqq, "merge_bfqqs: wr_busy %d", -+ bfqd->wr_busy_queues); -+ -+ /* -+ * Grab a reference to the bic, to prevent it from being destroyed -+ * before being possibly touched by a bfq_split_bfqq(). -+ */ -+ bfq_get_bic_reference(bfqq); -+ bfq_get_bic_reference(new_bfqq); -+ /* -+ * Merge queues (that is, let bic redirect its requests to new_bfqq) -+ */ -+ bic_set_bfqq(bic, new_bfqq, 1); -+ bfq_mark_bfqq_coop(new_bfqq); -+ /* -+ * new_bfqq now belongs to at least two bics (it is a shared queue): -+ * set new_bfqq->bic to NULL. bfqq either: -+ * - does not belong to any bic any more, and hence bfqq->bic must -+ * be set to NULL, or -+ * - is a queue whose owning bics have already been redirected to a -+ * different queue, hence the queue is destined to not belong to -+ * any bic soon and bfqq->bic is already NULL (therefore the next -+ * assignment causes no harm). 
-+ */ -+ new_bfqq->bic = NULL; -+ bfqq->bic = NULL; -+ /* release process reference to bfqq */ -+ bfq_put_queue(bfqq); -+} -+ -+static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq, -+ struct bio *bio) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ bool is_sync = op_is_sync(bio->bi_opf); -+ struct bfq_io_cq *bic; -+ struct bfq_queue *bfqq, *new_bfqq; -+ -+ /* -+ * Disallow merge of a sync bio into an async request. -+ */ -+ if (is_sync && !rq_is_sync(rq)) -+ return false; -+ -+ /* -+ * Lookup the bfqq that this bio will be queued with. Allow -+ * merge only if rq is queued there. -+ * Queue lock is held here. -+ */ -+ bic = bfq_bic_lookup(bfqd, current->io_context); -+ if (!bic) -+ return false; -+ -+ bfqq = bic_to_bfqq(bic, is_sync); -+ /* -+ * We take advantage of this function to perform an early merge -+ * of the queues of possible cooperating processes. -+ */ -+ if (bfqq) { -+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); -+ if (new_bfqq) { -+ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); -+ /* -+ * If we get here, the bio will be queued in the -+ * shared queue, i.e., new_bfqq, so use new_bfqq -+ * to decide whether bio and rq can be merged. -+ */ -+ bfqq = new_bfqq; -+ } -+ } -+ -+ return bfqq == RQ_BFQQ(rq); -+} -+ -+static int bfq_allow_rq_merge(struct request_queue *q, struct request *rq, -+ struct request *next) -+{ -+ return RQ_BFQQ(rq) == RQ_BFQQ(next); -+} -+ -+/* -+ * Set the maximum time for the in-service queue to consume its -+ * budget. This prevents seeky processes from lowering the throughput. -+ * In practice, a time-slice service scheme is used with seeky -+ * processes. 
-+ */ -+static void bfq_set_budget_timeout(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ unsigned int timeout_coeff; -+ -+ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time) -+ timeout_coeff = 1; -+ else -+ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; -+ -+ bfqd->last_budget_start = ktime_get(); -+ -+ bfqq->budget_timeout = jiffies + -+ bfqd->bfq_timeout * timeout_coeff; -+ -+ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", -+ jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff)); -+} -+ -+static void __bfq_set_in_service_queue(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ if (bfqq) { -+ bfqg_stats_update_avg_queue_size(bfqq_group(bfqq)); -+ bfq_mark_bfqq_must_alloc(bfqq); -+ bfq_clear_bfqq_fifo_expire(bfqq); -+ -+ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; -+ -+ BUG_ON(bfqq == bfqd->in_service_queue); -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ -+ if (time_is_before_jiffies(bfqq->last_wr_start_finish) && -+ bfqq->wr_coeff > 1 && -+ bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && -+ time_is_before_jiffies(bfqq->budget_timeout)) { -+ /* -+ * For soft real-time queues, move the start -+ * of the weight-raising period forward by the -+ * time the queue has not received any -+ * service. Otherwise, a relatively long -+ * service delay is likely to cause the -+ * weight-raising period of the queue to end, -+ * because of the short duration of the -+ * weight-raising period of a soft real-time -+ * queue. It is worth noting that this move -+ * is not so dangerous for the other queues, -+ * because soft real-time queues are not -+ * greedy. -+ * -+ * To not add a further variable, we use the -+ * overloaded field budget_timeout to -+ * determine for how long the queue has not -+ * received service, i.e., how much time has -+ * elapsed since the queue expired. 
However, -+ * this is a little imprecise, because -+ * budget_timeout is set to jiffies if bfqq -+ * not only expires, but also remains with no -+ * request. -+ */ -+ if (time_after(bfqq->budget_timeout, -+ bfqq->last_wr_start_finish)) -+ bfqq->last_wr_start_finish += -+ jiffies - bfqq->budget_timeout; -+ else -+ bfqq->last_wr_start_finish = jiffies; -+ -+ if (time_is_after_jiffies(bfqq->last_wr_start_finish)) { -+ pr_crit( -+ "BFQ WARNING:last %lu budget %lu jiffies %lu", -+ bfqq->last_wr_start_finish, -+ bfqq->budget_timeout, -+ jiffies); -+ pr_crit("diff %lu", jiffies - -+ max_t(unsigned long, -+ bfqq->last_wr_start_finish, -+ bfqq->budget_timeout)); -+ bfqq->last_wr_start_finish = jiffies; -+ } -+ } -+ -+ bfq_set_budget_timeout(bfqd, bfqq); -+ bfq_log_bfqq(bfqd, bfqq, -+ "set_in_service_queue, cur-budget = %d", -+ bfqq->entity.budget); -+ } else -+ bfq_log(bfqd, "set_in_service_queue: NULL"); -+ -+ bfqd->in_service_queue = bfqq; -+} -+ -+/* -+ * Get and set a new queue for service. -+ */ -+static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd); -+ -+ __bfq_set_in_service_queue(bfqd, bfqq); -+ return bfqq; -+} -+ -+static void bfq_arm_slice_timer(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq = bfqd->in_service_queue; -+ struct bfq_io_cq *bic; -+ u32 sl; -+ -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); -+ -+ /* Processes have exited, don't wait. */ -+ bic = bfqd->in_service_bic; -+ if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0) -+ return; -+ -+ bfq_mark_bfqq_wait_request(bfqq); -+ -+ /* -+ * We don't want to idle for seeks, but we do want to allow -+ * fair distribution of slice time for a process doing back-to-back -+ * seeks. So allow a little bit of time for him to submit a new rq. 
-+ * -+ * To prevent processes with (partly) seeky workloads from -+ * being too ill-treated, grant them a small fraction of the -+ * assigned budget before reducing the waiting time to -+ * BFQ_MIN_TT. This happened to help reduce latency. -+ */ -+ sl = bfqd->bfq_slice_idle; -+ /* -+ * Unless the queue is being weight-raised or the scenario is -+ * asymmetric, grant only minimum idle time if the queue -+ * is seeky. A long idling is preserved for a weight-raised -+ * queue, or, more in general, in an asymemtric scenario, -+ * because a long idling is needed for guaranteeing to a queue -+ * its reserved share of the throughput (in particular, it is -+ * needed if the queue has a higher weight than some other -+ * queue). -+ */ -+ if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 && -+ bfq_symmetric_scenario(bfqd)) -+ sl = min_t(u32, sl, BFQ_MIN_TT); -+ -+ bfqd->last_idling_start = ktime_get(); -+ hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl), -+ HRTIMER_MODE_REL); -+ bfqg_stats_set_start_idle_time(bfqq_group(bfqq)); -+ bfq_log(bfqd, "arm idle: %ld/%ld ms", -+ sl / NSEC_PER_MSEC, bfqd->bfq_slice_idle / NSEC_PER_MSEC); -+} -+ -+/* -+ * In autotuning mode, max_budget is dynamically recomputed as the -+ * amount of sectors transferred in timeout at the estimated peak -+ * rate. This enables BFQ to utilize a full timeslice with a full -+ * budget, even if the in-service queue is served at peak rate. And -+ * this maximises throughput with sequential workloads. -+ */ -+static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd) -+{ -+ return (u64)bfqd->peak_rate * USEC_PER_MSEC * -+ jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT; -+} -+ -+/* -+ * Update parameters related to throughput and responsiveness, as a -+ * function of the estimated peak rate. See comments on -+ * bfq_calc_max_budget(), and on T_slow and T_fast arrays. 
-+ */ -+static void update_thr_responsiveness_params(struct bfq_data *bfqd) -+{ -+ int dev_type = blk_queue_nonrot(bfqd->queue); -+ -+ if (bfqd->bfq_user_max_budget == 0) { -+ bfqd->bfq_max_budget = -+ bfq_calc_max_budget(bfqd); -+ BUG_ON(bfqd->bfq_max_budget < 0); -+ bfq_log(bfqd, "new max_budget = %d", -+ bfqd->bfq_max_budget); -+ } -+ -+ if (bfqd->device_speed == BFQ_BFQD_FAST && -+ bfqd->peak_rate < device_speed_thresh[dev_type]) { -+ bfqd->device_speed = BFQ_BFQD_SLOW; -+ bfqd->RT_prod = R_slow[dev_type] * -+ T_slow[dev_type]; -+ } else if (bfqd->device_speed == BFQ_BFQD_SLOW && -+ bfqd->peak_rate > device_speed_thresh[dev_type]) { -+ bfqd->device_speed = BFQ_BFQD_FAST; -+ bfqd->RT_prod = R_fast[dev_type] * -+ T_fast[dev_type]; -+ } -+ -+ bfq_log(bfqd, -+"dev_type %s dev_speed_class = %s (%llu sects/sec), thresh %llu setcs/sec", -+ dev_type == 0 ? "ROT" : "NONROT", -+ bfqd->device_speed == BFQ_BFQD_FAST ? "FAST" : "SLOW", -+ bfqd->device_speed == BFQ_BFQD_FAST ? -+ (USEC_PER_SEC*(u64)R_fast[dev_type])>>BFQ_RATE_SHIFT : -+ (USEC_PER_SEC*(u64)R_slow[dev_type])>>BFQ_RATE_SHIFT, -+ (USEC_PER_SEC*(u64)device_speed_thresh[dev_type])>> -+ BFQ_RATE_SHIFT); -+} -+ -+static void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq) -+{ -+ if (rq != NULL) { /* new rq dispatch now, reset accordingly */ -+ bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns() ; -+ bfqd->peak_rate_samples = 1; -+ bfqd->sequential_samples = 0; -+ bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size = -+ blk_rq_sectors(rq); -+ } else /* no new rq dispatched, just reset the number of samples */ -+ bfqd->peak_rate_samples = 0; /* full re-init on next disp. 
*/ -+ -+ bfq_log(bfqd, -+ "reset_rate_computation at end, sample %u/%u tot_sects %llu", -+ bfqd->peak_rate_samples, bfqd->sequential_samples, -+ bfqd->tot_sectors_dispatched); -+} -+ -+static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) -+{ -+ u32 rate, weight, divisor; -+ -+ /* -+ * For the convergence property to hold (see comments on -+ * bfq_update_peak_rate()) and for the assessment to be -+ * reliable, a minimum number of samples must be present, and -+ * a minimum amount of time must have elapsed. If not so, do -+ * not compute new rate. Just reset parameters, to get ready -+ * for a new evaluation attempt. -+ */ -+ if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES || -+ bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL) { -+ bfq_log(bfqd, -+ "update_rate_reset: only resetting, delta_first %lluus samples %d", -+ bfqd->delta_from_first>>10, bfqd->peak_rate_samples); -+ goto reset_computation; -+ } -+ -+ /* -+ * If a new request completion has occurred after last -+ * dispatch, then, to approximate the rate at which requests -+ * have been served by the device, it is more precise to -+ * extend the observation interval to the last completion. -+ */ -+ bfqd->delta_from_first = -+ max_t(u64, bfqd->delta_from_first, -+ bfqd->last_completion - bfqd->first_dispatch); -+ -+ BUG_ON(bfqd->delta_from_first == 0); -+ /* -+ * Rate computed in sects/usec, and not sects/nsec, for -+ * precision issues. 
-+ */ -+ rate = div64_ul(bfqd->tot_sectors_dispatched<<BFQ_RATE_SHIFT, -+ div_u64(bfqd->delta_from_first, NSEC_PER_USEC)); -+ -+ bfq_log(bfqd, -+"update_rate_reset: tot_sects %llu delta_first %lluus rate %llu sects/s (%d)", -+ bfqd->tot_sectors_dispatched, bfqd->delta_from_first>>10, -+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), -+ rate > 20<<BFQ_RATE_SHIFT); -+ -+ /* -+ * Peak rate not updated if: -+ * - the percentage of sequential dispatches is below 3/4 of the -+ * total, and rate is below the current estimated peak rate -+ * - rate is unreasonably high (> 20M sectors/sec) -+ */ -+ if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 && -+ rate <= bfqd->peak_rate) || -+ rate > 20<<BFQ_RATE_SHIFT) { -+ bfq_log(bfqd, -+ "update_rate_reset: goto reset, samples %u/%u rate/peak %llu/%llu", -+ bfqd->peak_rate_samples, bfqd->sequential_samples, -+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), -+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); -+ goto reset_computation; -+ } else { -+ bfq_log(bfqd, -+ "update_rate_reset: do update, samples %u/%u rate/peak %llu/%llu", -+ bfqd->peak_rate_samples, bfqd->sequential_samples, -+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), -+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); -+ } -+ -+ /* -+ * We have to update the peak rate, at last! To this purpose, -+ * we use a low-pass filter. We compute the smoothing constant -+ * of the filter as a function of the 'weight' of the new -+ * measured rate. -+ * -+ * As can be seen in next formulas, we define this weight as a -+ * quantity proportional to how sequential the workload is, -+ * and to how long the observation time interval is. -+ * -+ * The weight runs from 0 to 8. The maximum value of the -+ * weight, 8, yields the minimum value for the smoothing -+ * constant. At this minimum value for the smoothing constant, -+ * the measured rate contributes for half of the next value of -+ * the estimated peak rate. 
-+ * -+ * So, the first step is to compute the weight as a function -+ * of how sequential the workload is. Note that the weight -+ * cannot reach 9, because bfqd->sequential_samples cannot -+ * become equal to bfqd->peak_rate_samples, which, in its -+ * turn, holds true because bfqd->sequential_samples is not -+ * incremented for the first sample. -+ */ -+ weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples; -+ -+ /* -+ * Second step: further refine the weight as a function of the -+ * duration of the observation interval. -+ */ -+ weight = min_t(u32, 8, -+ div_u64(weight * bfqd->delta_from_first, -+ BFQ_RATE_REF_INTERVAL)); -+ -+ /* -+ * Divisor ranging from 10, for minimum weight, to 2, for -+ * maximum weight. -+ */ -+ divisor = 10 - weight; -+ BUG_ON(divisor == 0); -+ -+ /* -+ * Finally, update peak rate: -+ * -+ * peak_rate = peak_rate * (divisor-1) / divisor + rate / divisor -+ */ -+ bfqd->peak_rate *= divisor-1; -+ bfqd->peak_rate /= divisor; -+ rate /= divisor; /* smoothing constant alpha = 1/divisor */ -+ -+ bfq_log(bfqd, -+ "update_rate_reset: divisor %d tmp_peak_rate %llu tmp_rate %u", -+ divisor, -+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT), -+ (u32)((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT)); -+ -+ BUG_ON(bfqd->peak_rate == 0); -+ BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT); -+ -+ bfqd->peak_rate += rate; -+ update_thr_responsiveness_params(bfqd); -+ BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT); -+ -+reset_computation: -+ bfq_reset_rate_computation(bfqd, rq); -+} -+ -+/* -+ * Update the read/write peak rate (the main quantity used for -+ * auto-tuning, see update_thr_responsiveness_params()). -+ * -+ * It is not trivial to estimate the peak rate (correctly): because of -+ * the presence of sw and hw queues between the scheduler and the -+ * device components that finally serve I/O requests, it is hard to -+ * say exactly when a given dispatched request is served inside the -+ * device, and for how long. 
As a consequence, it is hard to know -+ * precisely at what rate a given set of requests is actually served -+ * by the device. -+ * -+ * On the opposite end, the dispatch time of any request is trivially -+ * available, and, from this piece of information, the "dispatch rate" -+ * of requests can be immediately computed. So, the idea in the next -+ * function is to use what is known, namely request dispatch times -+ * (plus, when useful, request completion times), to estimate what is -+ * unknown, namely in-device request service rate. -+ * -+ * The main issue is that, because of the above facts, the rate at -+ * which a certain set of requests is dispatched over a certain time -+ * interval can vary greatly with respect to the rate at which the -+ * same requests are then served. But, since the size of any -+ * intermediate queue is limited, and the service scheme is lossless -+ * (no request is silently dropped), the following obvious convergence -+ * property holds: the number of requests dispatched MUST become -+ * closer and closer to the number of requests completed as the -+ * observation interval grows. This is the key property used in -+ * the next function to estimate the peak service rate as a function -+ * of the observed dispatch rate. The function assumes to be invoked -+ * on every request dispatch. -+ */ -+static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) -+{ -+ u64 now_ns = ktime_get_ns(); -+ -+ if (bfqd->peak_rate_samples == 0) { /* first dispatch */ -+ bfq_log(bfqd, -+ "update_peak_rate: goto reset, samples %d", -+ bfqd->peak_rate_samples) ; -+ bfq_reset_rate_computation(bfqd, rq); -+ goto update_last_values; /* will add one sample */ -+ } -+ -+ /* -+ * Device idle for very long: the observation interval lasting -+ * up to this dispatch cannot be a valid observation interval -+ * for computing a new peak rate (similarly to the late- -+ * completion event in bfq_completed_request()). 
Go to -+ * update_rate_and_reset to have the following three steps -+ * taken: -+ * - close the observation interval at the last (previous) -+ * request dispatch or completion -+ * - compute rate, if possible, for that observation interval -+ * - start a new observation interval with this dispatch -+ */ -+ if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC && -+ bfqd->rq_in_driver == 0) { -+ bfq_log(bfqd, -+"update_peak_rate: jumping to updating&resetting delta_last %lluus samples %d", -+ (now_ns - bfqd->last_dispatch)>>10, -+ bfqd->peak_rate_samples) ; -+ goto update_rate_and_reset; -+ } -+ -+ /* Update sampling information */ -+ bfqd->peak_rate_samples++; -+ -+ if ((bfqd->rq_in_driver > 0 || -+ now_ns - bfqd->last_completion < BFQ_MIN_TT) -+ && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR) -+ bfqd->sequential_samples++; -+ -+ bfqd->tot_sectors_dispatched += blk_rq_sectors(rq); -+ -+ /* Reset max observed rq size every 32 dispatches */ -+ if (likely(bfqd->peak_rate_samples % 32)) -+ bfqd->last_rq_max_size = -+ max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size); -+ else -+ bfqd->last_rq_max_size = blk_rq_sectors(rq); -+ -+ bfqd->delta_from_first = now_ns - bfqd->first_dispatch; -+ -+ bfq_log(bfqd, -+ "update_peak_rate: added samples %u/%u tot_sects %llu delta_first %lluus", -+ bfqd->peak_rate_samples, bfqd->sequential_samples, -+ bfqd->tot_sectors_dispatched, -+ bfqd->delta_from_first>>10); -+ -+ /* Target observation interval not yet reached, go on sampling */ -+ if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL) -+ goto update_last_values; -+ -+update_rate_and_reset: -+ bfq_update_rate_reset(bfqd, rq); -+update_last_values: -+ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); -+ bfqd->last_dispatch = now_ns; -+ -+ bfq_log(bfqd, -+ "update_peak_rate: delta_first %lluus last_pos %llu peak_rate %llu", -+ (now_ns - bfqd->first_dispatch)>>10, -+ (unsigned long long) bfqd->last_position, -+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); 
-+ bfq_log(bfqd, -+ "update_peak_rate: samples at end %d", bfqd->peak_rate_samples); -+} -+ -+/* -+ * Move request from internal lists to the dispatch list of the request queue -+ */ -+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ -+ /* -+ * For consistency, the next instruction should have been executed -+ * after removing the request from the queue and dispatching it. -+ * We execute instead this instruction before bfq_remove_request() -+ * (and hence introduce a temporary inconsistency), for efficiency. -+ * In fact, in a forced_dispatch, this prevents two counters related -+ * to bfqq->dispatched to risk to be uselessly decremented if bfqq -+ * is not in service, and then to be incremented again after -+ * incrementing bfqq->dispatched. -+ */ -+ bfqq->dispatched++; -+ bfq_update_peak_rate(q->elevator->elevator_data, rq); -+ -+ bfq_remove_request(rq); -+ elv_dispatch_sort(q, rq); -+} -+ -+static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ BUG_ON(bfqq != bfqd->in_service_queue); -+ -+ /* -+ * If this bfqq is shared between multiple processes, check -+ * to make sure that those processes are still issuing I/Os -+ * within the mean seek distance. If not, it may be time to -+ * break the queues apart again. -+ */ -+ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq)) -+ bfq_mark_bfqq_split_coop(bfqq); -+ -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { -+ if (bfqq->dispatched == 0) -+ /* -+ * Overloading budget_timeout field to store -+ * the time at which the queue remains with no -+ * backlog and no outstanding request; used by -+ * the weight-raising mechanism. -+ */ -+ bfqq->budget_timeout = jiffies; -+ -+ bfq_del_bfqq_busy(bfqd, bfqq, true); -+ } else { -+ bfq_requeue_bfqq(bfqd, bfqq); -+ /* -+ * Resort priority tree of potential close cooperators. 
-+ */ -+ bfq_pos_tree_add_move(bfqd, bfqq); -+ } -+ -+ /* -+ * All in-service entities must have been properly deactivated -+ * or requeued before executing the next function, which -+ * resets all in-service entites as no more in service. -+ */ -+ __bfq_bfqd_reset_in_service(bfqd); -+} -+ -+/** -+ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior. -+ * @bfqd: device data. -+ * @bfqq: queue to update. -+ * @reason: reason for expiration. -+ * -+ * Handle the feedback on @bfqq budget at queue expiration. -+ * See the body for detailed comments. -+ */ -+static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ enum bfqq_expiration reason) -+{ -+ struct request *next_rq; -+ int budget, min_budget; -+ -+ BUG_ON(bfqq != bfqd->in_service_queue); -+ -+ min_budget = bfq_min_budget(bfqd); -+ -+ if (bfqq->wr_coeff == 1) -+ budget = bfqq->max_budget; -+ else /* -+ * Use a constant, low budget for weight-raised queues, -+ * to help achieve a low latency. Keep it slightly higher -+ * than the minimum possible budget, to cause a little -+ * bit fewer expirations. -+ */ -+ budget = 2 * min_budget; -+ -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d", -+ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d", -+ budget, bfq_min_budget(bfqd)); -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", -+ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); -+ -+ if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) { -+ switch (reason) { -+ /* -+ * Caveat: in all the following cases we trade latency -+ * for throughput. 
-+ */ -+ case BFQ_BFQQ_TOO_IDLE: -+ /* -+ * This is the only case where we may reduce -+ * the budget: if there is no request of the -+ * process still waiting for completion, then -+ * we assume (tentatively) that the timer has -+ * expired because the batch of requests of -+ * the process could have been served with a -+ * smaller budget. Hence, betting that -+ * process will behave in the same way when it -+ * becomes backlogged again, we reduce its -+ * next budget. As long as we guess right, -+ * this budget cut reduces the latency -+ * experienced by the process. -+ * -+ * However, if there are still outstanding -+ * requests, then the process may have not yet -+ * issued its next request just because it is -+ * still waiting for the completion of some of -+ * the still outstanding ones. So in this -+ * subcase we do not reduce its budget, on the -+ * contrary we increase it to possibly boost -+ * the throughput, as discussed in the -+ * comments to the BUDGET_TIMEOUT case. -+ */ -+ if (bfqq->dispatched > 0) /* still outstanding reqs */ -+ budget = min(budget * 2, bfqd->bfq_max_budget); -+ else { -+ if (budget > 5 * min_budget) -+ budget -= 4 * min_budget; -+ else -+ budget = min_budget; -+ } -+ break; -+ case BFQ_BFQQ_BUDGET_TIMEOUT: -+ /* -+ * We double the budget here because it gives -+ * the chance to boost the throughput if this -+ * is not a seeky process (and has bumped into -+ * this timeout because of, e.g., ZBR). -+ */ -+ budget = min(budget * 2, bfqd->bfq_max_budget); -+ break; -+ case BFQ_BFQQ_BUDGET_EXHAUSTED: -+ /* -+ * The process still has backlog, and did not -+ * let either the budget timeout or the disk -+ * idling timeout expire. Hence it is not -+ * seeky, has a short thinktime and may be -+ * happy with a higher budget too. So -+ * definitely increase the budget of this good -+ * candidate to boost the disk throughput. 
-+ */ -+ budget = min(budget * 4, bfqd->bfq_max_budget); -+ break; -+ case BFQ_BFQQ_NO_MORE_REQUESTS: -+ /* -+ * For queues that expire for this reason, it -+ * is particularly important to keep the -+ * budget close to the actual service they -+ * need. Doing so reduces the timestamp -+ * misalignment problem described in the -+ * comments in the body of -+ * __bfq_activate_entity. In fact, suppose -+ * that a queue systematically expires for -+ * BFQ_BFQQ_NO_MORE_REQUESTS and presents a -+ * new request in time to enjoy timestamp -+ * back-shifting. The larger the budget of the -+ * queue is with respect to the service the -+ * queue actually requests in each service -+ * slot, the more times the queue can be -+ * reactivated with the same virtual finish -+ * time. It follows that, even if this finish -+ * time is pushed to the system virtual time -+ * to reduce the consequent timestamp -+ * misalignment, the queue unjustly enjoys for -+ * many re-activations a lower finish time -+ * than all newly activated queues. -+ * -+ * The service needed by bfqq is measured -+ * quite precisely by bfqq->entity.service. -+ * Since bfqq does not enjoy device idling, -+ * bfqq->entity.service is equal to the number -+ * of sectors that the process associated with -+ * bfqq requested to read/write before waiting -+ * for request completions, or blocking for -+ * other reasons. -+ */ -+ budget = max_t(int, bfqq->entity.service, min_budget); -+ break; -+ default: -+ return; -+ } -+ } else if (!bfq_bfqq_sync(bfqq)) -+ /* -+ * Async queues get always the maximum possible -+ * budget, as for them we do not care about latency -+ * (in addition, their ability to dispatch is limited -+ * by the charging factor). 
-+ */ -+ budget = bfqd->bfq_max_budget; -+ -+ bfqq->max_budget = budget; -+ -+ if (bfqd->budgets_assigned >= bfq_stats_min_budgets && -+ !bfqd->bfq_user_max_budget) -+ bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget); -+ -+ /* -+ * If there is still backlog, then assign a new budget, making -+ * sure that it is large enough for the next request. Since -+ * the finish time of bfqq must be kept in sync with the -+ * budget, be sure to call __bfq_bfqq_expire() *after* this -+ * update. -+ * -+ * If there is no backlog, then no need to update the budget; -+ * it will be updated on the arrival of a new request. -+ */ -+ next_rq = bfqq->next_rq; -+ if (next_rq) { -+ BUG_ON(reason == BFQ_BFQQ_TOO_IDLE || -+ reason == BFQ_BFQQ_NO_MORE_REQUESTS); -+ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, -+ bfq_serv_to_charge(next_rq, bfqq)); -+ BUG_ON(!bfq_bfqq_busy(bfqq)); -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ } -+ -+ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d", -+ next_rq ? blk_rq_sectors(next_rq) : 0, -+ bfqq->entity.budget); -+} -+ -+/* -+ * Return true if the process associated with bfqq is "slow". The slow -+ * flag is used, in addition to the budget timeout, to reduce the -+ * amount of service provided to seeky processes, and thus reduce -+ * their chances to lower the throughput. More details in the comments -+ * on the function bfq_bfqq_expire(). -+ * -+ * An important observation is in order: as discussed in the comments -+ * on the function bfq_update_peak_rate(), with devices with internal -+ * queues, it is hard if ever possible to know when and for how long -+ * an I/O request is processed by the device (apart from the trivial -+ * I/O pattern where a new request is dispatched only after the -+ * previous one has been completed). This makes it hard to evaluate -+ * the real rate at which the I/O requests of each bfq_queue are -+ * served. 
In fact, for an I/O scheduler like BFQ, serving a -+ * bfq_queue means just dispatching its requests during its service -+ * slot (i.e., until the budget of the queue is exhausted, or the -+ * queue remains idle, or, finally, a timeout fires). But, during the -+ * service slot of a bfq_queue, around 100 ms at most, the device may -+ * be even still processing requests of bfq_queues served in previous -+ * service slots. On the opposite end, the requests of the in-service -+ * bfq_queue may be completed after the service slot of the queue -+ * finishes. -+ * -+ * Anyway, unless more sophisticated solutions are used -+ * (where possible), the sum of the sizes of the requests dispatched -+ * during the service slot of a bfq_queue is probably the only -+ * approximation available for the service received by the bfq_queue -+ * during its service slot. And this sum is the quantity used in this -+ * function to evaluate the I/O speed of a process. -+ */ -+static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ bool compensate, enum bfqq_expiration reason, -+ unsigned long *delta_ms) -+{ -+ ktime_t delta_ktime; -+ u32 delta_usecs; -+ bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */ -+ -+ if (!bfq_bfqq_sync(bfqq)) -+ return false; -+ -+ if (compensate) -+ delta_ktime = bfqd->last_idling_start; -+ else -+ delta_ktime = ktime_get(); -+ delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start); -+ delta_usecs = ktime_to_us(delta_ktime); -+ -+ /* don't use too short time intervals */ -+ if (delta_usecs < 1000) { -+ if (blk_queue_nonrot(bfqd->queue)) -+ /* -+ * give same worst-case guarantees as idling -+ * for seeky -+ */ -+ *delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC; -+ else /* charge at least one seek */ -+ *delta_ms = bfq_slice_idle / NSEC_PER_MSEC; -+ -+ bfq_log(bfqd, "bfq_bfqq_is_slow: too short %u", delta_usecs); -+ -+ return slow; -+ } -+ -+ *delta_ms = delta_usecs / USEC_PER_MSEC; -+ -+ /* -+ * Use only long (> 20ms) intervals 
to filter out excessive -+ * spikes in service rate estimation. -+ */ -+ if (delta_usecs > 20000) { -+ /* -+ * Caveat for rotational devices: processes doing I/O -+ * in the slower disk zones tend to be slow(er) even -+ * if not seeky. In this respect, the estimated peak -+ * rate is likely to be an average over the disk -+ * surface. Accordingly, to not be too harsh with -+ * unlucky processes, a process is deemed slow only if -+ * its rate has been lower than half of the estimated -+ * peak rate. -+ */ -+ slow = bfqq->entity.service < bfqd->bfq_max_budget / 2; -+ bfq_log(bfqd, "bfq_bfqq_is_slow: relative rate %d/%d", -+ bfqq->entity.service, bfqd->bfq_max_budget); -+ } -+ -+ bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow); -+ -+ return slow; -+} -+ -+/* -+ * To be deemed as soft real-time, an application must meet two -+ * requirements. First, the application must not require an average -+ * bandwidth higher than the approximate bandwidth required to playback or -+ * record a compressed high-definition video. -+ * The next function is invoked on the completion of the last request of a -+ * batch, to compute the next-start time instant, soft_rt_next_start, such -+ * that, if the next request of the application does not arrive before -+ * soft_rt_next_start, then the above requirement on the bandwidth is met. -+ * -+ * The second requirement is that the request pattern of the application is -+ * isochronous, i.e., that, after issuing a request or a batch of requests, -+ * the application stops issuing new requests until all its pending requests -+ * have been completed. After that, the application may issue a new batch, -+ * and so on. -+ * For this reason the next function is invoked to compute -+ * soft_rt_next_start only for applications that meet this requirement, -+ * whereas soft_rt_next_start is set to infinity for applications that do -+ * not. 
-+ * -+ * Unfortunately, even a greedy application may happen to behave in an -+ * isochronous way if the CPU load is high. In fact, the application may -+ * stop issuing requests while the CPUs are busy serving other processes, -+ * then restart, then stop again for a while, and so on. In addition, if -+ * the disk achieves a low enough throughput with the request pattern -+ * issued by the application (e.g., because the request pattern is random -+ * and/or the device is slow), then the application may meet the above -+ * bandwidth requirement too. To prevent such a greedy application to be -+ * deemed as soft real-time, a further rule is used in the computation of -+ * soft_rt_next_start: soft_rt_next_start must be higher than the current -+ * time plus the maximum time for which the arrival of a request is waited -+ * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. -+ * This filters out greedy applications, as the latter issue instead their -+ * next request as soon as possible after the last one has been completed -+ * (in contrast, when a batch of requests is completed, a soft real-time -+ * application spends some time processing data). -+ * -+ * Unfortunately, the last filter may easily generate false positives if -+ * only bfqd->bfq_slice_idle is used as a reference time interval and one -+ * or both the following cases occur: -+ * 1) HZ is so low that the duration of a jiffy is comparable to or higher -+ * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with -+ * HZ=100. -+ * 2) jiffies, instead of increasing at a constant rate, may stop increasing -+ * for a while, then suddenly 'jump' by several units to recover the lost -+ * increments. This seems to happen, e.g., inside virtual machines. -+ * To address this issue, we do not use as a reference time interval just -+ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. 
In -+ * particular we add the minimum number of jiffies for which the filter -+ * seems to be quite precise also in embedded systems and KVM/QEMU virtual -+ * machines. -+ */ -+static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ bfq_log_bfqq(bfqd, bfqq, -+"softrt_next_start: service_blkg %lu soft_rate %u sects/sec interval %u", -+ bfqq->service_from_backlogged, -+ bfqd->bfq_wr_max_softrt_rate, -+ jiffies_to_msecs(HZ * bfqq->service_from_backlogged / -+ bfqd->bfq_wr_max_softrt_rate)); -+ -+ return max(bfqq->last_idle_bklogged + -+ HZ * bfqq->service_from_backlogged / -+ bfqd->bfq_wr_max_softrt_rate, -+ jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); -+} -+ -+/* -+ * Return the farthest future time instant according to jiffies -+ * macros. -+ */ -+static unsigned long bfq_greatest_from_now(void) -+{ -+ return jiffies + MAX_JIFFY_OFFSET; -+} -+ -+/* -+ * Return the farthest past time instant according to jiffies -+ * macros. -+ */ -+static unsigned long bfq_smallest_from_now(void) -+{ -+ return jiffies - MAX_JIFFY_OFFSET; -+} -+ -+/** -+ * bfq_bfqq_expire - expire a queue. -+ * @bfqd: device owning the queue. -+ * @bfqq: the queue to expire. -+ * @compensate: if true, compensate for the time spent idling. -+ * @reason: the reason causing the expiration. -+ * -+ * If the process associated with bfqq does slow I/O (e.g., because it -+ * issues random requests), we charge bfqq with the time it has been -+ * in service instead of the service it has received (see -+ * bfq_bfqq_charge_time for details on how this goal is achieved). As -+ * a consequence, bfqq will typically get higher timestamps upon -+ * reactivation, and hence it will be rescheduled as if it had -+ * received more service than what it has actually received. In the -+ * end, bfqq receives less service in proportion to how slowly its -+ * associated process consumes its budgets (and hence how seriously it -+ * tends to lower the throughput). 
In addition, this time-charging -+ * strategy guarantees time fairness among slow processes. In -+ * contrast, if the process associated with bfqq is not slow, we -+ * charge bfqq exactly with the service it has received. -+ * -+ * Charging time to the first type of queues and the exact service to -+ * the other has the effect of using the WF2Q+ policy to schedule the -+ * former on a timeslice basis, without violating service domain -+ * guarantees among the latter. -+ */ -+static void bfq_bfqq_expire(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ bool compensate, -+ enum bfqq_expiration reason) -+{ -+ bool slow; -+ unsigned long delta = 0; -+ struct bfq_entity *entity = &bfqq->entity; -+ int ref; -+ -+ BUG_ON(bfqq != bfqd->in_service_queue); -+ -+ /* -+ * Check whether the process is slow (see bfq_bfqq_is_slow). -+ */ -+ slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); -+ -+ /* -+ * Increase service_from_backlogged before next statement, -+ * because the possible next invocation of -+ * bfq_bfqq_charge_time would likely inflate -+ * entity->service. In contrast, service_from_backlogged must -+ * contain real service, to enable the soft real-time -+ * heuristic to correctly compute the bandwidth consumed by -+ * bfqq. -+ */ -+ bfqq->service_from_backlogged += entity->service; -+ -+ /* -+ * As above explained, charge slow (typically seeky) and -+ * timed-out queues with the time and not the service -+ * received, to favor sequential workloads. -+ * -+ * Processes doing I/O in the slower disk zones will tend to -+ * be slow(er) even if not seeky. Therefore, since the -+ * estimated peak rate is actually an average over the disk -+ * surface, these processes may timeout just for bad luck. To -+ * avoid punishing them, do not charge time to processes that -+ * succeeded in consuming at least 2/3 of their budget. 
This -+ * allows BFQ to preserve enough elasticity to still perform -+ * bandwidth, and not time, distribution with little unlucky -+ * or quasi-sequential processes. -+ */ -+ if (bfqq->wr_coeff == 1 && -+ (slow || -+ (reason == BFQ_BFQQ_BUDGET_TIMEOUT && -+ bfq_bfqq_budget_left(bfqq) >= entity->budget / 3))) -+ bfq_bfqq_charge_time(bfqd, bfqq, delta); -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ -+ if (reason == BFQ_BFQQ_TOO_IDLE && -+ entity->service <= 2 * entity->budget / 10) -+ bfq_clear_bfqq_IO_bound(bfqq); -+ -+ if (bfqd->low_latency && bfqq->wr_coeff == 1) -+ bfqq->last_wr_start_finish = jiffies; -+ -+ if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 && -+ RB_EMPTY_ROOT(&bfqq->sort_list)) { -+ /* -+ * If we get here, and there are no outstanding -+ * requests, then the request pattern is isochronous -+ * (see the comments on the function -+ * bfq_bfqq_softrt_next_start()). Thus we can compute -+ * soft_rt_next_start. If, instead, the queue still -+ * has outstanding requests, then we have to wait for -+ * the completion of all the outstanding requests to -+ * discover whether the request pattern is actually -+ * isochronous. -+ */ -+ BUG_ON(bfqd->busy_queues < 1); -+ if (bfqq->dispatched == 0) { -+ bfqq->soft_rt_next_start = -+ bfq_bfqq_softrt_next_start(bfqd, bfqq); -+ bfq_log_bfqq(bfqd, bfqq, "new soft_rt_next %lu", -+ bfqq->soft_rt_next_start); -+ } else { -+ /* -+ * The application is still waiting for the -+ * completion of one or more requests: -+ * prevent it from possibly being incorrectly -+ * deemed as soft real-time by setting its -+ * soft_rt_next_start to infinity. In fact, -+ * without this assignment, the application -+ * would be incorrectly deemed as soft -+ * real-time if: -+ * 1) it issued a new request before the -+ * completion of all its in-flight -+ * requests, and -+ * 2) at that time, its soft_rt_next_start -+ * happened to be in the past. 
-+ */ -+ bfqq->soft_rt_next_start = -+ bfq_greatest_from_now(); -+ /* -+ * Schedule an update of soft_rt_next_start to when -+ * the task may be discovered to be isochronous. -+ */ -+ bfq_mark_bfqq_softrt_update(bfqq); -+ } -+ } -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "expire (%d, slow %d, num_disp %d, idle_win %d, weight %d)", -+ reason, slow, bfqq->dispatched, -+ bfq_bfqq_idle_window(bfqq), entity->weight); -+ -+ /* -+ * Increase, decrease or leave budget unchanged according to -+ * reason. -+ */ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); -+ BUG_ON(bfqq->next_rq == NULL && -+ bfqq->entity.budget < bfqq->entity.service); -+ ref = bfqq->ref; -+ __bfq_bfqq_expire(bfqd, bfqq); -+ -+ BUG_ON(ref > 1 && -+ !bfq_bfqq_busy(bfqq) && reason == BFQ_BFQQ_BUDGET_EXHAUSTED && -+ !bfq_class_idle(bfqq)); -+ -+ /* mark bfqq as waiting a request only if a bic still points to it */ -+ if (ref > 1 && !bfq_bfqq_busy(bfqq) && -+ reason != BFQ_BFQQ_BUDGET_TIMEOUT && -+ reason != BFQ_BFQQ_BUDGET_EXHAUSTED) -+ bfq_mark_bfqq_non_blocking_wait_rq(bfqq); -+} -+ -+/* -+ * Budget timeout is not implemented through a dedicated timer, but -+ * just checked on request arrivals and completions, as well as on -+ * idle timer expirations. -+ */ -+static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) -+{ -+ return time_is_before_eq_jiffies(bfqq->budget_timeout); -+} -+ -+/* -+ * If we expire a queue that is actively waiting (i.e., with the -+ * device idled) for the arrival of a new request, then we may incur -+ * the timestamp misalignment problem described in the body of the -+ * function __bfq_activate_entity. Hence we return true only if this -+ * condition does not hold, or if the queue is slow enough to deserve -+ * only to be kicked off for preserving a high throughput. 
-+ */ -+static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) -+{ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "may_budget_timeout: wait_request %d left %d timeout %d", -+ bfq_bfqq_wait_request(bfqq), -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, -+ bfq_bfqq_budget_timeout(bfqq)); -+ -+ return (!bfq_bfqq_wait_request(bfqq) || -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3) -+ && -+ bfq_bfqq_budget_timeout(bfqq); -+} -+ -+/* -+ * For a queue that becomes empty, device idling is allowed only if -+ * this function returns true for that queue. As a consequence, since -+ * device idling plays a critical role for both throughput boosting -+ * and service guarantees, the return value of this function plays a -+ * critical role as well. -+ * -+ * In a nutshell, this function returns true only if idling is -+ * beneficial for throughput or, even if detrimental for throughput, -+ * idling is however necessary to preserve service guarantees (low -+ * latency, desired throughput distribution, ...). In particular, on -+ * NCQ-capable devices, this function tries to return false, so as to -+ * help keep the drives' internal queues full, whenever this helps the -+ * device boost the throughput without causing any service-guarantee -+ * issue. -+ * -+ * In more detail, the return value of this function is obtained by, -+ * first, computing a number of boolean variables that take into -+ * account throughput and service-guarantee issues, and, then, -+ * combining these variables in a logical expression. Most of the -+ * issues taken into account are not trivial. We discuss these issues -+ * while introducing the variables. 
-+ */ -+static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) -+{ -+ struct bfq_data *bfqd = bfqq->bfqd; -+ bool idling_boosts_thr, idling_boosts_thr_without_issues, -+ idling_needed_for_service_guarantees, -+ asymmetric_scenario; -+ -+ if (bfqd->strict_guarantees) -+ return true; -+ -+ /* -+ * The next variable takes into account the cases where idling -+ * boosts the throughput. -+ * -+ * The value of the variable is computed considering, first, that -+ * idling is virtually always beneficial for the throughput if: -+ * (a) the device is not NCQ-capable, or -+ * (b) regardless of the presence of NCQ, the device is rotational -+ * and the request pattern for bfqq is I/O-bound and sequential. -+ * -+ * Secondly, and in contrast to the above item (b), idling an -+ * NCQ-capable flash-based device would not boost the -+ * throughput even with sequential I/O; rather it would lower -+ * the throughput in proportion to how fast the device -+ * is. Accordingly, the next variable is true if any of the -+ * above conditions (a) and (b) is true, and, in particular, -+ * happens to be false if bfqd is an NCQ-capable flash-based -+ * device. -+ */ -+ idling_boosts_thr = !bfqd->hw_tag || -+ (!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq) && -+ bfq_bfqq_idle_window(bfqq)); -+ -+ /* -+ * The value of the next variable, -+ * idling_boosts_thr_without_issues, is equal to that of -+ * idling_boosts_thr, unless a special case holds. In this -+ * special case, described below, idling may cause problems to -+ * weight-raised queues. -+ * -+ * When the request pool is saturated (e.g., in the presence -+ * of write hogs), if the processes associated with -+ * non-weight-raised queues ask for requests at a lower rate, -+ * then processes associated with weight-raised queues have a -+ * higher probability to get a request from the pool -+ * immediately (or at least soon) when they need one. 
Thus -+ * they have a higher probability to actually get a fraction -+ * of the device throughput proportional to their high -+ * weight. This is especially true with NCQ-capable drives, -+ * which enqueue several requests in advance, and further -+ * reorder internally-queued requests. -+ * -+ * For this reason, we force to false the value of -+ * idling_boosts_thr_without_issues if there are weight-raised -+ * busy queues. In this case, and if bfqq is not weight-raised, -+ * this guarantees that the device is not idled for bfqq (if, -+ * instead, bfqq is weight-raised, then idling will be -+ * guaranteed by another variable, see below). Combined with -+ * the timestamping rules of BFQ (see [1] for details), this -+ * behavior causes bfqq, and hence any sync non-weight-raised -+ * queue, to get a lower number of requests served, and thus -+ * to ask for a lower number of requests from the request -+ * pool, before the busy weight-raised queues get served -+ * again. This often mitigates starvation problems in the -+ * presence of heavy write workloads and NCQ, thereby -+ * guaranteeing a higher application and system responsiveness -+ * in these hostile scenarios. -+ */ -+ idling_boosts_thr_without_issues = idling_boosts_thr && -+ bfqd->wr_busy_queues == 0; -+ -+ /* -+ * There is then a case where idling must be performed not -+ * for throughput concerns, but to preserve service -+ * guarantees. -+ * -+ * To introduce this case, we can note that allowing the drive -+ * to enqueue more than one request at a time, and hence -+ * delegating de facto final scheduling decisions to the -+ * drive's internal scheduler, entails loss of control on the -+ * actual request service order. In particular, the critical -+ * situation is when requests from different processes happen -+ * to be present, at the same time, in the internal queue(s) -+ * of the drive. 
In such a situation, the drive, by deciding -+ * the service order of the internally-queued requests, does -+ * determine also the actual throughput distribution among -+ * these processes. But the drive typically has no notion or -+ * concern about per-process throughput distribution, and -+ * makes its decisions only on a per-request basis. Therefore, -+ * the service distribution enforced by the drive's internal -+ * scheduler is likely to coincide with the desired -+ * device-throughput distribution only in a completely -+ * symmetric scenario where: -+ * (i) each of these processes must get the same throughput as -+ * the others; -+ * (ii) all these processes have the same I/O pattern -+ * (either sequential or random). -+ * In fact, in such a scenario, the drive will tend to treat -+ * the requests of each of these processes in about the same -+ * way as the requests of the others, and thus to provide -+ * each of these processes with about the same throughput -+ * (which is exactly the desired throughput distribution). In -+ * contrast, in any asymmetric scenario, device idling is -+ * certainly needed to guarantee that bfqq receives its -+ * assigned fraction of the device throughput (see [1] for -+ * details). -+ * -+ * We address this issue by controlling, actually, only the -+ * symmetry sub-condition (i), i.e., provided that -+ * sub-condition (i) holds, idling is not performed, -+ * regardless of whether sub-condition (ii) holds. In other -+ * words, only if sub-condition (i) holds, then idling is -+ * allowed, and the device tends to be prevented from queueing -+ * many requests, possibly of several processes. The reason -+ * for not controlling also sub-condition (ii) is that we -+ * exploit preemption to preserve guarantees in case of -+ * symmetric scenarios, even if (ii) does not hold, as -+ * explained in the next two paragraphs. 
-+ * -+ * Even if a queue, say Q, is expired when it remains idle, Q -+ * can still preempt the new in-service queue if the next -+ * request of Q arrives soon (see the comments on -+ * bfq_bfqq_update_budg_for_activation). If all queues and -+ * groups have the same weight, this form of preemption, -+ * combined with the hole-recovery heuristic described in the -+ * comments on function bfq_bfqq_update_budg_for_activation, -+ * are enough to preserve a correct bandwidth distribution in -+ * the mid term, even without idling. In fact, even if not -+ * idling allows the internal queues of the device to contain -+ * many requests, and thus to reorder requests, we can rather -+ * safely assume that the internal scheduler still preserves a -+ * minimum of mid-term fairness. The motivation for using -+ * preemption instead of idling is that, by not idling, -+ * service guarantees are preserved without minimally -+ * sacrificing throughput. In other words, both a high -+ * throughput and its desired distribution are obtained. -+ * -+ * More precisely, this preemption-based, idleless approach -+ * provides fairness in terms of IOPS, and not sectors per -+ * second. This can be seen with a simple example. Suppose -+ * that there are two queues with the same weight, but that -+ * the first queue receives requests of 8 sectors, while the -+ * second queue receives requests of 1024 sectors. In -+ * addition, suppose that each of the two queues contains at -+ * most one request at a time, which implies that each queue -+ * always remains idle after it is served. Finally, after -+ * remaining idle, each queue receives very quickly a new -+ * request. It follows that the two queues are served -+ * alternatively, preempting each other if needed. This -+ * implies that, although both queues have the same weight, -+ * the queue with large requests receives a service that is -+ * 1024/8 times as high as the service received by the other -+ * queue. 
-+ * -+ * On the other hand, device idling is performed, and thus -+ * pure sector-domain guarantees are provided, for the -+ * following queues, which are likely to need stronger -+ * throughput guarantees: weight-raised queues, and queues -+ * with a higher weight than other queues. When such queues -+ * are active, sub-condition (i) is false, which triggers -+ * device idling. -+ * -+ * According to the above considerations, the next variable is -+ * true (only) if sub-condition (i) holds. To compute the -+ * value of this variable, we not only use the return value of -+ * the function bfq_symmetric_scenario(), but also check -+ * whether bfqq is being weight-raised, because -+ * bfq_symmetric_scenario() does not take into account also -+ * weight-raised queues (see comments on -+ * bfq_weights_tree_add()). -+ * -+ * As a side note, it is worth considering that the above -+ * device-idling countermeasures may however fail in the -+ * following unlucky scenario: if idling is (correctly) -+ * disabled in a time period during which all symmetry -+ * sub-conditions hold, and hence the device is allowed to -+ * enqueue many requests, but at some later point in time some -+ * sub-condition stops to hold, then it may become impossible -+ * to let requests be served in the desired order until all -+ * the requests already queued in the device have been served. -+ */ -+ asymmetric_scenario = bfqq->wr_coeff > 1 || -+ !bfq_symmetric_scenario(bfqd); -+ -+ /* -+ * Finally, there is a case where maximizing throughput is the -+ * best choice even if it may cause unfairness toward -+ * bfqq. Such a case is when bfqq became active in a burst of -+ * queue activations. Queues that became active during a large -+ * burst benefit only from throughput, as discussed in the -+ * comments on bfq_handle_burst. 
Thus, if bfqq became active -+ * in a burst and not idling the device maximizes throughput, -+ * then the device must no be idled, because not idling the -+ * device provides bfqq and all other queues in the burst with -+ * maximum benefit. Combining this and the above case, we can -+ * now establish when idling is actually needed to preserve -+ * service guarantees. -+ */ -+ idling_needed_for_service_guarantees = -+ asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq); -+ -+ /* -+ * We have now all the components we need to compute the return -+ * value of the function, which is true only if both the following -+ * conditions hold: -+ * 1) bfqq is sync, because idling make sense only for sync queues; -+ * 2) idling either boosts the throughput (without issues), or -+ * is necessary to preserve service guarantees. -+ */ -+ bfq_log_bfqq(bfqd, bfqq, "may_idle: sync %d idling_boosts_thr %d", -+ bfq_bfqq_sync(bfqq), idling_boosts_thr); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "may_idle: wr_busy %d boosts %d IO-bound %d guar %d", -+ bfqd->wr_busy_queues, -+ idling_boosts_thr_without_issues, -+ bfq_bfqq_IO_bound(bfqq), -+ idling_needed_for_service_guarantees); -+ -+ return bfq_bfqq_sync(bfqq) && -+ (idling_boosts_thr_without_issues || -+ idling_needed_for_service_guarantees); -+} -+ -+/* -+ * If the in-service queue is empty but the function bfq_bfqq_may_idle -+ * returns true, then: -+ * 1) the queue must remain in service and cannot be expired, and -+ * 2) the device must be idled to wait for the possible arrival of a new -+ * request for the queue. -+ * See the comments on the function bfq_bfqq_may_idle for the reasons -+ * why performing device idling is the best choice to boost the throughput -+ * and preserve service guarantees when bfq_bfqq_may_idle itself -+ * returns true. 
-+ */ -+static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) -+{ -+ struct bfq_data *bfqd = bfqq->bfqd; -+ -+ return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 && -+ bfq_bfqq_may_idle(bfqq); -+} -+ -+/* -+ * Select a queue for service. If we have a current queue in service, -+ * check whether to continue servicing it, or retrieve and set a new one. -+ */ -+static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq; -+ struct request *next_rq; -+ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT; -+ -+ bfqq = bfqd->in_service_queue; -+ if (!bfqq) -+ goto new_queue; -+ -+ bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); -+ -+ if (bfq_may_expire_for_budg_timeout(bfqq) && -+ !hrtimer_active(&bfqd->idle_slice_timer) && -+ !bfq_bfqq_must_idle(bfqq)) -+ goto expire; -+ -+check_queue: -+ /* -+ * This loop is rarely executed more than once. Even when it -+ * happens, it is much more convenient to re-execute this loop -+ * than to return NULL and trigger a new dispatch to get a -+ * request served. -+ */ -+ next_rq = bfqq->next_rq; -+ /* -+ * If bfqq has requests queued and it has enough budget left to -+ * serve them, keep the queue, otherwise expire it. -+ */ -+ if (next_rq) { -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ -+ if (bfq_serv_to_charge(next_rq, bfqq) > -+ bfq_bfqq_budget_left(bfqq)) { -+ /* -+ * Expire the queue for budget exhaustion, -+ * which makes sure that the next budget is -+ * enough to serve the next request, even if -+ * it comes from the fifo expired path. -+ */ -+ reason = BFQ_BFQQ_BUDGET_EXHAUSTED; -+ goto expire; -+ } else { -+ /* -+ * The idle timer may be pending because we may -+ * not disable disk idling even when a new request -+ * arrives. 
-+ */ -+ if (bfq_bfqq_wait_request(bfqq)) { -+ BUG_ON(!hrtimer_active(&bfqd->idle_slice_timer)); -+ /* -+ * If we get here: 1) at least a new request -+ * has arrived but we have not disabled the -+ * timer because the request was too small, -+ * 2) then the block layer has unplugged -+ * the device, causing the dispatch to be -+ * invoked. -+ * -+ * Since the device is unplugged, now the -+ * requests are probably large enough to -+ * provide a reasonable throughput. -+ * So we disable idling. -+ */ -+ bfq_clear_bfqq_wait_request(bfqq); -+ hrtimer_try_to_cancel(&bfqd->idle_slice_timer); -+ bfqg_stats_update_idle_time(bfqq_group(bfqq)); -+ } -+ goto keep_queue; -+ } -+ } -+ -+ /* -+ * No requests pending. However, if the in-service queue is idling -+ * for a new request, or has requests waiting for a completion and -+ * may idle after their completion, then keep it anyway. -+ */ -+ if (hrtimer_active(&bfqd->idle_slice_timer) || -+ (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) { -+ bfqq = NULL; -+ goto keep_queue; -+ } -+ -+ reason = BFQ_BFQQ_NO_MORE_REQUESTS; -+expire: -+ bfq_bfqq_expire(bfqd, bfqq, false, reason); -+new_queue: -+ bfqq = bfq_set_in_service_queue(bfqd); -+ if (bfqq) { -+ bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue"); -+ goto check_queue; -+ } -+keep_queue: -+ if (bfqq) -+ bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue"); -+ else -+ bfq_log(bfqd, "select_queue: no queue returned"); -+ -+ return bfqq; -+} -+ -+static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */ -+ BUG_ON(bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && -+ time_is_after_jiffies(bfqq->last_wr_start_finish)); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "raising period dur %u/%u msec, old coeff %u, w %d(%d)", -+ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), -+ jiffies_to_msecs(bfqq->wr_cur_max_time), -+ 
bfqq->wr_coeff, -+ bfqq->entity.weight, bfqq->entity.orig_weight); -+ -+ BUG_ON(bfqq != bfqd->in_service_queue && entity->weight != -+ entity->orig_weight * bfqq->wr_coeff); -+ if (entity->prio_changed) -+ bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change"); -+ -+ /* -+ * If the queue was activated in a burst, or too much -+ * time has elapsed from the beginning of this -+ * weight-raising period, then end weight raising. -+ */ -+ if (bfq_bfqq_in_large_burst(bfqq)) -+ bfq_bfqq_end_wr(bfqq); -+ else if (time_is_before_jiffies(bfqq->last_wr_start_finish + -+ bfqq->wr_cur_max_time)) { -+ if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time || -+ time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt + -+ bfq_wr_duration(bfqd))) -+ bfq_bfqq_end_wr(bfqq); -+ else { -+ /* switch back to interactive wr */ -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ bfqq->last_wr_start_finish = -+ bfqq->wr_start_at_switch_to_srt; -+ BUG_ON(time_is_after_jiffies( -+ bfqq->last_wr_start_finish)); -+ bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqd, bfqq, -+ "back to interactive wr"); -+ } -+ } -+ } -+ /* -+ * To improve latency (for this or other queues), immediately -+ * update weight both if it must be raised and if it must be -+ * lowered. Since, entity may be on some active tree here, and -+ * might have a pending change of its ioprio class, invoke -+ * next function with the last parameter unset (see the -+ * comments on the function). -+ */ -+ if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1)) -+ __bfq_entity_update_weight_prio(bfq_entity_service_tree(entity), -+ entity, false); -+} -+ -+/* -+ * Dispatch one request from bfqq, moving it to the request queue -+ * dispatch list. 
-+ */ -+static int bfq_dispatch_request(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ int dispatched = 0; -+ struct request *rq = bfqq->next_rq; -+ unsigned long service_to_charge; -+ -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ BUG_ON(!rq); -+ service_to_charge = bfq_serv_to_charge(rq, bfqq); -+ -+ BUG_ON(service_to_charge > bfq_bfqq_budget_left(bfqq)); -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ -+ bfq_bfqq_served(bfqq, service_to_charge); -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ -+ bfq_dispatch_insert(bfqd->queue, rq); -+ -+ /* -+ * If weight raising has to terminate for bfqq, then next -+ * function causes an immediate update of bfqq's weight, -+ * without waiting for next activation. As a consequence, on -+ * expiration, bfqq will be timestamped as if has never been -+ * weight-raised during this service slot, even if it has -+ * received part or even most of the service as a -+ * weight-raised queue. This inflates bfqq's timestamps, which -+ * is beneficial, as bfqq is then more willing to leave the -+ * device immediately to possible other weight-raised queues. 
-+ */ -+ bfq_update_wr_data(bfqd, bfqq); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "dispatched %u sec req (%llu), budg left %d", -+ blk_rq_sectors(rq), -+ (unsigned long long) blk_rq_pos(rq), -+ bfq_bfqq_budget_left(bfqq)); -+ -+ dispatched++; -+ -+ if (!bfqd->in_service_bic) { -+ atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); -+ bfqd->in_service_bic = RQ_BIC(rq); -+ } -+ -+ if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq)) -+ goto expire; -+ -+ return dispatched; -+ -+expire: -+ bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_BUDGET_EXHAUSTED); -+ return dispatched; -+} -+ -+static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq) -+{ -+ int dispatched = 0; -+ -+ while (bfqq->next_rq) { -+ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq); -+ dispatched++; -+ } -+ -+ BUG_ON(!list_empty(&bfqq->fifo)); -+ return dispatched; -+} -+ -+/* -+ * Drain our current requests. -+ * Used for barriers and when switching io schedulers on-the-fly. -+ */ -+static int bfq_forced_dispatch(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq, *n; -+ struct bfq_service_tree *st; -+ int dispatched = 0; -+ -+ bfqq = bfqd->in_service_queue; -+ if (bfqq) -+ __bfq_bfqq_expire(bfqd, bfqq); -+ -+ /* -+ * Loop through classes, and be careful to leave the scheduler -+ * in a consistent state, as feedback mechanisms and vtime -+ * updates cannot be disabled during the process. 
-+ */ -+ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) { -+ st = bfq_entity_service_tree(&bfqq->entity); -+ -+ dispatched += __bfq_forced_dispatch_bfqq(bfqq); -+ -+ bfqq->max_budget = bfq_max_budget(bfqd); -+ bfq_forget_idle(st); -+ } -+ -+ BUG_ON(bfqd->busy_queues != 0); -+ -+ return dispatched; -+} -+ -+static int bfq_dispatch_requests(struct request_queue *q, int force) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct bfq_queue *bfqq; -+ -+ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); -+ -+ if (bfqd->busy_queues == 0) -+ return 0; -+ -+ if (unlikely(force)) -+ return bfq_forced_dispatch(bfqd); -+ -+ /* -+ * Force device to serve one request at a time if -+ * strict_guarantees is true. Forcing this service scheme is -+ * currently the ONLY way to guarantee that the request -+ * service order enforced by the scheduler is respected by a -+ * queueing device. Otherwise the device is free even to make -+ * some unlucky request wait for as long as the device -+ * wishes. -+ * -+ * Of course, serving one request at at time may cause loss of -+ * throughput. -+ */ -+ if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0) -+ return 0; -+ -+ bfqq = bfq_select_queue(bfqd); -+ if (!bfqq) -+ return 0; -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ -+ BUG_ON(bfq_bfqq_wait_request(bfqq)); -+ -+ if (!bfq_dispatch_request(bfqd, bfqq)) -+ return 0; -+ -+ bfq_log_bfqq(bfqd, bfqq, "dispatched %s request", -+ bfq_bfqq_sync(bfqq) ? "sync" : "async"); -+ -+ BUG_ON(bfqq->next_rq == NULL && -+ bfqq->entity.budget < bfqq->entity.service); -+ return 1; -+} -+ -+/* -+ * Task holds one reference to the queue, dropped when task exits. Each rq -+ * in-flight on this queue also holds a reference, dropped when rq is freed. -+ * -+ * Queue lock must be held here. Recall not to use bfqq after calling -+ * this function on it. 
-+ */ -+static void bfq_put_queue(struct bfq_queue *bfqq) -+{ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ struct bfq_group *bfqg = bfqq_group(bfqq); -+#endif -+ -+ BUG_ON(bfqq->ref <= 0); -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref); -+ bfqq->ref--; -+ if (bfqq->ref) -+ return; -+ -+ BUG_ON(rb_first(&bfqq->sort_list)); -+ BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); -+ BUG_ON(bfqq->entity.tree); -+ BUG_ON(bfq_bfqq_busy(bfqq)); -+ -+ if (bfq_bfqq_sync(bfqq)) -+ /* -+ * The fact that this queue is being destroyed does not -+ * invalidate the fact that this queue may have been -+ * activated during the current burst. As a consequence, -+ * although the queue does not exist anymore, and hence -+ * needs to be removed from the burst list if there, -+ * the burst size has not to be decremented. -+ */ -+ hlist_del_init(&bfqq->burst_list_node); -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); -+ -+ kmem_cache_free(bfq_pool, bfqq); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ bfqg_put(bfqg); -+#endif -+} -+ -+static void bfq_put_cooperator(struct bfq_queue *bfqq) -+{ -+ struct bfq_queue *__bfqq, *next; -+ -+ /* -+ * If this queue was scheduled to merge with another queue, be -+ * sure to drop the reference taken on that queue (and others in -+ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs. 
-+ */ -+ __bfqq = bfqq->new_bfqq; -+ while (__bfqq) { -+ if (__bfqq == bfqq) -+ break; -+ next = __bfqq->new_bfqq; -+ bfq_put_queue(__bfqq); -+ __bfqq = next; -+ } -+} -+ -+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ if (bfqq == bfqd->in_service_queue) { -+ __bfq_bfqq_expire(bfqd, bfqq); -+ bfq_schedule_dispatch(bfqd); -+ } -+ -+ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref); -+ -+ bfq_put_cooperator(bfqq); -+ -+ bfq_put_queue(bfqq); /* release process reference */ -+} -+ -+static void bfq_init_icq(struct io_cq *icq) -+{ -+ icq_to_bic(icq)->ttime.last_end_request = ktime_get_ns() - (1ULL<<32); -+} -+ -+static void bfq_exit_icq(struct io_cq *icq) -+{ -+ struct bfq_io_cq *bic = icq_to_bic(icq); -+ struct bfq_data *bfqd = bic_to_bfqd(bic); -+ -+ if (bic_to_bfqq(bic, false)) { -+ bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, false)); -+ bic_set_bfqq(bic, NULL, false); -+ } -+ -+ if (bic_to_bfqq(bic, true)) { -+ /* -+ * If the bic is using a shared queue, put the reference -+ * taken on the io_context when the bic started using a -+ * shared bfq_queue. -+ */ -+ if (bfq_bfqq_coop(bic_to_bfqq(bic, true))) -+ put_io_context(icq->ioc); -+ bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, true)); -+ bic_set_bfqq(bic, NULL, true); -+ } -+} -+ -+/* -+ * Update the entity prio values; note that the new values will not -+ * be used until the next (re)activation. -+ */ -+static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, -+ struct bfq_io_cq *bic) -+{ -+ struct task_struct *tsk = current; -+ int ioprio_class; -+ -+ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); -+ switch (ioprio_class) { -+ default: -+ dev_err(bfqq->bfqd->queue->backing_dev_info->dev, -+ "bfq: bad prio class %d\n", ioprio_class); -+ case IOPRIO_CLASS_NONE: -+ /* -+ * No prio set, inherit CPU scheduling settings. 
-+ */ -+ bfqq->new_ioprio = task_nice_ioprio(tsk); -+ bfqq->new_ioprio_class = task_nice_ioclass(tsk); -+ break; -+ case IOPRIO_CLASS_RT: -+ bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); -+ bfqq->new_ioprio_class = IOPRIO_CLASS_RT; -+ break; -+ case IOPRIO_CLASS_BE: -+ bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); -+ bfqq->new_ioprio_class = IOPRIO_CLASS_BE; -+ break; -+ case IOPRIO_CLASS_IDLE: -+ bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE; -+ bfqq->new_ioprio = 7; -+ bfq_clear_bfqq_idle_window(bfqq); -+ break; -+ } -+ -+ if (bfqq->new_ioprio >= IOPRIO_BE_NR) { -+ pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n", -+ bfqq->new_ioprio); -+ BUG(); -+ } -+ -+ bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); -+ bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "set_next_ioprio_data: bic_class %d prio %d class %d", -+ ioprio_class, bfqq->new_ioprio, bfqq->new_ioprio_class); -+} -+ -+static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) -+{ -+ struct bfq_data *bfqd = bic_to_bfqd(bic); -+ struct bfq_queue *bfqq; -+ unsigned long uninitialized_var(flags); -+ int ioprio = bic->icq.ioc->ioprio; -+ -+ /* -+ * This condition may trigger on a newly created bic, be sure to -+ * drop the lock before returning. 
-+ */ -+ if (unlikely(!bfqd) || likely(bic->ioprio == ioprio)) -+ return; -+ -+ bic->ioprio = ioprio; -+ -+ bfqq = bic_to_bfqq(bic, false); -+ if (bfqq) { -+ /* release process reference on this queue */ -+ bfq_put_queue(bfqq); -+ bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); -+ bic_set_bfqq(bic, bfqq, false); -+ bfq_log_bfqq(bfqd, bfqq, -+ "check_ioprio_change: bfqq %p %d", -+ bfqq, bfqq->ref); -+ } -+ -+ bfqq = bic_to_bfqq(bic, true); -+ if (bfqq) -+ bfq_set_next_ioprio_data(bfqq, bic); -+} -+ -+static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct bfq_io_cq *bic, pid_t pid, int is_sync) -+{ -+ RB_CLEAR_NODE(&bfqq->entity.rb_node); -+ INIT_LIST_HEAD(&bfqq->fifo); -+ INIT_HLIST_NODE(&bfqq->burst_list_node); -+ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); -+ -+ bfqq->ref = 0; -+ bfqq->bfqd = bfqd; -+ -+ if (bic) -+ bfq_set_next_ioprio_data(bfqq, bic); -+ -+ if (is_sync) { -+ if (!bfq_class_idle(bfqq)) -+ bfq_mark_bfqq_idle_window(bfqq); -+ bfq_mark_bfqq_sync(bfqq); -+ bfq_mark_bfqq_just_created(bfqq); -+ } else -+ bfq_clear_bfqq_sync(bfqq); -+ bfq_mark_bfqq_IO_bound(bfqq); -+ -+ /* Tentative initial value to trade off between thr and lat */ -+ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3; -+ bfqq->pid = pid; -+ -+ bfqq->wr_coeff = 1; -+ bfqq->last_wr_start_finish = jiffies; -+ bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now(); -+ bfqq->budget_timeout = bfq_smallest_from_now(); -+ bfqq->split_time = bfq_smallest_from_now(); -+ -+ /* -+ * Set to the value for which bfqq will not be deemed as -+ * soft rt when it becomes backlogged. 
-+ */ -+ bfqq->soft_rt_next_start = bfq_greatest_from_now(); -+ -+ /* first request is almost certainly seeky */ -+ bfqq->seek_history = 1; -+} -+ -+static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, -+ struct bfq_group *bfqg, -+ int ioprio_class, int ioprio) -+{ -+ switch (ioprio_class) { -+ case IOPRIO_CLASS_RT: -+ return &bfqg->async_bfqq[0][ioprio]; -+ case IOPRIO_CLASS_NONE: -+ ioprio = IOPRIO_NORM; -+ /* fall through */ -+ case IOPRIO_CLASS_BE: -+ return &bfqg->async_bfqq[1][ioprio]; -+ case IOPRIO_CLASS_IDLE: -+ return &bfqg->async_idle_bfqq; -+ default: -+ BUG(); -+ } -+} -+ -+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, -+ struct bio *bio, bool is_sync, -+ struct bfq_io_cq *bic) -+{ -+ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); -+ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); -+ struct bfq_queue **async_bfqq = NULL; -+ struct bfq_queue *bfqq; -+ struct bfq_group *bfqg; -+ -+ rcu_read_lock(); -+ -+ bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio)); -+ if (!bfqg) { -+ bfqq = &bfqd->oom_bfqq; -+ goto out; -+ } -+ -+ if (!is_sync) { -+ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, -+ ioprio); -+ bfqq = *async_bfqq; -+ if (bfqq) -+ goto out; -+ } -+ -+ bfqq = kmem_cache_alloc_node(bfq_pool, -+ GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN, -+ bfqd->queue->node); -+ -+ if (bfqq) { -+ bfq_init_bfqq(bfqd, bfqq, bic, current->pid, -+ is_sync); -+ bfq_init_entity(&bfqq->entity, bfqg); -+ bfq_log_bfqq(bfqd, bfqq, "allocated"); -+ } else { -+ bfqq = &bfqd->oom_bfqq; -+ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); -+ goto out; -+ } -+ -+ /* -+ * Pin the queue now that it's allocated, scheduler exit will -+ * prune it. -+ */ -+ if (async_bfqq) { -+ bfqq->ref++; /* -+ * Extra group reference, w.r.t. sync -+ * queue. This extra reference is removed -+ * only if bfqq->bfqg disappears, to -+ * guarantee that this queue is not freed -+ * until its group goes away. 
-+ */ -+ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", -+ bfqq, bfqq->ref); -+ *async_bfqq = bfqq; -+ } -+ -+out: -+ bfqq->ref++; /* get a process reference to this queue */ -+ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref); -+ rcu_read_unlock(); -+ return bfqq; -+} -+ -+static void bfq_update_io_thinktime(struct bfq_data *bfqd, -+ struct bfq_io_cq *bic) -+{ -+ struct bfq_ttime *ttime = &bic->ttime; -+ u64 elapsed = ktime_get_ns() - bic->ttime.last_end_request; -+ -+ elapsed = min_t(u64, elapsed, 2 * bfqd->bfq_slice_idle); -+ -+ ttime->ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; -+ ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8); -+ ttime->ttime_mean = div64_ul(ttime->ttime_total + 128, -+ ttime->ttime_samples); -+} -+ -+static void -+bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct request *rq) -+{ -+ bfqq->seek_history <<= 1; -+ bfqq->seek_history |= -+ get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR && -+ (!blk_queue_nonrot(bfqd->queue) || -+ blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT); -+} -+ -+/* -+ * Disable idle window if the process thinks too long or seeks so much that -+ * it doesn't matter. -+ */ -+static void bfq_update_idle_window(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ struct bfq_io_cq *bic) -+{ -+ int enable_idle; -+ -+ /* Don't idle for async or idle io prio class. */ -+ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) -+ return; -+ -+ /* Idle window just restored, statistics are meaningless. 
*/ -+ if (time_is_after_eq_jiffies(bfqq->split_time + -+ bfqd->bfq_wr_min_idle_time)) -+ return; -+ -+ enable_idle = bfq_bfqq_idle_window(bfqq); -+ -+ if (atomic_read(&bic->icq.ioc->active_ref) == 0 || -+ bfqd->bfq_slice_idle == 0 || -+ (bfqd->hw_tag && BFQQ_SEEKY(bfqq) && -+ bfqq->wr_coeff == 1)) -+ enable_idle = 0; -+ else if (bfq_sample_valid(bic->ttime.ttime_samples)) { -+ if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle && -+ bfqq->wr_coeff == 1) -+ enable_idle = 0; -+ else -+ enable_idle = 1; -+ } -+ bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d", -+ enable_idle); -+ -+ if (enable_idle) -+ bfq_mark_bfqq_idle_window(bfqq); -+ else -+ bfq_clear_bfqq_idle_window(bfqq); -+} -+ -+/* -+ * Called when a new fs request (rq) is added to bfqq. Check if there's -+ * something we should do about it. -+ */ -+static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct request *rq) -+{ -+ struct bfq_io_cq *bic = RQ_BIC(rq); -+ -+ if (rq->cmd_flags & REQ_META) -+ bfqq->meta_pending++; -+ -+ bfq_update_io_thinktime(bfqd, bic); -+ bfq_update_io_seektime(bfqd, bfqq, rq); -+ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || -+ !BFQQ_SEEKY(bfqq)) -+ bfq_update_idle_window(bfqd, bfqq, bic); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "rq_enqueued: idle_window=%d (seeky %d)", -+ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq)); -+ -+ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); -+ -+ if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) { -+ bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 && -+ blk_rq_sectors(rq) < 32; -+ bool budget_timeout = bfq_bfqq_budget_timeout(bfqq); -+ -+ /* -+ * There is just this request queued: if the request -+ * is small and the queue is not to be expired, then -+ * just exit. -+ * -+ * In this way, if the device is being idled to wait -+ * for a new request from the in-service queue, we -+ * avoid unplugging the device and committing the -+ * device to serve just a small request. 
On the -+ * contrary, we wait for the block layer to decide -+ * when to unplug the device: hopefully, new requests -+ * will be merged to this one quickly, then the device -+ * will be unplugged and larger requests will be -+ * dispatched. -+ */ -+ if (small_req && !budget_timeout) -+ return; -+ -+ /* -+ * A large enough request arrived, or the queue is to -+ * be expired: in both cases disk idling is to be -+ * stopped, so clear wait_request flag and reset -+ * timer. -+ */ -+ bfq_clear_bfqq_wait_request(bfqq); -+ hrtimer_try_to_cancel(&bfqd->idle_slice_timer); -+ bfqg_stats_update_idle_time(bfqq_group(bfqq)); -+ -+ /* -+ * The queue is not empty, because a new request just -+ * arrived. Hence we can safely expire the queue, in -+ * case of budget timeout, without risking that the -+ * timestamps of the queue are not updated correctly. -+ * See [1] for more details. -+ */ -+ if (budget_timeout) -+ bfq_bfqq_expire(bfqd, bfqq, false, -+ BFQ_BFQQ_BUDGET_TIMEOUT); -+ -+ /* -+ * Let the request rip immediately, or let a new queue be -+ * selected if bfqq has just been expired. -+ */ -+ __blk_run_queue(bfqd->queue); -+ } -+} -+ -+static void bfq_insert_request(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; -+ -+ assert_spin_locked(bfqd->queue->queue_lock); -+ -+ /* -+ * An unplug may trigger a requeue of a request from the device -+ * driver: make sure we are in process context while trying to -+ * merge two bfq_queues. -+ */ -+ if (!in_interrupt()) { -+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true); -+ if (new_bfqq) { -+ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq) -+ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1); -+ /* -+ * Release the request's reference to the old bfqq -+ * and make sure one is taken to the shared queue. 
-+ */ -+ new_bfqq->allocated[rq_data_dir(rq)]++; -+ bfqq->allocated[rq_data_dir(rq)]--; -+ new_bfqq->ref++; -+ bfq_clear_bfqq_just_created(bfqq); -+ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) -+ bfq_merge_bfqqs(bfqd, RQ_BIC(rq), -+ bfqq, new_bfqq); -+ /* -+ * rq is about to be enqueued into new_bfqq, -+ * release rq reference on bfqq -+ */ -+ bfq_put_queue(bfqq); -+ rq->elv.priv[1] = new_bfqq; -+ bfqq = new_bfqq; -+ } -+ } -+ -+ bfq_add_request(rq); -+ -+ rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; -+ list_add_tail(&rq->queuelist, &bfqq->fifo); -+ -+ bfq_rq_enqueued(bfqd, bfqq, rq); -+} -+ -+static void bfq_update_hw_tag(struct bfq_data *bfqd) -+{ -+ bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver, -+ bfqd->rq_in_driver); -+ -+ if (bfqd->hw_tag == 1) -+ return; -+ -+ /* -+ * This sample is valid if the number of outstanding requests -+ * is large enough to allow a queueing behavior. Note that the -+ * sum is not exact, as it's not taking into account deactivated -+ * requests. 
-+ */ -+ if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD) -+ return; -+ -+ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) -+ return; -+ -+ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD; -+ bfqd->max_rq_in_driver = 0; -+ bfqd->hw_tag_samples = 0; -+} -+ -+static void bfq_completed_request(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ struct bfq_data *bfqd = bfqq->bfqd; -+ u64 now_ns; -+ u32 delta_us; -+ -+ bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left", -+ blk_rq_sectors(rq)); -+ -+ assert_spin_locked(bfqd->queue->queue_lock); -+ bfq_update_hw_tag(bfqd); -+ -+ BUG_ON(!bfqd->rq_in_driver); -+ BUG_ON(!bfqq->dispatched); -+ bfqd->rq_in_driver--; -+ bfqq->dispatched--; -+ bfqg_stats_update_completion(bfqq_group(bfqq), -+ rq_start_time_ns(rq), -+ rq_io_start_time_ns(rq), -+ rq->cmd_flags); -+ -+ if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); -+ /* -+ * Set budget_timeout (which we overload to store the -+ * time at which the queue remains with no backlog and -+ * no outstanding request; used by the weight-raising -+ * mechanism). -+ */ -+ bfqq->budget_timeout = jiffies; -+ -+ bfq_weights_tree_remove(bfqd, &bfqq->entity, -+ &bfqd->queue_weights_tree); -+ } -+ -+ now_ns = ktime_get_ns(); -+ -+ RQ_BIC(rq)->ttime.last_end_request = now_ns; -+ -+ /* -+ * Using us instead of ns, to get a reasonable precision in -+ * computing rate in next check. 
-+ */ -+ delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC); -+ -+ bfq_log(bfqd, "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu", -+ delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size, -+ (USEC_PER_SEC* -+ (u64)((bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us)) -+ >>BFQ_RATE_SHIFT, -+ (USEC_PER_SEC*(u64)(1UL<<(BFQ_RATE_SHIFT-10)))>>BFQ_RATE_SHIFT); -+ -+ /* -+ * If the request took rather long to complete, and, according -+ * to the maximum request size recorded, this completion latency -+ * implies that the request was certainly served at a very low -+ * rate (less than 1M sectors/sec), then the whole observation -+ * interval that lasts up to this time instant cannot be a -+ * valid time interval for computing a new peak rate. Invoke -+ * bfq_update_rate_reset to have the following three steps -+ * taken: -+ * - close the observation interval at the last (previous) -+ * request dispatch or completion -+ * - compute rate, if possible, for that observation interval -+ * - reset to zero samples, which will trigger a proper -+ * re-initialization of the observation interval on next -+ * dispatch -+ */ -+ if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC && -+ (bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us < -+ 1UL<<(BFQ_RATE_SHIFT - 10)) -+ bfq_update_rate_reset(bfqd, NULL); -+ bfqd->last_completion = now_ns; -+ -+ /* -+ * If we are waiting to discover whether the request pattern -+ * of the task associated with the queue is actually -+ * isochronous, and both requisites for this condition to hold -+ * are now satisfied, then compute soft_rt_next_start (see the -+ * comments on the function bfq_bfqq_softrt_next_start()). We -+ * schedule this delayed check when bfqq expires, if it still -+ * has in-flight requests. 
-+ */ -+ if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 && -+ RB_EMPTY_ROOT(&bfqq->sort_list)) -+ bfqq->soft_rt_next_start = -+ bfq_bfqq_softrt_next_start(bfqd, bfqq); -+ -+ /* -+ * If this is the in-service queue, check if it needs to be expired, -+ * or if we want to idle in case it has no pending requests. -+ */ -+ if (bfqd->in_service_queue == bfqq) { -+ if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) { -+ bfq_arm_slice_timer(bfqd); -+ goto out; -+ } else if (bfq_may_expire_for_budg_timeout(bfqq)) -+ bfq_bfqq_expire(bfqd, bfqq, false, -+ BFQ_BFQQ_BUDGET_TIMEOUT); -+ else if (RB_EMPTY_ROOT(&bfqq->sort_list) && -+ (bfqq->dispatched == 0 || -+ !bfq_bfqq_may_idle(bfqq))) -+ bfq_bfqq_expire(bfqd, bfqq, false, -+ BFQ_BFQQ_NO_MORE_REQUESTS); -+ } -+ -+ if (!bfqd->rq_in_driver) -+ bfq_schedule_dispatch(bfqd); -+ -+out: -+ return; -+} -+ -+static int __bfq_may_queue(struct bfq_queue *bfqq) -+{ -+ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) { -+ bfq_clear_bfqq_must_alloc(bfqq); -+ return ELV_MQUEUE_MUST; -+ } -+ -+ return ELV_MQUEUE_MAY; -+} -+ -+static int bfq_may_queue(struct request_queue *q, unsigned int op) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct task_struct *tsk = current; -+ struct bfq_io_cq *bic; -+ struct bfq_queue *bfqq; -+ -+ /* -+ * Don't force setup of a queue from here, as a call to may_queue -+ * does not necessarily imply that a request actually will be -+ * queued. So just lookup a possibly existing queue, or return -+ * 'may queue' if that fails. -+ */ -+ bic = bfq_bic_lookup(bfqd, tsk->io_context); -+ if (!bic) -+ return ELV_MQUEUE_MAY; -+ -+ bfqq = bic_to_bfqq(bic, op_is_sync(op)); -+ if (bfqq) -+ return __bfq_may_queue(bfqq); -+ -+ return ELV_MQUEUE_MAY; -+} -+ -+/* -+ * Queue lock held here. 
-+ */ -+static void bfq_put_request(struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ -+ if (bfqq) { -+ const int rw = rq_data_dir(rq); -+ -+ BUG_ON(!bfqq->allocated[rw]); -+ bfqq->allocated[rw]--; -+ -+ rq->elv.priv[0] = NULL; -+ rq->elv.priv[1] = NULL; -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", -+ bfqq, bfqq->ref); -+ bfq_put_queue(bfqq); -+ } -+} -+ -+/* -+ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this -+ * was the last process referring to that bfqq. -+ */ -+static struct bfq_queue * -+bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) -+{ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); -+ -+ put_io_context(bic->icq.ioc); -+ -+ if (bfqq_process_refs(bfqq) == 1) { -+ bfqq->pid = current->pid; -+ bfq_clear_bfqq_coop(bfqq); -+ bfq_clear_bfqq_split_coop(bfqq); -+ return bfqq; -+ } -+ -+ bic_set_bfqq(bic, NULL, 1); -+ -+ bfq_put_cooperator(bfqq); -+ -+ bfq_put_queue(bfqq); -+ return NULL; -+} -+ -+/* -+ * Allocate bfq data structures associated with this request. 
-+ */ -+static int bfq_set_request(struct request_queue *q, struct request *rq, -+ struct bio *bio, gfp_t gfp_mask) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq); -+ const int rw = rq_data_dir(rq); -+ const int is_sync = rq_is_sync(rq); -+ struct bfq_queue *bfqq; -+ unsigned long flags; -+ bool bfqq_already_existing = false, split = false; -+ -+ spin_lock_irqsave(q->queue_lock, flags); -+ -+ if (!bic) -+ goto queue_fail; -+ -+ bfq_check_ioprio_change(bic, bio); -+ -+ bfq_bic_update_cgroup(bic, bio); -+ -+new_queue: -+ bfqq = bic_to_bfqq(bic, is_sync); -+ if (!bfqq || bfqq == &bfqd->oom_bfqq) { -+ if (bfqq) -+ bfq_put_queue(bfqq); -+ bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); -+ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); -+ -+ bic_set_bfqq(bic, bfqq, is_sync); -+ if (split && is_sync) { -+ bfq_log_bfqq(bfqd, bfqq, -+ "set_request: was_in_list %d " -+ "was_in_large_burst %d " -+ "large burst in progress %d", -+ bic->was_in_burst_list, -+ bic->saved_in_large_burst, -+ bfqd->large_burst); -+ -+ if ((bic->was_in_burst_list && bfqd->large_burst) || -+ bic->saved_in_large_burst) { -+ bfq_log_bfqq(bfqd, bfqq, -+ "set_request: marking in " -+ "large burst"); -+ bfq_mark_bfqq_in_large_burst(bfqq); -+ } else { -+ bfq_log_bfqq(bfqd, bfqq, -+ "set_request: clearing in " -+ "large burst"); -+ bfq_clear_bfqq_in_large_burst(bfqq); -+ if (bic->was_in_burst_list) -+ hlist_add_head(&bfqq->burst_list_node, -+ &bfqd->burst_list); -+ } -+ bfqq->split_time = jiffies; -+ } -+ } else { -+ /* If the queue was seeky for too long, break it apart. 
*/ -+ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { -+ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); -+ -+ /* Update bic before losing reference to bfqq */ -+ if (bfq_bfqq_in_large_burst(bfqq)) -+ bic->saved_in_large_burst = true; -+ -+ bfqq = bfq_split_bfqq(bic, bfqq); -+ split = true; -+ if (!bfqq) -+ goto new_queue; -+ else -+ bfqq_already_existing = true; -+ } -+ } -+ -+ bfqq->allocated[rw]++; -+ bfqq->ref++; -+ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, bfqq->ref); -+ -+ rq->elv.priv[0] = bic; -+ rq->elv.priv[1] = bfqq; -+ -+ /* -+ * If a bfq_queue has only one process reference, it is owned -+ * by only one bfq_io_cq: we can set the bic field of the -+ * bfq_queue to the address of that structure. Also, if the -+ * queue has just been split, mark a flag so that the -+ * information is available to the other scheduler hooks. -+ */ -+ if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { -+ bfqq->bic = bic; -+ if (split) { -+ /* -+ * If the queue has just been split from a shared -+ * queue, restore the idle window and the possible -+ * weight raising period. -+ */ -+ bfq_bfqq_resume_state(bfqq, bfqd, bic, -+ bfqq_already_existing); -+ } -+ } -+ -+ if (unlikely(bfq_bfqq_just_created(bfqq))) -+ bfq_handle_burst(bfqd, bfqq); -+ -+ spin_unlock_irqrestore(q->queue_lock, flags); -+ -+ return 0; -+ -+queue_fail: -+ bfq_schedule_dispatch(bfqd); -+ spin_unlock_irqrestore(q->queue_lock, flags); -+ -+ return 1; -+} -+ -+static void bfq_kick_queue(struct work_struct *work) -+{ -+ struct bfq_data *bfqd = -+ container_of(work, struct bfq_data, unplug_work); -+ struct request_queue *q = bfqd->queue; -+ -+ spin_lock_irq(q->queue_lock); -+ __blk_run_queue(q); -+ spin_unlock_irq(q->queue_lock); -+} -+ -+/* -+ * Handler of the expiration of the timer running if the in-service queue -+ * is idling inside its time slice. 
-+ */ -+static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) -+{ -+ struct bfq_data *bfqd = container_of(timer, struct bfq_data, -+ idle_slice_timer); -+ struct bfq_queue *bfqq; -+ unsigned long flags; -+ enum bfqq_expiration reason; -+ -+ spin_lock_irqsave(bfqd->queue->queue_lock, flags); -+ -+ bfqq = bfqd->in_service_queue; -+ /* -+ * Theoretical race here: the in-service queue can be NULL or -+ * different from the queue that was idling if the timer handler -+ * spins on the queue_lock and a new request arrives for the -+ * current queue and there is a full dispatch cycle that changes -+ * the in-service queue. This can hardly happen, but in the worst -+ * case we just expire a queue too early. -+ */ -+ if (bfqq) { -+ bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); -+ bfq_clear_bfqq_wait_request(bfqq); -+ -+ if (bfq_bfqq_budget_timeout(bfqq)) -+ /* -+ * Also here the queue can be safely expired -+ * for budget timeout without wasting -+ * guarantees -+ */ -+ reason = BFQ_BFQQ_BUDGET_TIMEOUT; -+ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) -+ /* -+ * The queue may not be empty upon timer expiration, -+ * because we may not disable the timer when the -+ * first request of the in-service queue arrives -+ * during disk idling. 
-+ */ -+ reason = BFQ_BFQQ_TOO_IDLE; -+ else -+ goto schedule_dispatch; -+ -+ bfq_bfqq_expire(bfqd, bfqq, true, reason); -+ } -+ -+schedule_dispatch: -+ bfq_schedule_dispatch(bfqd); -+ -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); -+ return HRTIMER_NORESTART; -+} -+ -+static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) -+{ -+ hrtimer_cancel(&bfqd->idle_slice_timer); -+ cancel_work_sync(&bfqd->unplug_work); -+} -+ -+static void __bfq_put_async_bfqq(struct bfq_data *bfqd, -+ struct bfq_queue **bfqq_ptr) -+{ -+ struct bfq_group *root_group = bfqd->root_group; -+ struct bfq_queue *bfqq = *bfqq_ptr; -+ -+ bfq_log(bfqd, "put_async_bfqq: %p", bfqq); -+ if (bfqq) { -+ bfq_bfqq_move(bfqd, bfqq, root_group); -+ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", -+ bfqq, bfqq->ref); -+ bfq_put_queue(bfqq); -+ *bfqq_ptr = NULL; -+ } -+} -+ -+/* -+ * Release all the bfqg references to its async queues. If we are -+ * deallocating the group these queues may still contain requests, so -+ * we reparent them to the root cgroup (i.e., the only one that will -+ * exist for sure until all the requests on a device are gone). 
-+ */ -+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) -+{ -+ int i, j; -+ -+ for (i = 0; i < 2; i++) -+ for (j = 0; j < IOPRIO_BE_NR; j++) -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); -+ -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); -+} -+ -+static void bfq_exit_queue(struct elevator_queue *e) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ struct request_queue *q = bfqd->queue; -+ struct bfq_queue *bfqq, *n; -+ -+ bfq_shutdown_timer_wq(bfqd); -+ -+ spin_lock_irq(q->queue_lock); -+ -+ BUG_ON(bfqd->in_service_queue); -+ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) -+ bfq_deactivate_bfqq(bfqd, bfqq, false, false); -+ -+ spin_unlock_irq(q->queue_lock); -+ -+ bfq_shutdown_timer_wq(bfqd); -+ -+ BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ blkcg_deactivate_policy(q, &blkcg_policy_bfq); -+#else -+ bfq_put_async_queues(bfqd, bfqd->root_group); -+ kfree(bfqd->root_group); -+#endif -+ -+ kfree(bfqd); -+} -+ -+static void bfq_init_root_group(struct bfq_group *root_group, -+ struct bfq_data *bfqd) -+{ -+ int i; -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ root_group->entity.parent = NULL; -+ root_group->my_entity = NULL; -+ root_group->bfqd = bfqd; -+#endif -+ root_group->rq_pos_tree = RB_ROOT; -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) -+ root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; -+ root_group->sched_data.bfq_class_idle_last_service = jiffies; -+} -+ -+static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) -+{ -+ struct bfq_data *bfqd; -+ struct elevator_queue *eq; -+ -+ eq = elevator_alloc(q, e); -+ if (!eq) -+ return -ENOMEM; -+ -+ bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node); -+ if (!bfqd) { -+ kobject_put(&eq->kobj); -+ return -ENOMEM; -+ } -+ eq->elevator_data = bfqd; -+ -+ /* -+ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. 
-+ * Grab a permanent reference to it, so that the normal code flow -+ * will not attempt to free it. -+ */ -+ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); -+ bfqd->oom_bfqq.ref++; -+ bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; -+ bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE; -+ bfqd->oom_bfqq.entity.new_weight = -+ bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio); -+ -+ /* oom_bfqq does not participate to bursts */ -+ bfq_clear_bfqq_just_created(&bfqd->oom_bfqq); -+ /* -+ * Trigger weight initialization, according to ioprio, at the -+ * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio -+ * class won't be changed any more. -+ */ -+ bfqd->oom_bfqq.entity.prio_changed = 1; -+ -+ bfqd->queue = q; -+ -+ spin_lock_irq(q->queue_lock); -+ q->elevator = eq; -+ spin_unlock_irq(q->queue_lock); -+ -+ bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node); -+ if (!bfqd->root_group) -+ goto out_free; -+ bfq_init_root_group(bfqd->root_group, bfqd); -+ bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); -+ -+ hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC, -+ HRTIMER_MODE_REL); -+ bfqd->idle_slice_timer.function = bfq_idle_slice_timer; -+ -+ bfqd->queue_weights_tree = RB_ROOT; -+ bfqd->group_weights_tree = RB_ROOT; -+ -+ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue); -+ -+ INIT_LIST_HEAD(&bfqd->active_list); -+ INIT_LIST_HEAD(&bfqd->idle_list); -+ INIT_HLIST_HEAD(&bfqd->burst_list); -+ -+ bfqd->hw_tag = -1; -+ -+ bfqd->bfq_max_budget = bfq_default_max_budget; -+ -+ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0]; -+ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1]; -+ bfqd->bfq_back_max = bfq_back_max; -+ bfqd->bfq_back_penalty = bfq_back_penalty; -+ bfqd->bfq_slice_idle = bfq_slice_idle; -+ bfqd->bfq_timeout = bfq_timeout; -+ -+ bfqd->bfq_requests_within_timer = 120; -+ -+ bfqd->bfq_large_burst_thresh = 8; -+ bfqd->bfq_burst_interval = msecs_to_jiffies(180); -+ -+ bfqd->low_latency = true; -+ -+ /* -+ * Trade-off between 
responsiveness and fairness. -+ */ -+ bfqd->bfq_wr_coeff = 30; -+ bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300); -+ bfqd->bfq_wr_max_time = 0; -+ bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000); -+ bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500); -+ bfqd->bfq_wr_max_softrt_rate = 7000; /* -+ * Approximate rate required -+ * to playback or record a -+ * high-definition compressed -+ * video. -+ */ -+ bfqd->wr_busy_queues = 0; -+ -+ /* -+ * Begin by assuming, optimistically, that the device is a -+ * high-speed one, and that its peak rate is equal to 2/3 of -+ * the highest reference rate. -+ */ -+ bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] * -+ T_fast[blk_queue_nonrot(bfqd->queue)]; -+ bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3; -+ bfqd->device_speed = BFQ_BFQD_FAST; -+ -+ return 0; -+ -+out_free: -+ kfree(bfqd); -+ kobject_put(&eq->kobj); -+ return -ENOMEM; -+} -+ -+static void bfq_slab_kill(void) -+{ -+ kmem_cache_destroy(bfq_pool); -+} -+ -+static int __init bfq_slab_setup(void) -+{ -+ bfq_pool = KMEM_CACHE(bfq_queue, 0); -+ if (!bfq_pool) -+ return -ENOMEM; -+ return 0; -+} -+ -+static ssize_t bfq_var_show(unsigned int var, char *page) -+{ -+ return sprintf(page, "%u\n", var); -+} -+ -+static ssize_t bfq_var_store(unsigned long *var, const char *page, -+ size_t count) -+{ -+ unsigned long new_val; -+ int ret = kstrtoul(page, 10, &new_val); -+ -+ if (ret == 0) -+ *var = new_val; -+ -+ return count; -+} -+ -+static ssize_t bfq_wr_max_time_show(struct elevator_queue *e, char *page) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ -+ return sprintf(page, "%d\n", bfqd->bfq_wr_max_time > 0 ? 
-+ jiffies_to_msecs(bfqd->bfq_wr_max_time) : -+ jiffies_to_msecs(bfq_wr_duration(bfqd))); -+} -+ -+static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) -+{ -+ struct bfq_queue *bfqq; -+ struct bfq_data *bfqd = e->elevator_data; -+ ssize_t num_char = 0; -+ -+ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n", -+ bfqd->queued); -+ -+ spin_lock_irq(bfqd->queue->queue_lock); -+ -+ num_char += sprintf(page + num_char, "Active:\n"); -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) { -+ num_char += sprintf(page + num_char, -+ "pid%d: weight %hu, nr_queued %d %d, ", -+ bfqq->pid, -+ bfqq->entity.weight, -+ bfqq->queued[0], -+ bfqq->queued[1]); -+ num_char += sprintf(page + num_char, -+ "dur %d/%u\n", -+ jiffies_to_msecs( -+ jiffies - -+ bfqq->last_wr_start_finish), -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } -+ -+ num_char += sprintf(page + num_char, "Idle:\n"); -+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) { -+ num_char += sprintf(page + num_char, -+ "pid%d: weight %hu, dur %d/%u\n", -+ bfqq->pid, -+ bfqq->entity.weight, -+ jiffies_to_msecs(jiffies - -+ bfqq->last_wr_start_finish), -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } -+ -+ spin_unlock_irq(bfqd->queue->queue_lock); -+ -+ return num_char; -+} -+ -+#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ -+static ssize_t __FUNC(struct elevator_queue *e, char *page) \ -+{ \ -+ struct bfq_data *bfqd = e->elevator_data; \ -+ u64 __data = __VAR; \ -+ if (__CONV == 1) \ -+ __data = jiffies_to_msecs(__data); \ -+ else if (__CONV == 2) \ -+ __data = div_u64(__data, NSEC_PER_MSEC); \ -+ return bfq_var_show(__data, (page)); \ -+} -+SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2); -+SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2); -+SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); -+SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); -+SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2); 
-+SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); -+SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1); -+SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0); -+SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); -+SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0); -+SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1); -+SHOW_FUNCTION(bfq_wr_min_idle_time_show, bfqd->bfq_wr_min_idle_time, 1); -+SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async, -+ 1); -+SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0); -+#undef SHOW_FUNCTION -+ -+#define USEC_SHOW_FUNCTION(__FUNC, __VAR) \ -+static ssize_t __FUNC(struct elevator_queue *e, char *page) \ -+{ \ -+ struct bfq_data *bfqd = e->elevator_data; \ -+ u64 __data = __VAR; \ -+ __data = div_u64(__data, NSEC_PER_USEC); \ -+ return bfq_var_show(__data, (page)); \ -+} -+USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle); -+#undef USEC_SHOW_FUNCTION -+ -+#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ -+static ssize_t \ -+__FUNC(struct elevator_queue *e, const char *page, size_t count) \ -+{ \ -+ struct bfq_data *bfqd = e->elevator_data; \ -+ unsigned long uninitialized_var(__data); \ -+ int ret = bfq_var_store(&__data, (page), count); \ -+ if (__data < (MIN)) \ -+ __data = (MIN); \ -+ else if (__data > (MAX)) \ -+ __data = (MAX); \ -+ if (__CONV == 1) \ -+ *(__PTR) = msecs_to_jiffies(__data); \ -+ else if (__CONV == 2) \ -+ *(__PTR) = (u64)__data * NSEC_PER_MSEC; \ -+ else \ -+ *(__PTR) = __data; \ -+ return ret; \ -+} -+STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, -+ INT_MAX, 2); -+STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, -+ INT_MAX, 2); -+STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); -+STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, -+ INT_MAX, 0); 
-+STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2); -+STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0); -+STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1); -+STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX, -+ 1); -+STORE_FUNCTION(bfq_wr_min_idle_time_store, &bfqd->bfq_wr_min_idle_time, 0, -+ INT_MAX, 1); -+STORE_FUNCTION(bfq_wr_min_inter_arr_async_store, -+ &bfqd->bfq_wr_min_inter_arr_async, 0, INT_MAX, 1); -+STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0, -+ INT_MAX, 0); -+#undef STORE_FUNCTION -+ -+#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ -+static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\ -+{ \ -+ struct bfq_data *bfqd = e->elevator_data; \ -+ unsigned long uninitialized_var(__data); \ -+ int ret = bfq_var_store(&__data, (page), count); \ -+ if (__data < (MIN)) \ -+ __data = (MIN); \ -+ else if (__data > (MAX)) \ -+ __data = (MAX); \ -+ *(__PTR) = (u64)__data * NSEC_PER_USEC; \ -+ return ret; \ -+} -+USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0, -+ UINT_MAX); -+#undef USEC_STORE_FUNCTION -+ -+/* do nothing for the moment */ -+static ssize_t bfq_weights_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ return count; -+} -+ -+static ssize_t bfq_max_budget_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ unsigned long uninitialized_var(__data); -+ int ret = bfq_var_store(&__data, (page), count); -+ -+ if (__data == 0) -+ bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); -+ else { -+ if (__data > INT_MAX) -+ __data = INT_MAX; -+ bfqd->bfq_max_budget = __data; -+ } -+ -+ bfqd->bfq_user_max_budget = __data; -+ -+ return ret; -+} -+ -+/* -+ * Leaving this name to preserve name compatibility with cfq -+ * parameters, but this timeout is used for both sync and async. 
-+ */ -+static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ unsigned long uninitialized_var(__data); -+ int ret = bfq_var_store(&__data, (page), count); -+ -+ if (__data < 1) -+ __data = 1; -+ else if (__data > INT_MAX) -+ __data = INT_MAX; -+ -+ bfqd->bfq_timeout = msecs_to_jiffies(__data); -+ if (bfqd->bfq_user_max_budget == 0) -+ bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); -+ -+ return ret; -+} -+ -+static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ unsigned long uninitialized_var(__data); -+ int ret = bfq_var_store(&__data, (page), count); -+ -+ if (__data > 1) -+ __data = 1; -+ if (!bfqd->strict_guarantees && __data == 1 -+ && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC) -+ bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC; -+ -+ bfqd->strict_guarantees = __data; -+ -+ return ret; -+} -+ -+static ssize_t bfq_low_latency_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ unsigned long uninitialized_var(__data); -+ int ret = bfq_var_store(&__data, (page), count); -+ -+ if (__data > 1) -+ __data = 1; -+ if (__data == 0 && bfqd->low_latency != 0) -+ bfq_end_wr(bfqd); -+ bfqd->low_latency = __data; -+ -+ return ret; -+} -+ -+#define BFQ_ATTR(name) \ -+ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store) -+ -+static struct elv_fs_entry bfq_attrs[] = { -+ BFQ_ATTR(fifo_expire_sync), -+ BFQ_ATTR(fifo_expire_async), -+ BFQ_ATTR(back_seek_max), -+ BFQ_ATTR(back_seek_penalty), -+ BFQ_ATTR(slice_idle), -+ BFQ_ATTR(slice_idle_us), -+ BFQ_ATTR(max_budget), -+ BFQ_ATTR(timeout_sync), -+ BFQ_ATTR(strict_guarantees), -+ BFQ_ATTR(low_latency), -+ BFQ_ATTR(wr_coeff), -+ BFQ_ATTR(wr_max_time), -+ BFQ_ATTR(wr_rt_max_time), -+ BFQ_ATTR(wr_min_idle_time), -+ BFQ_ATTR(wr_min_inter_arr_async), -+ 
BFQ_ATTR(wr_max_softrt_rate), -+ BFQ_ATTR(weights), -+ __ATTR_NULL -+}; -+ -+static struct elevator_type iosched_bfq = { -+ .ops.sq = { -+ .elevator_merge_fn = bfq_merge, -+ .elevator_merged_fn = bfq_merged_request, -+ .elevator_merge_req_fn = bfq_merged_requests, -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ .elevator_bio_merged_fn = bfq_bio_merged, -+#endif -+ .elevator_allow_bio_merge_fn = bfq_allow_bio_merge, -+ .elevator_allow_rq_merge_fn = bfq_allow_rq_merge, -+ .elevator_dispatch_fn = bfq_dispatch_requests, -+ .elevator_add_req_fn = bfq_insert_request, -+ .elevator_activate_req_fn = bfq_activate_request, -+ .elevator_deactivate_req_fn = bfq_deactivate_request, -+ .elevator_completed_req_fn = bfq_completed_request, -+ .elevator_former_req_fn = elv_rb_former_request, -+ .elevator_latter_req_fn = elv_rb_latter_request, -+ .elevator_init_icq_fn = bfq_init_icq, -+ .elevator_exit_icq_fn = bfq_exit_icq, -+ .elevator_set_req_fn = bfq_set_request, -+ .elevator_put_req_fn = bfq_put_request, -+ .elevator_may_queue_fn = bfq_may_queue, -+ .elevator_init_fn = bfq_init_queue, -+ .elevator_exit_fn = bfq_exit_queue, -+ }, -+ .icq_size = sizeof(struct bfq_io_cq), -+ .icq_align = __alignof__(struct bfq_io_cq), -+ .elevator_attrs = bfq_attrs, -+ .elevator_name = "bfq-sq", -+ .elevator_owner = THIS_MODULE, -+}; -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+static struct blkcg_policy blkcg_policy_bfq = { -+ .dfl_cftypes = bfq_blkg_files, -+ .legacy_cftypes = bfq_blkcg_legacy_files, -+ -+ .cpd_alloc_fn = bfq_cpd_alloc, -+ .cpd_init_fn = bfq_cpd_init, -+ .cpd_bind_fn = bfq_cpd_init, -+ .cpd_free_fn = bfq_cpd_free, -+ -+ .pd_alloc_fn = bfq_pd_alloc, -+ .pd_init_fn = bfq_pd_init, -+ .pd_offline_fn = bfq_pd_offline, -+ .pd_free_fn = bfq_pd_free, -+ .pd_reset_stats_fn = bfq_pd_reset_stats, -+}; -+#endif -+ -+static int __init bfq_init(void) -+{ -+ int ret; -+ char msg[60] = "BFQ I/O-scheduler: v8r12"; -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ ret = blkcg_policy_register(&blkcg_policy_bfq); -+ 
if (ret) -+ return ret; -+#endif -+ -+ ret = -ENOMEM; -+ if (bfq_slab_setup()) -+ goto err_pol_unreg; -+ -+ /* -+ * Times to load large popular applications for the typical -+ * systems installed on the reference devices (see the -+ * comments before the definitions of the next two -+ * arrays). Actually, we use slightly slower values, as the -+ * estimated peak rate tends to be smaller than the actual -+ * peak rate. The reason for this last fact is that estimates -+ * are computed over much shorter time intervals than the long -+ * intervals typically used for benchmarking. Why? First, to -+ * adapt more quickly to variations. Second, because an I/O -+ * scheduler cannot rely on a peak-rate-evaluation workload to -+ * be run for a long time. -+ */ -+ T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */ -+ T_slow[1] = msecs_to_jiffies(6000); /* actually 6.5 sec */ -+ T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */ -+ T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */ -+ -+ /* -+ * Thresholds that determine the switch between speed classes -+ * (see the comments before the definition of the array -+ * device_speed_thresh). These thresholds are biased towards -+ * transitions to the fast class. This is safer than the -+ * opposite bias. In fact, a wrong transition to the slow -+ * class results in short weight-raising periods, because the -+ * speed of the device then tends to be higher that the -+ * reference peak rate. On the opposite end, a wrong -+ * transition to the fast class tends to increase -+ * weight-raising periods, because of the opposite reason. 
-+ */ -+ device_speed_thresh[0] = (4 * R_slow[0]) / 3; -+ device_speed_thresh[1] = (4 * R_slow[1]) / 3; -+ -+ ret = elv_register(&iosched_bfq); -+ if (ret) -+ goto err_pol_unreg; -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ strcat(msg, " (with cgroups support)"); -+#endif -+ pr_info("%s", msg); -+ -+ return 0; -+ -+err_pol_unreg: -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ blkcg_policy_unregister(&blkcg_policy_bfq); -+#endif -+ return ret; -+} -+ -+static void __exit bfq_exit(void) -+{ -+ elv_unregister(&iosched_bfq); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ blkcg_policy_unregister(&blkcg_policy_bfq); -+#endif -+ bfq_slab_kill(); -+} -+ -+module_init(bfq_init); -+module_exit(bfq_exit); -+ -+MODULE_AUTHOR("Arianna Avanzini, Fabio Checconi, Paolo Valente"); -+MODULE_LICENSE("GPL"); -diff --git a/block/bfq.h b/block/bfq.h -new file mode 100644 -index 000000000000..f5751ea59d98 ---- /dev/null -+++ b/block/bfq.h -@@ -0,0 +1,948 @@ -+/* -+ * BFQ v8r12 for 4.11.0: data structures and common functions prototypes. -+ * -+ * Based on ideas and code from CFQ: -+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> -+ * -+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> -+ * Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org> -+ */ -+ -+#ifndef _BFQ_H -+#define _BFQ_H -+ -+#include <linux/blktrace_api.h> -+#include <linux/hrtimer.h> -+#include <linux/blk-cgroup.h> -+ -+#define BFQ_IOPRIO_CLASSES 3 -+#define BFQ_CL_IDLE_TIMEOUT (HZ/5) -+ -+#define BFQ_MIN_WEIGHT 1 -+#define BFQ_MAX_WEIGHT 1000 -+#define BFQ_WEIGHT_CONVERSION_COEFF 10 -+ -+#define BFQ_DEFAULT_QUEUE_IOPRIO 4 -+ -+#define BFQ_WEIGHT_LEGACY_DFL 100 -+#define BFQ_DEFAULT_GRP_IOPRIO 0 -+#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE -+ -+/* -+ * Soft real-time applications are extremely more latency sensitive -+ * than interactive ones. 
Over-raise the weight of the former to -+ * privilege them against the latter. -+ */ -+#define BFQ_SOFTRT_WEIGHT_FACTOR 100 -+ -+struct bfq_entity; -+ -+/** -+ * struct bfq_service_tree - per ioprio_class service tree. -+ * -+ * Each service tree represents a B-WF2Q+ scheduler on its own. Each -+ * ioprio_class has its own independent scheduler, and so its own -+ * bfq_service_tree. All the fields are protected by the queue lock -+ * of the containing bfqd. -+ */ -+struct bfq_service_tree { -+ /* tree for active entities (i.e., those backlogged) */ -+ struct rb_root active; -+ /* tree for idle entities (i.e., not backlogged, with V <= F_i)*/ -+ struct rb_root idle; -+ -+ struct bfq_entity *first_idle; /* idle entity with minimum F_i */ -+ struct bfq_entity *last_idle; /* idle entity with maximum F_i */ -+ -+ u64 vtime; /* scheduler virtual time */ -+ /* scheduler weight sum; active and idle entities contribute to it */ -+ unsigned long wsum; -+}; -+ -+/** -+ * struct bfq_sched_data - multi-class scheduler. -+ * -+ * bfq_sched_data is the basic scheduler queue. It supports three -+ * ioprio_classes, and can be used either as a toplevel queue or as an -+ * intermediate queue on a hierarchical setup. @next_in_service -+ * points to the active entity of the sched_data service trees that -+ * will be scheduled next. It is used to reduce the number of steps -+ * needed for each hierarchical-schedule update. -+ * -+ * The supported ioprio_classes are the same as in CFQ, in descending -+ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. -+ * Requests from higher priority queues are served before all the -+ * requests from lower priority queues; among requests of the same -+ * queue requests are served according to B-WF2Q+. -+ * All the fields are protected by the queue lock of the containing bfqd. 
-+ */ -+struct bfq_sched_data { -+ struct bfq_entity *in_service_entity; /* entity in service */ -+ /* head-of-the-line entity in the scheduler (see comments above) */ -+ struct bfq_entity *next_in_service; -+ /* array of service trees, one per ioprio_class */ -+ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; -+ /* last time CLASS_IDLE was served */ -+ unsigned long bfq_class_idle_last_service; -+ -+}; -+ -+/** -+ * struct bfq_weight_counter - counter of the number of all active entities -+ * with a given weight. -+ */ -+struct bfq_weight_counter { -+ unsigned int weight; /* weight of the entities this counter refers to */ -+ unsigned int num_active; /* nr of active entities with this weight */ -+ /* -+ * Weights tree member (see bfq_data's @queue_weights_tree and -+ * @group_weights_tree) -+ */ -+ struct rb_node weights_node; -+}; -+ -+/** -+ * struct bfq_entity - schedulable entity. -+ * -+ * A bfq_entity is used to represent either a bfq_queue (leaf node in the -+ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each -+ * entity belongs to the sched_data of the parent group in the cgroup -+ * hierarchy. Non-leaf entities have also their own sched_data, stored -+ * in @my_sched_data. -+ * -+ * Each entity stores independently its priority values; this would -+ * allow different weights on different devices, but this -+ * functionality is not exported to userspace by now. Priorities and -+ * weights are updated lazily, first storing the new values into the -+ * new_* fields, then setting the @prio_changed flag. As soon as -+ * there is a transition in the entity state that allows the priority -+ * update to take place the effective and the requested priority -+ * values are synchronized. -+ * -+ * Unless cgroups are used, the weight value is calculated from the -+ * ioprio to export the same interface as CFQ. 
When dealing with -+ * ``well-behaved'' queues (i.e., queues that do not spend too much -+ * time to consume their budget and have true sequential behavior, and -+ * when there are no external factors breaking anticipation) the -+ * relative weights at each level of the cgroups hierarchy should be -+ * guaranteed. All the fields are protected by the queue lock of the -+ * containing bfqd. -+ */ -+struct bfq_entity { -+ struct rb_node rb_node; /* service_tree member */ -+ /* pointer to the weight counter associated with this entity */ -+ struct bfq_weight_counter *weight_counter; -+ -+ /* -+ * Flag, true if the entity is on a tree (either the active or -+ * the idle one of its service_tree) or is in service. -+ */ -+ bool on_st; -+ -+ u64 finish; /* B-WF2Q+ finish timestamp (aka F_i) */ -+ u64 start; /* B-WF2Q+ start timestamp (aka S_i) */ -+ -+ /* tree the entity is enqueued into; %NULL if not on a tree */ -+ struct rb_root *tree; -+ -+ /* -+ * minimum start time of the (active) subtree rooted at this -+ * entity; used for O(log N) lookups into active trees -+ */ -+ u64 min_start; -+ -+ /* amount of service received during the last service slot */ -+ int service; -+ -+ /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */ -+ int budget; -+ -+ unsigned int weight; /* weight of the queue */ -+ unsigned int new_weight; /* next weight if a change is in progress */ -+ -+ /* original weight, used to implement weight boosting */ -+ unsigned int orig_weight; -+ -+ /* parent entity, for hierarchical scheduling */ -+ struct bfq_entity *parent; -+ -+ /* -+ * For non-leaf nodes in the hierarchy, the associated -+ * scheduler queue, %NULL on leaf nodes. 
-+ */ -+ struct bfq_sched_data *my_sched_data; -+ /* the scheduler queue this entity belongs to */ -+ struct bfq_sched_data *sched_data; -+ -+ /* flag, set to request a weight, ioprio or ioprio_class change */ -+ int prio_changed; -+}; -+ -+struct bfq_group; -+ -+/** -+ * struct bfq_queue - leaf schedulable entity. -+ * -+ * A bfq_queue is a leaf request queue; it can be associated with an -+ * io_context or more, if it is async or shared between cooperating -+ * processes. @cgroup holds a reference to the cgroup, to be sure that it -+ * does not disappear while a bfqq still references it (mostly to avoid -+ * races between request issuing and task migration followed by cgroup -+ * destruction). -+ * All the fields are protected by the queue lock of the containing bfqd. -+ */ -+struct bfq_queue { -+ /* reference counter */ -+ int ref; -+ /* parent bfq_data */ -+ struct bfq_data *bfqd; -+ -+ /* current ioprio and ioprio class */ -+ unsigned short ioprio, ioprio_class; -+ /* next ioprio and ioprio class if a change is in progress */ -+ unsigned short new_ioprio, new_ioprio_class; -+ -+ /* -+ * Shared bfq_queue if queue is cooperating with one or more -+ * other queues. 
-+ */ -+ struct bfq_queue *new_bfqq; -+ /* request-position tree member (see bfq_group's @rq_pos_tree) */ -+ struct rb_node pos_node; -+ /* request-position tree root (see bfq_group's @rq_pos_tree) */ -+ struct rb_root *pos_root; -+ -+ /* sorted list of pending requests */ -+ struct rb_root sort_list; -+ /* if fifo isn't expired, next request to serve */ -+ struct request *next_rq; -+ /* number of sync and async requests queued */ -+ int queued[2]; -+ /* number of sync and async requests currently allocated */ -+ int allocated[2]; -+ /* number of pending metadata requests */ -+ int meta_pending; -+ /* fifo list of requests in sort_list */ -+ struct list_head fifo; -+ -+ /* entity representing this queue in the scheduler */ -+ struct bfq_entity entity; -+ -+ /* maximum budget allowed from the feedback mechanism */ -+ int max_budget; -+ /* budget expiration (in jiffies) */ -+ unsigned long budget_timeout; -+ -+ /* number of requests on the dispatch list or inside driver */ -+ int dispatched; -+ -+ unsigned int flags; /* status flags.*/ -+ -+ /* node for active/idle bfqq list inside parent bfqd */ -+ struct list_head bfqq_list; -+ -+ /* bit vector: a 1 for each seeky requests in history */ -+ u32 seek_history; -+ -+ /* node for the device's burst list */ -+ struct hlist_node burst_list_node; -+ -+ /* position of the last request enqueued */ -+ sector_t last_request_pos; -+ -+ /* Number of consecutive pairs of request completion and -+ * arrival, such that the queue becomes idle after the -+ * completion, but the next request arrives within an idle -+ * time slice; used only if the queue's IO_bound flag has been -+ * cleared. -+ */ -+ unsigned int requests_within_timer; -+ -+ /* pid of the process owning the queue, used for logging purposes */ -+ pid_t pid; -+ -+ /* -+ * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL -+ * if the queue is shared. 
-+ */ -+ struct bfq_io_cq *bic; -+ -+ /* current maximum weight-raising time for this queue */ -+ unsigned long wr_cur_max_time; -+ /* -+ * Minimum time instant such that, only if a new request is -+ * enqueued after this time instant in an idle @bfq_queue with -+ * no outstanding requests, then the task associated with the -+ * queue it is deemed as soft real-time (see the comments on -+ * the function bfq_bfqq_softrt_next_start()) -+ */ -+ unsigned long soft_rt_next_start; -+ /* -+ * Start time of the current weight-raising period if -+ * the @bfq-queue is being weight-raised, otherwise -+ * finish time of the last weight-raising period. -+ */ -+ unsigned long last_wr_start_finish; -+ /* factor by which the weight of this queue is multiplied */ -+ unsigned int wr_coeff; -+ /* -+ * Time of the last transition of the @bfq_queue from idle to -+ * backlogged. -+ */ -+ unsigned long last_idle_bklogged; -+ /* -+ * Cumulative service received from the @bfq_queue since the -+ * last transition from idle to backlogged. -+ */ -+ unsigned long service_from_backlogged; -+ /* -+ * Value of wr start time when switching to soft rt -+ */ -+ unsigned long wr_start_at_switch_to_srt; -+ -+ unsigned long split_time; /* time of last split */ -+}; -+ -+/** -+ * struct bfq_ttime - per process thinktime stats. -+ */ -+struct bfq_ttime { -+ u64 last_end_request; /* completion time of last request */ -+ -+ u64 ttime_total; /* total process thinktime */ -+ unsigned long ttime_samples; /* number of thinktime samples */ -+ u64 ttime_mean; /* average process thinktime */ -+ -+}; -+ -+/** -+ * struct bfq_io_cq - per (request_queue, io_context) structure. 
-+ */ -+struct bfq_io_cq { -+ /* associated io_cq structure */ -+ struct io_cq icq; /* must be the first member */ -+ /* array of two process queues, the sync and the async */ -+ struct bfq_queue *bfqq[2]; -+ /* associated @bfq_ttime struct */ -+ struct bfq_ttime ttime; -+ /* per (request_queue, blkcg) ioprio */ -+ int ioprio; -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ uint64_t blkcg_serial_nr; /* the current blkcg serial */ -+#endif -+ -+ /* -+ * Snapshot of the idle window before merging; taken to -+ * remember this value while the queue is merged, so as to be -+ * able to restore it in case of split. -+ */ -+ bool saved_idle_window; -+ /* -+ * Same purpose as the previous two fields for the I/O bound -+ * classification of a queue. -+ */ -+ bool saved_IO_bound; -+ -+ /* -+ * Same purpose as the previous fields for the value of the -+ * field keeping the queue's belonging to a large burst -+ */ -+ bool saved_in_large_burst; -+ /* -+ * True if the queue belonged to a burst list before its merge -+ * with another cooperating queue. -+ */ -+ bool was_in_burst_list; -+ -+ /* -+ * Similar to previous fields: save wr information. -+ */ -+ unsigned long saved_wr_coeff; -+ unsigned long saved_last_wr_start_finish; -+ unsigned long saved_wr_start_at_switch_to_srt; -+ unsigned int saved_wr_cur_max_time; -+}; -+ -+enum bfq_device_speed { -+ BFQ_BFQD_FAST, -+ BFQ_BFQD_SLOW, -+}; -+ -+/** -+ * struct bfq_data - per-device data structure. -+ * -+ * All the fields are protected by the @queue lock. -+ */ -+struct bfq_data { -+ /* request queue for the device */ -+ struct request_queue *queue; -+ -+ /* root bfq_group for the device */ -+ struct bfq_group *root_group; -+ -+ /* -+ * rbtree of weight counters of @bfq_queues, sorted by -+ * weight. Used to keep track of whether all @bfq_queues have -+ * the same weight. 
The tree contains one counter for each -+ * distinct weight associated to some active and not -+ * weight-raised @bfq_queue (see the comments to the functions -+ * bfq_weights_tree_[add|remove] for further details). -+ */ -+ struct rb_root queue_weights_tree; -+ /* -+ * rbtree of non-queue @bfq_entity weight counters, sorted by -+ * weight. Used to keep track of whether all @bfq_groups have -+ * the same weight. The tree contains one counter for each -+ * distinct weight associated to some active @bfq_group (see -+ * the comments to the functions bfq_weights_tree_[add|remove] -+ * for further details). -+ */ -+ struct rb_root group_weights_tree; -+ -+ /* -+ * Number of bfq_queues containing requests (including the -+ * queue in service, even if it is idling). -+ */ -+ int busy_queues; -+ /* number of weight-raised busy @bfq_queues */ -+ int wr_busy_queues; -+ /* number of queued requests */ -+ int queued; -+ /* number of requests dispatched and waiting for completion */ -+ int rq_in_driver; -+ -+ /* -+ * Maximum number of requests in driver in the last -+ * @hw_tag_samples completed requests. -+ */ -+ int max_rq_in_driver; -+ /* number of samples used to calculate hw_tag */ -+ int hw_tag_samples; -+ /* flag set to one if the driver is showing a queueing behavior */ -+ int hw_tag; -+ -+ /* number of budgets assigned */ -+ int budgets_assigned; -+ -+ /* -+ * Timer set when idling (waiting) for the next request from -+ * the queue in service. 
-+ */ -+ struct hrtimer idle_slice_timer; -+ /* delayed work to restart dispatching on the request queue */ -+ struct work_struct unplug_work; -+ -+ /* bfq_queue in service */ -+ struct bfq_queue *in_service_queue; -+ /* bfq_io_cq (bic) associated with the @in_service_queue */ -+ struct bfq_io_cq *in_service_bic; -+ -+ /* on-disk position of the last served request */ -+ sector_t last_position; -+ -+ /* time of last request completion (ns) */ -+ u64 last_completion; -+ -+ /* time of first rq dispatch in current observation interval (ns) */ -+ u64 first_dispatch; -+ /* time of last rq dispatch in current observation interval (ns) */ -+ u64 last_dispatch; -+ -+ /* beginning of the last budget */ -+ ktime_t last_budget_start; -+ /* beginning of the last idle slice */ -+ ktime_t last_idling_start; -+ -+ /* number of samples in current observation interval */ -+ int peak_rate_samples; -+ /* num of samples of seq dispatches in current observation interval */ -+ u32 sequential_samples; -+ /* total num of sectors transferred in current observation interval */ -+ u64 tot_sectors_dispatched; -+ /* max rq size seen during current observation interval (sectors) */ -+ u32 last_rq_max_size; -+ /* time elapsed from first dispatch in current observ. interval (us) */ -+ u64 delta_from_first; -+ /* current estimate of device peak rate */ -+ u32 peak_rate; -+ -+ /* maximum budget allotted to a bfq_queue before rescheduling */ -+ int bfq_max_budget; -+ -+ /* list of all the bfq_queues active on the device */ -+ struct list_head active_list; -+ /* list of all the bfq_queues idle on the device */ -+ struct list_head idle_list; -+ -+ /* -+ * Timeout for async/sync requests; when it fires, requests -+ * are served in fifo order. 
-+ */ -+ u64 bfq_fifo_expire[2]; -+ /* weight of backward seeks wrt forward ones */ -+ unsigned int bfq_back_penalty; -+ /* maximum allowed backward seek */ -+ unsigned int bfq_back_max; -+ /* maximum idling time */ -+ u32 bfq_slice_idle; -+ -+ /* user-configured max budget value (0 for auto-tuning) */ -+ int bfq_user_max_budget; -+ /* -+ * Timeout for bfq_queues to consume their budget; used to -+ * prevent seeky queues from imposing long latencies to -+ * sequential or quasi-sequential ones (this also implies that -+ * seeky queues cannot receive guarantees in the service -+ * domain; after a timeout they are charged for the time they -+ * have been in service, to preserve fairness among them, but -+ * without service-domain guarantees). -+ */ -+ unsigned int bfq_timeout; -+ -+ /* -+ * Number of consecutive requests that must be issued within -+ * the idle time slice to set again idling to a queue which -+ * was marked as non-I/O-bound (see the definition of the -+ * IO_bound flag for further details). -+ */ -+ unsigned int bfq_requests_within_timer; -+ -+ /* -+ * Force device idling whenever needed to provide accurate -+ * service guarantees, without caring about throughput -+ * issues. CAVEAT: this may even increase latencies, in case -+ * of useless idling for processes that did stop doing I/O. -+ */ -+ bool strict_guarantees; -+ -+ /* -+ * Last time at which a queue entered the current burst of -+ * queues being activated shortly after each other; for more -+ * details about this and the following parameters related to -+ * a burst of activations, see the comments on the function -+ * bfq_handle_burst. -+ */ -+ unsigned long last_ins_in_burst; -+ /* -+ * Reference time interval used to decide whether a queue has -+ * been activated shortly after @last_ins_in_burst. 
-+ */ -+ unsigned long bfq_burst_interval; -+ /* number of queues in the current burst of queue activations */ -+ int burst_size; -+ -+ /* common parent entity for the queues in the burst */ -+ struct bfq_entity *burst_parent_entity; -+ /* Maximum burst size above which the current queue-activation -+ * burst is deemed as 'large'. -+ */ -+ unsigned long bfq_large_burst_thresh; -+ /* true if a large queue-activation burst is in progress */ -+ bool large_burst; -+ /* -+ * Head of the burst list (as for the above fields, more -+ * details in the comments on the function bfq_handle_burst). -+ */ -+ struct hlist_head burst_list; -+ -+ /* if set to true, low-latency heuristics are enabled */ -+ bool low_latency; -+ /* -+ * Maximum factor by which the weight of a weight-raised queue -+ * is multiplied. -+ */ -+ unsigned int bfq_wr_coeff; -+ /* maximum duration of a weight-raising period (jiffies) */ -+ unsigned int bfq_wr_max_time; -+ -+ /* Maximum weight-raising duration for soft real-time processes */ -+ unsigned int bfq_wr_rt_max_time; -+ /* -+ * Minimum idle period after which weight-raising may be -+ * reactivated for a queue (in jiffies). -+ */ -+ unsigned int bfq_wr_min_idle_time; -+ /* -+ * Minimum period between request arrivals after which -+ * weight-raising may be reactivated for an already busy async -+ * queue (in jiffies). -+ */ -+ unsigned long bfq_wr_min_inter_arr_async; -+ -+ /* Max service-rate for a soft real-time queue, in sectors/sec */ -+ unsigned int bfq_wr_max_softrt_rate; -+ /* -+ * Cached value of the product R*T, used for computing the -+ * maximum duration of weight raising automatically. 
-+ */ -+ u64 RT_prod; -+ /* device-speed class for the low-latency heuristic */ -+ enum bfq_device_speed device_speed; -+ -+ /* fallback dummy bfqq for extreme OOM conditions */ -+ struct bfq_queue oom_bfqq; -+}; -+ -+enum bfqq_state_flags { -+ BFQ_BFQQ_FLAG_just_created = 0, /* queue just allocated */ -+ BFQ_BFQQ_FLAG_busy, /* has requests or is in service */ -+ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ -+ BFQ_BFQQ_FLAG_non_blocking_wait_rq, /* -+ * waiting for a request -+ * without idling the device -+ */ -+ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ -+ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ -+ BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */ -+ BFQ_BFQQ_FLAG_sync, /* synchronous queue */ -+ BFQ_BFQQ_FLAG_IO_bound, /* -+ * bfqq has timed-out at least once -+ * having consumed at most 2/10 of -+ * its budget -+ */ -+ BFQ_BFQQ_FLAG_in_large_burst, /* -+ * bfqq activated in a large burst, -+ * see comments to bfq_handle_burst. -+ */ -+ BFQ_BFQQ_FLAG_softrt_update, /* -+ * may need softrt-next-start -+ * update -+ */ -+ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ -+ BFQ_BFQQ_FLAG_split_coop /* shared bfqq will be split */ -+}; -+ -+#define BFQ_BFQQ_FNS(name) \ -+static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ -+{ \ -+ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \ -+} \ -+static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ -+{ \ -+ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \ -+} \ -+static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ -+{ \ -+ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ -+} -+ -+BFQ_BFQQ_FNS(just_created); -+BFQ_BFQQ_FNS(busy); -+BFQ_BFQQ_FNS(wait_request); -+BFQ_BFQQ_FNS(non_blocking_wait_rq); -+BFQ_BFQQ_FNS(must_alloc); -+BFQ_BFQQ_FNS(fifo_expire); -+BFQ_BFQQ_FNS(idle_window); -+BFQ_BFQQ_FNS(sync); -+BFQ_BFQQ_FNS(IO_bound); -+BFQ_BFQQ_FNS(in_large_burst); -+BFQ_BFQQ_FNS(coop); -+BFQ_BFQQ_FNS(split_coop); -+BFQ_BFQQ_FNS(softrt_update); -+#undef 
BFQ_BFQQ_FNS -+ -+/* Logging facilities. */ -+#ifdef CONFIG_BFQ_REDIRECT_TO_CONSOLE -+ -+static const char *checked_dev_name(const struct device *dev) -+{ -+ static const char nodev[] = "nodev"; -+ -+ if (dev) -+ return dev_name(dev); -+ -+ return nodev; -+} -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); -+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ -+ char __pbuf[128]; \ -+ \ -+ assert_spin_locked((bfqd)->queue->queue_lock); \ -+ blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ -+ pr_crit("%s bfq%d%c %s " fmt "\n", \ -+ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -+ (bfqq)->pid, \ -+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ __pbuf, ##args); \ -+} while (0) -+ -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ -+ char __pbuf[128]; \ -+ \ -+ blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ -+ pr_crit("%s %s " fmt "\n", \ -+ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -+ __pbuf, ##args); \ -+} while (0) -+ -+#else /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ -+ pr_crit("%s bfq%d%c " fmt "\n", \ -+ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -+ (bfqq)->pid, bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ ##args) -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) -+ -+#endif /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ -+ -+#define bfq_log(bfqd, fmt, args...) \ -+ pr_crit("%s bfq " fmt "\n", \ -+ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -+ ##args) -+ -+#else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); -+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) 
do { \ -+ char __pbuf[128]; \ -+ \ -+ assert_spin_locked((bfqd)->queue->queue_lock); \ -+ blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ -+ blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, \ -+ (bfqq)->pid, \ -+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ __pbuf, ##args); \ -+} while (0) -+ -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ -+ char __pbuf[128]; \ -+ \ -+ blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ -+ blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \ -+} while (0) -+ -+#else /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ -+ blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \ -+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ ##args) -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) -+ -+#endif /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ -+ -+#define bfq_log(bfqd, fmt, args...) \ -+ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) -+#endif /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ -+ -+/* Expiration reasons. 
*/ -+enum bfqq_expiration { -+ BFQ_BFQQ_TOO_IDLE = 0, /* -+ * queue has been idling for -+ * too long -+ */ -+ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ -+ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ -+ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ -+ BFQ_BFQQ_PREEMPTED /* preemption in progress */ -+}; -+ -+ -+struct bfqg_stats { -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ /* number of ios merged */ -+ struct blkg_rwstat merged; -+ /* total time spent on device in ns, may not be accurate w/ queueing */ -+ struct blkg_rwstat service_time; -+ /* total time spent waiting in scheduler queue in ns */ -+ struct blkg_rwstat wait_time; -+ /* number of IOs queued up */ -+ struct blkg_rwstat queued; -+ /* total disk time and nr sectors dispatched by this group */ -+ struct blkg_stat time; -+ /* sum of number of ios queued across all samples */ -+ struct blkg_stat avg_queue_size_sum; -+ /* count of samples taken for average */ -+ struct blkg_stat avg_queue_size_samples; -+ /* how many times this group has been removed from service tree */ -+ struct blkg_stat dequeue; -+ /* total time spent waiting for it to be assigned a timeslice. */ -+ struct blkg_stat group_wait_time; -+ /* time spent idling for this blkcg_gq */ -+ struct blkg_stat idle_time; -+ /* total time with empty current active q with other requests queued */ -+ struct blkg_stat empty_time; -+ /* fields after this shouldn't be cleared on stat reset */ -+ uint64_t start_group_wait_time; -+ uint64_t start_idle_time; -+ uint64_t start_empty_time; -+ uint16_t flags; -+#endif -+}; -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+/* -+ * struct bfq_group_data - per-blkcg storage for the blkio subsystem. 
-+ * -+ * @ps: @blkcg_policy_storage that this structure inherits -+ * @weight: weight of the bfq_group -+ */ -+struct bfq_group_data { -+ /* must be the first member */ -+ struct blkcg_policy_data pd; -+ -+ unsigned int weight; -+}; -+ -+/** -+ * struct bfq_group - per (device, cgroup) data structure. -+ * @entity: schedulable entity to insert into the parent group sched_data. -+ * @sched_data: own sched_data, to contain child entities (they may be -+ * both bfq_queues and bfq_groups). -+ * @bfqd: the bfq_data for the device this group acts upon. -+ * @async_bfqq: array of async queues for all the tasks belonging to -+ * the group, one queue per ioprio value per ioprio_class, -+ * except for the idle class that has only one queue. -+ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). -+ * @my_entity: pointer to @entity, %NULL for the toplevel group; used -+ * to avoid too many special cases during group creation/ -+ * migration. -+ * @active_entities: number of active entities belonging to the group; -+ * unused for the root group. Used to know whether there -+ * are groups with more than one active @bfq_entity -+ * (see the comments to the function -+ * bfq_bfqq_may_idle()). -+ * @rq_pos_tree: rbtree sorted by next_request position, used when -+ * determining if two or more queues have interleaving -+ * requests (see bfq_find_close_cooperator()). -+ * -+ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup -+ * there is a set of bfq_groups, each one collecting the lower-level -+ * entities belonging to the group that are acting on the same device. -+ * -+ * Locking works as follows: -+ * o @bfqd is protected by the queue lock, RCU is used to access it -+ * from the readers. -+ * o All the other fields are protected by the @bfqd queue lock. 
-+ */ -+struct bfq_group { -+ /* must be the first member */ -+ struct blkg_policy_data pd; -+ -+ struct bfq_entity entity; -+ struct bfq_sched_data sched_data; -+ -+ void *bfqd; -+ -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; -+ struct bfq_queue *async_idle_bfqq; -+ -+ struct bfq_entity *my_entity; -+ -+ int active_entities; -+ -+ struct rb_root rq_pos_tree; -+ -+ struct bfqg_stats stats; -+}; -+ -+#else -+struct bfq_group { -+ struct bfq_sched_data sched_data; -+ -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; -+ struct bfq_queue *async_idle_bfqq; -+ -+ struct rb_root rq_pos_tree; -+}; -+#endif -+ -+static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); -+ -+static unsigned int bfq_class_idx(struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ return bfqq ? bfqq->ioprio_class - 1 : -+ BFQ_DEFAULT_GRP_CLASS - 1; -+} -+ -+static struct bfq_service_tree * -+bfq_entity_service_tree(struct bfq_entity *entity) -+{ -+ struct bfq_sched_data *sched_data = entity->sched_data; -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ unsigned int idx = bfq_class_idx(entity); -+ -+ BUG_ON(idx >= BFQ_IOPRIO_CLASSES); -+ BUG_ON(sched_data == NULL); -+ -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "entity_service_tree %p %d", -+ sched_data->service_tree + idx, idx); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "entity_service_tree %p %d", -+ sched_data->service_tree + idx, idx); -+ } -+#endif -+ return sched_data->service_tree + idx; -+} -+ -+static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) -+{ -+ return bic->bfqq[is_sync]; -+} -+ -+static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, -+ bool is_sync) -+{ -+ bic->bfqq[is_sync] = bfqq; -+} -+ -+static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) -+{ -+ return 
bic->icq.q->elevator->elevator_data; -+} -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ -+static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *group_entity = bfqq->entity.parent; -+ -+ if (!group_entity) -+ group_entity = &bfqq->bfqd->root_group->entity; -+ -+ return container_of(group_entity, struct bfq_group, entity); -+} -+ -+#else -+ -+static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) -+{ -+ return bfqq->bfqd->root_group; -+} -+ -+#endif -+ -+static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); -+static void bfq_put_queue(struct bfq_queue *bfqq); -+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); -+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, -+ struct bio *bio, bool is_sync, -+ struct bfq_io_cq *bic); -+static void bfq_end_wr_async_queues(struct bfq_data *bfqd, -+ struct bfq_group *bfqg); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); -+#endif -+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); -+ -+#endif /* _BFQ_H */ -diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h -index 8da66379f7ea..bf000c58644b 100644 ---- a/include/linux/blkdev.h -+++ b/include/linux/blkdev.h -@@ -54,7 +54,7 @@ struct blk_stat_callback; - * Maximum number of blkcg policies allowed to be registered concurrently. - * Defined here to simplify include dependency. 
- */ --#define BLKCG_MAX_POLS 3 -+#define BLKCG_MAX_POLS 4 - - typedef void (rq_end_io_fn)(struct request *, blk_status_t); - - -From 9916fed6c89c61a2b26053be04501784570bbec8 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Thu, 20 Jul 2017 10:46:39 +0200 -Subject: [PATCH 02/51] Add extra checks related to entity scheduling - -- extra checks related to ioprioi-class changes -- specific check on st->idle in __bfq_requeue_entity - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-sched.c | 9 ++++++++- - 1 file changed, 8 insertions(+), 1 deletion(-) - -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index ac8991bca9fa..5ddf9af4261e 100644 ---- a/block/bfq-sched.c -+++ b/block/bfq-sched.c -@@ -812,6 +812,7 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, - } - #endif - -+ BUG_ON(entity->tree && update_class_too); - BUG_ON(old_st->wsum < entity->weight); - old_st->wsum -= entity->weight; - -@@ -883,8 +884,10 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, - - new_st->wsum += entity->weight; - -- if (new_st != old_st) -+ if (new_st != old_st) { -+ BUG_ON(!update_class_too); - entity->start = new_st->vtime; -+ } - } - - return new_st; -@@ -993,6 +996,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, - * tree, then it is safe to invoke next function with the last - * parameter set (see the comments on the function). - */ -+ BUG_ON(entity->tree); - st = __bfq_entity_update_weight_prio(st, entity, true); - bfq_calc_finish(entity, entity->budget); - -@@ -1113,9 +1117,11 @@ static void __bfq_activate_entity(struct bfq_entity *entity, - * check for that. - */ - bfq_idle_extract(st, entity); -+ BUG_ON(entity->tree); - entity->start = bfq_gt(min_vstart, entity->finish) ? 
- min_vstart : entity->finish; - } else { -+ BUG_ON(entity->tree); - /* - * The finish time of the entity may be invalid, and - * it is in the past for sure, otherwise the queue -@@ -1203,6 +1209,7 @@ static void __bfq_requeue_entity(struct bfq_entity *entity) - */ - bfq_calc_finish(entity, entity->service); - entity->start = entity->finish; -+ BUG_ON(entity->tree && entity->tree == &st->idle); - BUG_ON(entity->tree && entity->tree != &st->active); - /* - * In addition, if the entity had more than one child - -From 8f5b2c25dcbe31dda524e85b921b3aa1fe11d111 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Fri, 21 Jul 2017 12:08:57 +0200 -Subject: [PATCH 03/51] block, bfq: reset in_service_entity if it becomes idle - -BFQ implements hierarchical scheduling by representing each group of -queues with a generic parent entity. For each parent entity, BFQ -maintains an in_service_entity pointer: if one of the child entities -happens to be in service, in_service_entity points to it. The -resetting of these pointers happens only on queue expirations: when -the in-service queue is expired, i.e., stops to be the queue in -service, BFQ resets all in_service_entity pointers along the -parent-entity path from this queue to the root entity. - -Functions handling the scheduling of entities assume, naturally, that -in-service entities are active, i.e., have pending I/O requests (or, -as a special case, even if they have no pending requests, they are -expected to receive a new request very soon, with the scheduler idling -the storage device while waiting for such an event). Unfortunately, -the above resetting scheme of the in_service_entity pointers may cause -this assumption to be violated. For example, the in-service queue may -happen to remain without requests because of a request merge. In this -case the queue does become idle, and all related data structures are -updated accordingly. 
But in_service_entity still points to the queue -in the parent entity. This inconsistency may even propagate to -higher-level parent entities, if they happen to become idle as well, -as a consequence of the leaf queue becoming idle. For this queue and -parent entities, scheduling functions have an undefined behaviour, -and, as reported, may easily lead to kernel crashes or hangs. - -This commit addresses this issue by simply resetting the -in_service_entity field also when it is detected to point to an entity -becoming idle (regardless of why the entity becomes idle). - -Reported-by: Laurentiu Nicola <lnicola@dend.ro> -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> -Tested-by: Laurentiu Nicola <lnicola@dend.ro> ---- - block/bfq-sched.c | 4 +++- - 1 file changed, 3 insertions(+), 1 deletion(-) - -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index 5ddf9af4261e..a07a06eb5c72 100644 ---- a/block/bfq-sched.c -+++ b/block/bfq-sched.c -@@ -1336,8 +1336,10 @@ static bool __bfq_deactivate_entity(struct bfq_entity *entity, - - BUG_ON(is_in_service && entity->tree && entity->tree != &st->active); - -- if (is_in_service) -+ if (is_in_service) { - bfq_calc_finish(entity, entity->service); -+ sd->in_service_entity = NULL; -+ } - - if (entity->tree == &st->active) - bfq_active_extract(st, entity); - -From 600ea668e2d340c95724bcf981d88812d6900342 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Fri, 28 Jul 2017 21:09:51 +0200 -Subject: [PATCH 04/51] block, bfq: consider also in_service_entity to state - whether an entity is active - -Groups of BFQ queues are represented by generic entities in BFQ. When -a queue belonging to a parent entity is deactivated, the parent entity -may need to be deactivated too, in case the deactivated queue was the -only active queue for the parent entity. This deactivation may need to -be propagated upwards if the entity belongs, in its turn, to a further -higher-level entity, and so on. 
In particular, the upward propagation -of deactivation stops at the first parent entity that remains active -even if one of its child entities has been deactivated. - -To decide whether the last non-deactivation condition holds for a -parent entity, BFQ checks whether the field next_in_service is still -not NULL for the parent entity, after the deactivation of one of its -child entity. If it is not NULL, then there are certainly other active -entities in the parent entity, and deactivations can stop. - -Unfortunately, this check misses a corner case: if in_service_entity -is not NULL, then next_in_service may happen to be NULL, although the -parent entity is evidently active. This happens if: 1) the entity -pointed by in_service_entity is the only active entity in the parent -entity, and 2) according to the definition of next_in_service, the -in_service_entity cannot be considered as next_in_service. See the -comments on the definition of next_in_service for details on this -second point. - -Hitting the above corner case causes crashes. 
- -To address this issue, this commit: -1) Extends the above check on only next_in_service to controlling both -next_in_service and in_service_entity (if any of them is not NULL, -then no further deactivation is performed) -2) Improves the (important) comments on how next_in_service is defined -and updated; in particular it fixes a few rather obscure paragraphs - -Reported-by: Eric Wheeler <bfq-sched@lists.ewheeler.net> -Reported-by: Rick Yiu <rick_yiu@htc.com> -Reported-by: Tom X Nguyen <tom81094@gmail.com> -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> -Tested-by: Eric Wheeler <bfq-sched@lists.ewheeler.net> -Tested-by: Rick Yiu <rick_yiu@htc.com> -Tested-by: Laurentiu Nicola <lnicola@dend.ro> -Tested-by: Tom X Nguyen <tom81094@gmail.com> ---- - block/bfq-sched.c | 140 ++++++++++++++++++++++++++++++------------------------ - block/bfq.h | 23 +++++++-- - 2 files changed, 95 insertions(+), 68 deletions(-) - -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index a07a06eb5c72..5c0f9290a79c 100644 ---- a/block/bfq-sched.c -+++ b/block/bfq-sched.c -@@ -196,21 +196,23 @@ static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) - - /* - * This function tells whether entity stops being a candidate for next -- * service, according to the following logic. -+ * service, according to the restrictive definition of the field -+ * next_in_service. In particular, this function is invoked for an -+ * entity that is about to be set in service. - * -- * This function is invoked for an entity that is about to be set in -- * service. If such an entity is a queue, then the entity is no longer -- * a candidate for next service (i.e, a candidate entity to serve -- * after the in-service entity is expired). The function then returns -- * true. -+ * If entity is a queue, then the entity is no longer a candidate for -+ * next service according to the that definition, because entity is -+ * about to become the in-service queue. 
This function then returns -+ * true if entity is a queue. - * -- * In contrast, the entity could stil be a candidate for next service -- * if it is not a queue, and has more than one child. In fact, even if -- * one of its children is about to be set in service, other children -- * may still be the next to serve. As a consequence, a non-queue -- * entity is not a candidate for next-service only if it has only one -- * child. And only if this condition holds, then the function returns -- * true for a non-queue entity. -+ * In contrast, entity could still be a candidate for next service if -+ * it is not a queue, and has more than one active child. In fact, -+ * even if one of its children is about to be set in service, other -+ * active children may still be the next to serve, for the parent -+ * entity, even according to the above definition. As a consequence, a -+ * non-queue entity is not a candidate for next-service only if it has -+ * only one active child. And only if this condition holds, then this -+ * function returns true for a non-queue entity. - */ - static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) - { -@@ -223,6 +225,18 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) - - BUG_ON(bfqg == ((struct bfq_data *)(bfqg->bfqd))->root_group); - BUG_ON(bfqg->active_entities == 0); -+ /* -+ * The field active_entities does not always contain the -+ * actual number of active children entities: it happens to -+ * not account for the in-service entity in case the latter is -+ * removed from its active tree (which may get done after -+ * invoking the function bfq_no_longer_next_in_service in -+ * bfq_get_next_queue). Fortunately, here, i.e., while -+ * bfq_no_longer_next_in_service is not yet completed in -+ * bfq_get_next_queue, bfq_active_extract has not yet been -+ * invoked, and thus active_entities still coincides with the -+ * actual number of active entities. 
-+ */ - if (bfqg->active_entities == 1) - return true; - -@@ -1089,7 +1103,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, - * one of its children receives a new request. - * - * Basically, this function updates the timestamps of entity and -- * inserts entity into its active tree, ater possible extracting it -+ * inserts entity into its active tree, ater possibly extracting it - * from its idle tree. - */ - static void __bfq_activate_entity(struct bfq_entity *entity, -@@ -1213,7 +1227,7 @@ static void __bfq_requeue_entity(struct bfq_entity *entity) - BUG_ON(entity->tree && entity->tree != &st->active); - /* - * In addition, if the entity had more than one child -- * when set in service, then was not extracted from -+ * when set in service, then it was not extracted from - * the active tree. This implies that the position of - * the entity in the active tree may need to be - * changed now, because we have just updated the start -@@ -1221,9 +1235,8 @@ static void __bfq_requeue_entity(struct bfq_entity *entity) - * time in a moment (the requeueing is then, more - * precisely, a repositioning in this case). To - * implement this repositioning, we: 1) dequeue the -- * entity here, 2) update the finish time and -- * requeue the entity according to the new -- * timestamps below. -+ * entity here, 2) update the finish time and requeue -+ * the entity according to the new timestamps below. - */ - if (entity->tree) - bfq_active_extract(st, entity); -@@ -1270,9 +1283,9 @@ static void __bfq_activate_requeue_entity(struct bfq_entity *entity, - - - /** -- * bfq_activate_entity - activate or requeue an entity representing a bfq_queue, -- * and activate, requeue or reposition all ancestors -- * for which such an update becomes necessary. -+ * bfq_activate_requeue_entity - activate or requeue an entity representing a bfq_queue, -+ * and activate, requeue or reposition all ancestors -+ * for which such an update becomes necessary. 
- * @entity: the entity to activate. - * @non_blocking_wait_rq: true if this entity was waiting for a request - * @requeue: true if this is a requeue, which implies that bfqq is -@@ -1308,9 +1321,9 @@ static void bfq_activate_requeue_entity(struct bfq_entity *entity, - * @ins_into_idle_tree: if false, the entity will not be put into the - * idle tree. - * -- * Deactivates an entity, independently from its previous state. Must -+ * Deactivates an entity, independently of its previous state. Must - * be invoked only if entity is on a service tree. Extracts the entity -- * from that tree, and if necessary and allowed, puts it on the idle -+ * from that tree, and if necessary and allowed, puts it into the idle - * tree. - */ - static bool __bfq_deactivate_entity(struct bfq_entity *entity, -@@ -1359,7 +1372,7 @@ static bool __bfq_deactivate_entity(struct bfq_entity *entity, - /** - * bfq_deactivate_entity - deactivate an entity representing a bfq_queue. - * @entity: the entity to deactivate. -- * @ins_into_idle_tree: true if the entity can be put on the idle tree -+ * @ins_into_idle_tree: true if the entity can be put into the idle tree - */ - static void bfq_deactivate_entity(struct bfq_entity *entity, - bool ins_into_idle_tree, -@@ -1406,16 +1419,29 @@ static void bfq_deactivate_entity(struct bfq_entity *entity, - */ - bfq_update_next_in_service(sd, NULL); - -- if (sd->next_in_service) { -+ if (sd->next_in_service || sd->in_service_entity) { - /* -- * The parent entity is still backlogged, -- * because next_in_service is not NULL. So, no -- * further upwards deactivation must be -- * performed. Yet, next_in_service has -- * changed. Then the schedule does need to be -- * updated upwards. -+ * The parent entity is still active, because -+ * either next_in_service or in_service_entity -+ * is not NULL. So, no further upwards -+ * deactivation must be performed. Yet, -+ * next_in_service has changed. Then the -+ * schedule does need to be updated upwards. 
-+ * -+ * NOTE If in_service_entity is not NULL, then -+ * next_in_service may happen to be NULL, -+ * although the parent entity is evidently -+ * active. This happens if 1) the entity -+ * pointed by in_service_entity is the only -+ * active entity in the parent entity, and 2) -+ * according to the definition of -+ * next_in_service, the in_service_entity -+ * cannot be considered as -+ * next_in_service. See the comments on the -+ * definition of next_in_service for details. - */ - BUG_ON(sd->next_in_service == entity); -+ BUG_ON(sd->in_service_entity == entity); - break; - } - -@@ -1806,45 +1832,33 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) - - /* - * If entity is no longer a candidate for next -- * service, then we extract it from its active tree, -- * for the following reason. To further boost the -- * throughput in some special case, BFQ needs to know -- * which is the next candidate entity to serve, while -- * there is already an entity in service. In this -- * respect, to make it easy to compute/update the next -- * candidate entity to serve after the current -- * candidate has been set in service, there is a case -- * where it is necessary to extract the current -- * candidate from its service tree. Such a case is -- * when the entity just set in service cannot be also -- * a candidate for next service. Details about when -- * this conditions holds are reported in the comments -- * on the function bfq_no_longer_next_in_service() -- * invoked below. -+ * service, then it must be extracted from its active -+ * tree, so as to make sure that it won't be -+ * considered when computing next_in_service. See the -+ * comments on the function -+ * bfq_no_longer_next_in_service() for details. 
- */ - if (bfq_no_longer_next_in_service(entity)) - bfq_active_extract(bfq_entity_service_tree(entity), - entity); - - /* -- * For the same reason why we may have just extracted -- * entity from its active tree, we may need to update -- * next_in_service for the sched_data of entity too, -- * regardless of whether entity has been extracted. -- * In fact, even if entity has not been extracted, a -- * descendant entity may get extracted. Such an event -- * would cause a change in next_in_service for the -- * level of the descendant entity, and thus possibly -- * back to upper levels. -+ * Even if entity is not to be extracted according to -+ * the above check, a descendant entity may get -+ * extracted in one of the next iterations of this -+ * loop. Such an event could cause a change in -+ * next_in_service for the level of the descendant -+ * entity, and thus possibly back to this level. - * -- * We cannot perform the resulting needed update -- * before the end of this loop, because, to know which -- * is the correct next-to-serve candidate entity for -- * each level, we need first to find the leaf entity -- * to set in service. In fact, only after we know -- * which is the next-to-serve leaf entity, we can -- * discover whether the parent entity of the leaf -- * entity becomes the next-to-serve, and so on. -+ * However, we cannot perform the resulting needed -+ * update of next_in_service for this level before the -+ * end of the whole loop, because, to know which is -+ * the correct next-to-serve candidate entity for each -+ * level, we need first to find the leaf entity to set -+ * in service. In fact, only after we know which is -+ * the next-to-serve leaf entity, we can discover -+ * whether the parent entity of the leaf entity -+ * becomes the next-to-serve, and so on. 
- */ - - /* Log some information */ -diff --git a/block/bfq.h b/block/bfq.h -index f5751ea59d98..ebd9688b9f61 100644 ---- a/block/bfq.h -+++ b/block/bfq.h -@@ -68,17 +68,30 @@ struct bfq_service_tree { - * - * bfq_sched_data is the basic scheduler queue. It supports three - * ioprio_classes, and can be used either as a toplevel queue or as an -- * intermediate queue on a hierarchical setup. @next_in_service -- * points to the active entity of the sched_data service trees that -- * will be scheduled next. It is used to reduce the number of steps -- * needed for each hierarchical-schedule update. -+ * intermediate queue in a hierarchical setup. - * - * The supported ioprio_classes are the same as in CFQ, in descending - * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. - * Requests from higher priority queues are served before all the - * requests from lower priority queues; among requests of the same - * queue requests are served according to B-WF2Q+. -- * All the fields are protected by the queue lock of the containing bfqd. -+ * -+ * The schedule is implemented by the service trees, plus the field -+ * @next_in_service, which points to the entity on the active trees -+ * that will be served next, if 1) no changes in the schedule occurs -+ * before the current in-service entity is expired, 2) the in-service -+ * queue becomes idle when it expires, and 3) if the entity pointed by -+ * in_service_entity is not a queue, then the in-service child entity -+ * of the entity pointed by in_service_entity becomes idle on -+ * expiration. This peculiar definition allows for the following -+ * optimization, not yet exploited: while a given entity is still in -+ * service, we already know which is the best candidate for next -+ * service among the other active entitities in the same parent -+ * entity. We can then quickly compare the timestamps of the -+ * in-service entity with those of such best candidate. 
-+ * -+ * All the fields are protected by the queue lock of the containing -+ * bfqd. - */ - struct bfq_sched_data { - struct bfq_entity *in_service_entity; /* entity in service */ - -From 6b5effd10bc6711a862e7cbd7cd2dd0146defa01 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Thu, 4 May 2017 10:53:43 +0200 -Subject: [PATCH 05/51] block, bfq: improve and refactor throughput-boosting - logic - -When a queue associated with a process remains empty, there are cases -where throughput gets boosted if the device is idled to await the -arrival of a new I/O request for that queue. Currently, BFQ assumes -that one of these cases is when the device has no internal queueing -(regardless of the properties of the I/O being served). Unfortunately, -this condition has proved to be too general. So, this commit refines it -as "the device has no internal queueing and is rotational". - -This refinement provides a significant throughput boost with random -I/O, on flash-based storage without internal queueing. For example, on -a HiKey board, throughput increases by up to 125%, growing, e.g., from -6.9MB/s to 15.6MB/s with two or three random readers in parallel. - -This commit also refactors the code related to device idling, for the -following reason. Finding the change that provides the above large -improvement has been slightly more difficult than it had to be, -because the logic that decides whether to idle the device is still -scattered across three functions. Almost all of the logic is in the -function bfq_bfqq_may_idle, but (1) part of the decision is made in -bfq_update_idle_window, and (2) the function bfq_bfqq_must_idle may -switch off idling regardless of the output of bfq_bfqq_may_idle. In -addition, both bfq_update_idle_window and bfq_bfqq_must_idle make -their decisions as a function of parameters that are used, for similar -purposes, also in bfq_bfqq_may_idle. 
This commit addresses this issue -by moving all the logic into bfq_bfqq_may_idle. - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> -Signed-off-by: Luca Miccio <lucmiccio@gmail.com> ---- - block/bfq-sq-iosched.c | 141 +++++++++++++++++++++++++++---------------------- - block/bfq.h | 12 ++--- - 2 files changed, 83 insertions(+), 70 deletions(-) - -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index 65e7c7e77f3c..30d019fc67e0 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -684,10 +684,10 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, - unsigned int old_wr_coeff; - bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq); - -- if (bic->saved_idle_window) -- bfq_mark_bfqq_idle_window(bfqq); -+ if (bic->saved_has_short_ttime) -+ bfq_mark_bfqq_has_short_ttime(bfqq); - else -- bfq_clear_bfqq_idle_window(bfqq); -+ bfq_clear_bfqq_has_short_ttime(bfqq); - - if (bic->saved_IO_bound) - bfq_mark_bfqq_IO_bound(bfqq); -@@ -2047,7 +2047,7 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) - if (!bic) - return; - -- bic->saved_idle_window = bfq_bfqq_idle_window(bfqq); -+ bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq); - bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); - bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); - bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); -@@ -3214,9 +3214,9 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, - } - - bfq_log_bfqq(bfqd, bfqq, -- "expire (%d, slow %d, num_disp %d, idle_win %d, weight %d)", -+ "expire (%d, slow %d, num_disp %d, short_ttime %d, weight %d)", - reason, slow, bfqq->dispatched, -- bfq_bfqq_idle_window(bfqq), entity->weight); -+ bfq_bfqq_has_short_ttime(bfqq), entity->weight); - - /* - * Increase, decrease or leave budget unchanged according to -@@ -3298,7 +3298,10 @@ static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) - static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) - { - struct 
bfq_data *bfqd = bfqq->bfqd; -- bool idling_boosts_thr, idling_boosts_thr_without_issues, -+ bool rot_without_queueing = -+ !blk_queue_nonrot(bfqd->queue) && !bfqd->hw_tag, -+ bfqq_sequential_and_IO_bound, -+ idling_boosts_thr, idling_boosts_thr_without_issues, - idling_needed_for_service_guarantees, - asymmetric_scenario; - -@@ -3306,27 +3309,44 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) - return true; - - /* -+ * Idling is performed only if slice_idle > 0. In addition, we -+ * do not idle if -+ * (a) bfqq is async -+ * (b) bfqq is in the idle io prio class: in this case we do -+ * not idle because we want to minimize the bandwidth that -+ * queues in this class can steal to higher-priority queues -+ */ -+ if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) || -+ bfq_class_idle(bfqq)) -+ return false; -+ -+ bfqq_sequential_and_IO_bound = !BFQQ_SEEKY(bfqq) && -+ bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_has_short_ttime(bfqq); -+ /* - * The next variable takes into account the cases where idling - * boosts the throughput. - * - * The value of the variable is computed considering, first, that - * idling is virtually always beneficial for the throughput if: -- * (a) the device is not NCQ-capable, or -- * (b) regardless of the presence of NCQ, the device is rotational -- * and the request pattern for bfqq is I/O-bound and sequential. -+ * (a) the device is not NCQ-capable and rotational, or -+ * (b) regardless of the presence of NCQ, the device is rotational and -+ * the request pattern for bfqq is I/O-bound and sequential, or -+ * (c) regardless of whether it is rotational, the device is -+ * not NCQ-capable and the request pattern for bfqq is -+ * I/O-bound and sequential. - * - * Secondly, and in contrast to the above item (b), idling an - * NCQ-capable flash-based device would not boost the - * throughput even with sequential I/O; rather it would lower - * the throughput in proportion to how fast the device - * is. 
Accordingly, the next variable is true if any of the -- * above conditions (a) and (b) is true, and, in particular, -- * happens to be false if bfqd is an NCQ-capable flash-based -- * device. -+ * above conditions (a), (b) or (c) is true, and, in -+ * particular, happens to be false if bfqd is an NCQ-capable -+ * flash-based device. - */ -- idling_boosts_thr = !bfqd->hw_tag || -- (!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq) && -- bfq_bfqq_idle_window(bfqq)); -+ idling_boosts_thr = rot_without_queueing || -+ ((!blk_queue_nonrot(bfqd->queue) || !bfqd->hw_tag) && -+ bfqq_sequential_and_IO_bound); - - /* - * The value of the next variable, -@@ -3497,12 +3517,10 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) - asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq); - - /* -- * We have now all the components we need to compute the return -- * value of the function, which is true only if both the following -- * conditions hold: -- * 1) bfqq is sync, because idling make sense only for sync queues; -- * 2) idling either boosts the throughput (without issues), or -- * is necessary to preserve service guarantees. -+ * We have now all the components we need to compute the -+ * return value of the function, which is true only if idling -+ * either boosts the throughput (without issues), or is -+ * necessary to preserve service guarantees. 
- */ - bfq_log_bfqq(bfqd, bfqq, "may_idle: sync %d idling_boosts_thr %d", - bfq_bfqq_sync(bfqq), idling_boosts_thr); -@@ -3514,9 +3532,8 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) - bfq_bfqq_IO_bound(bfqq), - idling_needed_for_service_guarantees); - -- return bfq_bfqq_sync(bfqq) && -- (idling_boosts_thr_without_issues || -- idling_needed_for_service_guarantees); -+ return idling_boosts_thr_without_issues || -+ idling_needed_for_service_guarantees; - } - - /* -@@ -3532,10 +3549,7 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) - */ - static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) - { -- struct bfq_data *bfqd = bfqq->bfqd; -- -- return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 && -- bfq_bfqq_may_idle(bfqq); -+ return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_may_idle(bfqq); - } - - /* -@@ -3994,7 +4008,6 @@ static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, - case IOPRIO_CLASS_IDLE: - bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE; - bfqq->new_ioprio = 7; -- bfq_clear_bfqq_idle_window(bfqq); - break; - } - -@@ -4058,8 +4071,14 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bfq_set_next_ioprio_data(bfqq, bic); - - if (is_sync) { -+ /* -+ * No need to mark as has_short_ttime if in -+ * idle_class, because no device idling is performed -+ * for queues in idle class -+ */ - if (!bfq_class_idle(bfqq)) -- bfq_mark_bfqq_idle_window(bfqq); -+ /* tentatively mark as has_short_ttime */ -+ bfq_mark_bfqq_has_short_ttime(bfqq); - bfq_mark_bfqq_sync(bfqq); - bfq_mark_bfqq_just_created(bfqq); - } else -@@ -4195,18 +4214,19 @@ bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, - blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT); - } - --/* -- * Disable idle window if the process thinks too long or seeks so much that -- * it doesn't matter. 
-- */ --static void bfq_update_idle_window(struct bfq_data *bfqd, -- struct bfq_queue *bfqq, -- struct bfq_io_cq *bic) -+static void bfq_update_has_short_ttime(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ struct bfq_io_cq *bic) - { -- int enable_idle; -+ bool has_short_ttime = true; - -- /* Don't idle for async or idle io prio class. */ -- if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) -+ /* -+ * No need to update has_short_ttime if bfqq is async or in -+ * idle io prio class, or if bfq_slice_idle is zero, because -+ * no device idling is performed for bfqq in this case. -+ */ -+ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq) || -+ bfqd->bfq_slice_idle == 0) - return; - - /* Idle window just restored, statistics are meaningless. */ -@@ -4214,27 +4234,22 @@ static void bfq_update_idle_window(struct bfq_data *bfqd, - bfqd->bfq_wr_min_idle_time)) - return; - -- enable_idle = bfq_bfqq_idle_window(bfqq); -- -+ /* Think time is infinite if no process is linked to -+ * bfqq. Otherwise check average think time to -+ * decide whether to mark as has_short_ttime -+ */ - if (atomic_read(&bic->icq.ioc->active_ref) == 0 || -- bfqd->bfq_slice_idle == 0 || -- (bfqd->hw_tag && BFQQ_SEEKY(bfqq) && -- bfqq->wr_coeff == 1)) -- enable_idle = 0; -- else if (bfq_sample_valid(bic->ttime.ttime_samples)) { -- if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle && -- bfqq->wr_coeff == 1) -- enable_idle = 0; -- else -- enable_idle = 1; -- } -- bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d", -- enable_idle); -+ (bfq_sample_valid(bic->ttime.ttime_samples) && -+ bic->ttime.ttime_mean > bfqd->bfq_slice_idle)) -+ has_short_ttime = false; -+ -+ bfq_log_bfqq(bfqd, bfqq, "update_has_short_ttime: has_short_ttime %d", -+ has_short_ttime); - -- if (enable_idle) -- bfq_mark_bfqq_idle_window(bfqq); -+ if (has_short_ttime) -+ bfq_mark_bfqq_has_short_ttime(bfqq); - else -- bfq_clear_bfqq_idle_window(bfqq); -+ bfq_clear_bfqq_has_short_ttime(bfqq); - } - - /* -@@ -4250,14 +4265,12 @@ 
static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bfqq->meta_pending++; - - bfq_update_io_thinktime(bfqd, bic); -+ bfq_update_has_short_ttime(bfqd, bfqq, bic); - bfq_update_io_seektime(bfqd, bfqq, rq); -- if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || -- !BFQQ_SEEKY(bfqq)) -- bfq_update_idle_window(bfqd, bfqq, bic); - - bfq_log_bfqq(bfqd, bfqq, -- "rq_enqueued: idle_window=%d (seeky %d)", -- bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq)); -+ "rq_enqueued: has_short_ttime=%d (seeky %d)", -+ bfq_bfqq_has_short_ttime(bfqq), BFQQ_SEEKY(bfqq)); - - bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); - -diff --git a/block/bfq.h b/block/bfq.h -index ebd9688b9f61..34fc4697fd89 100644 ---- a/block/bfq.h -+++ b/block/bfq.h -@@ -349,11 +349,11 @@ struct bfq_io_cq { - #endif - - /* -- * Snapshot of the idle window before merging; taken to -- * remember this value while the queue is merged, so as to be -- * able to restore it in case of split. -+ * Snapshot of the has_short_time flag before merging; taken -+ * to remember its value while the queue is merged, so as to -+ * be able to restore it in case of split. - */ -- bool saved_idle_window; -+ bool saved_has_short_ttime; - /* - * Same purpose as the previous two fields for the I/O bound - * classification of a queue. 
-@@ -610,7 +610,7 @@ enum bfqq_state_flags { - */ - BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ - BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ -- BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */ -+ BFQ_BFQQ_FLAG_has_short_ttime, /* queue has a short think time */ - BFQ_BFQQ_FLAG_sync, /* synchronous queue */ - BFQ_BFQQ_FLAG_IO_bound, /* - * bfqq has timed-out at least once -@@ -649,7 +649,7 @@ BFQ_BFQQ_FNS(wait_request); - BFQ_BFQQ_FNS(non_blocking_wait_rq); - BFQ_BFQQ_FNS(must_alloc); - BFQ_BFQQ_FNS(fifo_expire); --BFQ_BFQQ_FNS(idle_window); -+BFQ_BFQQ_FNS(has_short_ttime); - BFQ_BFQQ_FNS(sync); - BFQ_BFQQ_FNS(IO_bound); - BFQ_BFQQ_FNS(in_large_burst); - -From b5e746fa99d961a5642cffb27c19a77e8b638007 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Mon, 19 Dec 2016 16:59:33 +0100 -Subject: [PATCH 06/51] FIRST BFQ-MQ COMMIT: Copy bfq-sq-iosched.c as - bfq-mq-iosched.c - -This commit introduces bfq-mq-iosched.c, the main source file that -will contain the code of bfq for blk-mq. I name tentatively -bfq-mq this version of bfq. - -For the moment, the file bfq-mq-iosched.c is just a copy of -bfq-sq-iosched.c, i.e, of the main source file of bfq for blk. - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 5392 ++++++++++++++++++++++++++++++++++++++++++++++++ - 1 file changed, 5392 insertions(+) - create mode 100644 block/bfq-mq-iosched.c - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -new file mode 100644 -index 000000000000..30d019fc67e0 ---- /dev/null -+++ b/block/bfq-mq-iosched.c -@@ -0,0 +1,5392 @@ -+/* -+ * Budget Fair Queueing (BFQ) I/O scheduler. 
-+ * -+ * Based on ideas and code from CFQ: -+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> -+ * -+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> -+ * Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org> -+ * -+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ -+ * file. -+ * -+ * BFQ is a proportional-share I/O scheduler, with some extra -+ * low-latency capabilities. BFQ also supports full hierarchical -+ * scheduling through cgroups. Next paragraphs provide an introduction -+ * on BFQ inner workings. Details on BFQ benefits and usage can be -+ * found in Documentation/block/bfq-iosched.txt. -+ * -+ * BFQ is a proportional-share storage-I/O scheduling algorithm based -+ * on the slice-by-slice service scheme of CFQ. But BFQ assigns -+ * budgets, measured in number of sectors, to processes instead of -+ * time slices. The device is not granted to the in-service process -+ * for a given time slice, but until it has exhausted its assigned -+ * budget. This change from the time to the service domain enables BFQ -+ * to distribute the device throughput among processes as desired, -+ * without any distortion due to throughput fluctuations, or to device -+ * internal queueing. BFQ uses an ad hoc internal scheduler, called -+ * B-WF2Q+, to schedule processes according to their budgets. More -+ * precisely, BFQ schedules queues associated with processes. Thanks to -+ * the accurate policy of B-WF2Q+, BFQ can afford to assign high -+ * budgets to I/O-bound processes issuing sequential requests (to -+ * boost the throughput), and yet guarantee a low latency to -+ * interactive and soft real-time applications. 
-+ * -+ * NOTE: if the main or only goal, with a given device, is to achieve -+ * the maximum-possible throughput at all times, then do switch off -+ * all low-latency heuristics for that device, by setting low_latency -+ * to 0. -+ * -+ * BFQ is described in [1], where also a reference to the initial, more -+ * theoretical paper on BFQ can be found. The interested reader can find -+ * in the latter paper full details on the main algorithm, as well as -+ * formulas of the guarantees and formal proofs of all the properties. -+ * With respect to the version of BFQ presented in these papers, this -+ * implementation adds a few more heuristics, such as the one that -+ * guarantees a low latency to soft real-time applications, and a -+ * hierarchical extension based on H-WF2Q+. -+ * -+ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with -+ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N) -+ * complexity derives from the one introduced with EEVDF in [3]. -+ * -+ * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O -+ * Scheduler", Proceedings of the First Workshop on Mobile System -+ * Technologies (MST-2015), May 2015. -+ * http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf -+ * -+ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf -+ * -+ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing -+ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689, -+ * Oct 1997. -+ * -+ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz -+ * -+ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline -+ * First: A Flexible and Accurate Mechanism for Proportional Share -+ * Resource Allocation,'' technical report. 
-+ * -+ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf -+ */ -+#include <linux/module.h> -+#include <linux/slab.h> -+#include <linux/blkdev.h> -+#include <linux/cgroup.h> -+#include <linux/elevator.h> -+#include <linux/jiffies.h> -+#include <linux/rbtree.h> -+#include <linux/ioprio.h> -+#include "blk.h" -+#include "bfq.h" -+ -+/* Expiration time of sync (0) and async (1) requests, in ns. */ -+static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; -+ -+/* Maximum backwards seek, in KiB. */ -+static const int bfq_back_max = (16 * 1024); -+ -+/* Penalty of a backwards seek, in number of sectors. */ -+static const int bfq_back_penalty = 2; -+ -+/* Idling period duration, in ns. */ -+static u32 bfq_slice_idle = (NSEC_PER_SEC / 125); -+ -+/* Minimum number of assigned budgets for which stats are safe to compute. */ -+static const int bfq_stats_min_budgets = 194; -+ -+/* Default maximum budget values, in sectors and number of requests. */ -+static const int bfq_default_max_budget = (16 * 1024); -+ -+/* -+ * Async to sync throughput distribution is controlled as follows: -+ * when an async request is served, the entity is charged the number -+ * of sectors of the request, multiplied by the factor below -+ */ -+static const int bfq_async_charge_factor = 10; -+ -+/* Default timeout values, in jiffies, approximating CFQ defaults. */ -+static const int bfq_timeout = (HZ / 8); -+ -+static struct kmem_cache *bfq_pool; -+ -+/* Below this threshold (in ns), we consider thinktime immediate. */ -+#define BFQ_MIN_TT (2 * NSEC_PER_MSEC) -+ -+/* hw_tag detection: parallel requests threshold and min samples needed. 
*/ -+#define BFQ_HW_QUEUE_THRESHOLD 4 -+#define BFQ_HW_QUEUE_SAMPLES 32 -+ -+#define BFQQ_SEEK_THR (sector_t)(8 * 100) -+#define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32) -+#define BFQQ_CLOSE_THR (sector_t)(8 * 1024) -+#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8) -+ -+/* Min number of samples required to perform peak-rate update */ -+#define BFQ_RATE_MIN_SAMPLES 32 -+/* Min observation time interval required to perform a peak-rate update (ns) */ -+#define BFQ_RATE_MIN_INTERVAL (300*NSEC_PER_MSEC) -+/* Target observation time interval for a peak-rate update (ns) */ -+#define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC -+ -+/* Shift used for peak rate fixed precision calculations. */ -+#define BFQ_RATE_SHIFT 16 -+ -+/* -+ * By default, BFQ computes the duration of the weight raising for -+ * interactive applications automatically, using the following formula: -+ * duration = (R / r) * T, where r is the peak rate of the device, and -+ * R and T are two reference parameters. -+ * In particular, R is the peak rate of the reference device (see below), -+ * and T is a reference time: given the systems that are likely to be -+ * installed on the reference device according to its speed class, T is -+ * about the maximum time needed, under BFQ and while reading two files in -+ * parallel, to load typical large applications on these systems. -+ * In practice, the slower/faster the device at hand is, the more/less it -+ * takes to load applications with respect to the reference device. -+ * Accordingly, the longer/shorter BFQ grants weight raising to interactive -+ * applications. -+ * -+ * BFQ uses four different reference pairs (R, T), depending on: -+ * . whether the device is rotational or non-rotational; -+ * . whether the device is slow, such as old or portable HDDs, as well as -+ * SD cards, or fast, such as newer HDDs and SSDs. 
-+ * -+ * The device's speed class is dynamically (re)detected in -+ * bfq_update_peak_rate() every time the estimated peak rate is updated. -+ * -+ * In the following definitions, R_slow[0]/R_fast[0] and -+ * T_slow[0]/T_fast[0] are the reference values for a slow/fast -+ * rotational device, whereas R_slow[1]/R_fast[1] and -+ * T_slow[1]/T_fast[1] are the reference values for a slow/fast -+ * non-rotational device. Finally, device_speed_thresh are the -+ * thresholds used to switch between speed classes. The reference -+ * rates are not the actual peak rates of the devices used as a -+ * reference, but slightly lower values. The reason for using these -+ * slightly lower values is that the peak-rate estimator tends to -+ * yield slightly lower values than the actual peak rate (it can yield -+ * the actual peak rate only if there is only one process doing I/O, -+ * and the process does sequential I/O). -+ * -+ * Both the reference peak rates and the thresholds are measured in -+ * sectors/usec, left-shifted by BFQ_RATE_SHIFT. -+ */ -+static int R_slow[2] = {1000, 10700}; -+static int R_fast[2] = {14000, 33000}; -+/* -+ * To improve readability, a conversion function is used to initialize the -+ * following arrays, which entails that they can be initialized only in a -+ * function. 
-+ */ -+static int T_slow[2]; -+static int T_fast[2]; -+static int device_speed_thresh[2]; -+ -+#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ -+ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) -+ -+#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0]) -+#define RQ_BFQQ(rq) ((rq)->elv.priv[1]) -+ -+static void bfq_schedule_dispatch(struct bfq_data *bfqd); -+ -+#include "bfq-ioc.c" -+#include "bfq-sched.c" -+#include "bfq-cgroup-included.c" -+ -+#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE) -+#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT) -+ -+#define bfq_sample_valid(samples) ((samples) > 80) -+ -+/* -+ * Scheduler run of queue, if there are requests pending and no one in the -+ * driver that will restart queueing. -+ */ -+static void bfq_schedule_dispatch(struct bfq_data *bfqd) -+{ -+ if (bfqd->queued != 0) { -+ bfq_log(bfqd, "schedule dispatch"); -+ kblockd_schedule_work(&bfqd->unplug_work); -+ } -+} -+ -+/* -+ * Lifted from AS - choose which of rq1 and rq2 that is best served now. -+ * We choose the request that is closesr to the head right now. Distance -+ * behind the head is penalized and only allowed to a certain extent. -+ */ -+static struct request *bfq_choose_req(struct bfq_data *bfqd, -+ struct request *rq1, -+ struct request *rq2, -+ sector_t last) -+{ -+ sector_t s1, s2, d1 = 0, d2 = 0; -+ unsigned long back_max; -+#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */ -+#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */ -+ unsigned int wrap = 0; /* bit mask: requests behind the disk head? 
*/ -+ -+ if (!rq1 || rq1 == rq2) -+ return rq2; -+ if (!rq2) -+ return rq1; -+ -+ if (rq_is_sync(rq1) && !rq_is_sync(rq2)) -+ return rq1; -+ else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) -+ return rq2; -+ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) -+ return rq1; -+ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META)) -+ return rq2; -+ -+ s1 = blk_rq_pos(rq1); -+ s2 = blk_rq_pos(rq2); -+ -+ /* -+ * By definition, 1KiB is 2 sectors. -+ */ -+ back_max = bfqd->bfq_back_max * 2; -+ -+ /* -+ * Strict one way elevator _except_ in the case where we allow -+ * short backward seeks which are biased as twice the cost of a -+ * similar forward seek. -+ */ -+ if (s1 >= last) -+ d1 = s1 - last; -+ else if (s1 + back_max >= last) -+ d1 = (last - s1) * bfqd->bfq_back_penalty; -+ else -+ wrap |= BFQ_RQ1_WRAP; -+ -+ if (s2 >= last) -+ d2 = s2 - last; -+ else if (s2 + back_max >= last) -+ d2 = (last - s2) * bfqd->bfq_back_penalty; -+ else -+ wrap |= BFQ_RQ2_WRAP; -+ -+ /* Found required data */ -+ -+ /* -+ * By doing switch() on the bit mask "wrap" we avoid having to -+ * check two variables for all permutations: --> faster! -+ */ -+ switch (wrap) { -+ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ -+ if (d1 < d2) -+ return rq1; -+ else if (d2 < d1) -+ return rq2; -+ -+ if (s1 >= s2) -+ return rq1; -+ else -+ return rq2; -+ -+ case BFQ_RQ2_WRAP: -+ return rq1; -+ case BFQ_RQ1_WRAP: -+ return rq2; -+ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */ -+ default: -+ /* -+ * Since both rqs are wrapped, -+ * start with the one that's further behind head -+ * (--> only *one* back seek required), -+ * since back seek takes more time than forward. 
-+ */ -+ if (s1 <= s2) -+ return rq1; -+ else -+ return rq2; -+ } -+} -+ -+static struct bfq_queue * -+bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, -+ sector_t sector, struct rb_node **ret_parent, -+ struct rb_node ***rb_link) -+{ -+ struct rb_node **p, *parent; -+ struct bfq_queue *bfqq = NULL; -+ -+ parent = NULL; -+ p = &root->rb_node; -+ while (*p) { -+ struct rb_node **n; -+ -+ parent = *p; -+ bfqq = rb_entry(parent, struct bfq_queue, pos_node); -+ -+ /* -+ * Sort strictly based on sector. Smallest to the left, -+ * largest to the right. -+ */ -+ if (sector > blk_rq_pos(bfqq->next_rq)) -+ n = &(*p)->rb_right; -+ else if (sector < blk_rq_pos(bfqq->next_rq)) -+ n = &(*p)->rb_left; -+ else -+ break; -+ p = n; -+ bfqq = NULL; -+ } -+ -+ *ret_parent = parent; -+ if (rb_link) -+ *rb_link = p; -+ -+ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d", -+ (unsigned long long) sector, -+ bfqq ? bfqq->pid : 0); -+ -+ return bfqq; -+} -+ -+static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct rb_node **p, *parent; -+ struct bfq_queue *__bfqq; -+ -+ if (bfqq->pos_root) { -+ rb_erase(&bfqq->pos_node, bfqq->pos_root); -+ bfqq->pos_root = NULL; -+ } -+ -+ if (bfq_class_idle(bfqq)) -+ return; -+ if (!bfqq->next_rq) -+ return; -+ -+ bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, -+ blk_rq_pos(bfqq->next_rq), &parent, &p); -+ if (!__bfqq) { -+ rb_link_node(&bfqq->pos_node, parent, p); -+ rb_insert_color(&bfqq->pos_node, bfqq->pos_root); -+ } else -+ bfqq->pos_root = NULL; -+} -+ -+/* -+ * Tell whether there are active queues or groups with differentiated weights. -+ */ -+static bool bfq_differentiated_weights(struct bfq_data *bfqd) -+{ -+ /* -+ * For weights to differ, at least one of the trees must contain -+ * at least two nodes. 
-+ */ -+ return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && -+ (bfqd->queue_weights_tree.rb_node->rb_left || -+ bfqd->queue_weights_tree.rb_node->rb_right) -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ ) || -+ (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) && -+ (bfqd->group_weights_tree.rb_node->rb_left || -+ bfqd->group_weights_tree.rb_node->rb_right) -+#endif -+ ); -+} -+ -+/* -+ * The following function returns true if every queue must receive the -+ * same share of the throughput (this condition is used when deciding -+ * whether idling may be disabled, see the comments in the function -+ * bfq_bfqq_may_idle()). -+ * -+ * Such a scenario occurs when: -+ * 1) all active queues have the same weight, -+ * 2) all active groups at the same level in the groups tree have the same -+ * weight, -+ * 3) all active groups at the same level in the groups tree have the same -+ * number of children. -+ * -+ * Unfortunately, keeping the necessary state for evaluating exactly the -+ * above symmetry conditions would be quite complex and time-consuming. -+ * Therefore this function evaluates, instead, the following stronger -+ * sub-conditions, for which it is much easier to maintain the needed -+ * state: -+ * 1) all active queues have the same weight, -+ * 2) all active groups have the same weight, -+ * 3) all active groups have at most one active child each. -+ * In particular, the last two conditions are always true if hierarchical -+ * support and the cgroups interface are not enabled, thus no state needs -+ * to be maintained in this case. -+ */ -+static bool bfq_symmetric_scenario(struct bfq_data *bfqd) -+{ -+ return !bfq_differentiated_weights(bfqd); -+} -+ -+/* -+ * If the weight-counter tree passed as input contains no counter for -+ * the weight of the input entity, then add that counter; otherwise just -+ * increment the existing counter. -+ * -+ * Note that weight-counter trees contain few nodes in mostly symmetric -+ * scenarios. 
For example, if all queues have the same weight, then the -+ * weight-counter tree for the queues may contain at most one node. -+ * This holds even if low_latency is on, because weight-raised queues -+ * are not inserted in the tree. -+ * In most scenarios, the rate at which nodes are created/destroyed -+ * should be low too. -+ */ -+static void bfq_weights_tree_add(struct bfq_data *bfqd, -+ struct bfq_entity *entity, -+ struct rb_root *root) -+{ -+ struct rb_node **new = &(root->rb_node), *parent = NULL; -+ -+ /* -+ * Do not insert if the entity is already associated with a -+ * counter, which happens if: -+ * 1) the entity is associated with a queue, -+ * 2) a request arrival has caused the queue to become both -+ * non-weight-raised, and hence change its weight, and -+ * backlogged; in this respect, each of the two events -+ * causes an invocation of this function, -+ * 3) this is the invocation of this function caused by the -+ * second event. This second invocation is actually useless, -+ * and we handle this fact by exiting immediately. More -+ * efficient or clearer solutions might possibly be adopted. -+ */ -+ if (entity->weight_counter) -+ return; -+ -+ while (*new) { -+ struct bfq_weight_counter *__counter = container_of(*new, -+ struct bfq_weight_counter, -+ weights_node); -+ parent = *new; -+ -+ if (entity->weight == __counter->weight) { -+ entity->weight_counter = __counter; -+ goto inc_counter; -+ } -+ if (entity->weight < __counter->weight) -+ new = &((*new)->rb_left); -+ else -+ new = &((*new)->rb_right); -+ } -+ -+ entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter), -+ GFP_ATOMIC); -+ -+ /* -+ * In the unlucky event of an allocation failure, we just -+ * exit. This will cause the weight of entity to not be -+ * considered in bfq_differentiated_weights, which, in its -+ * turn, causes the scenario to be deemed wrongly symmetric in -+ * case entity's weight would have been the only weight making -+ * the scenario asymmetric. 
On the bright side, no unbalance -+ * will however occur when entity becomes inactive again (the -+ * invocation of this function is triggered by an activation -+ * of entity). In fact, bfq_weights_tree_remove does nothing -+ * if !entity->weight_counter. -+ */ -+ if (unlikely(!entity->weight_counter)) -+ return; -+ -+ entity->weight_counter->weight = entity->weight; -+ rb_link_node(&entity->weight_counter->weights_node, parent, new); -+ rb_insert_color(&entity->weight_counter->weights_node, root); -+ -+inc_counter: -+ entity->weight_counter->num_active++; -+} -+ -+/* -+ * Decrement the weight counter associated with the entity, and, if the -+ * counter reaches 0, remove the counter from the tree. -+ * See the comments to the function bfq_weights_tree_add() for considerations -+ * about overhead. -+ */ -+static void bfq_weights_tree_remove(struct bfq_data *bfqd, -+ struct bfq_entity *entity, -+ struct rb_root *root) -+{ -+ if (!entity->weight_counter) -+ return; -+ -+ BUG_ON(RB_EMPTY_ROOT(root)); -+ BUG_ON(entity->weight_counter->weight != entity->weight); -+ -+ BUG_ON(!entity->weight_counter->num_active); -+ entity->weight_counter->num_active--; -+ if (entity->weight_counter->num_active > 0) -+ goto reset_entity_pointer; -+ -+ rb_erase(&entity->weight_counter->weights_node, root); -+ kfree(entity->weight_counter); -+ -+reset_entity_pointer: -+ entity->weight_counter = NULL; -+} -+ -+/* -+ * Return expired entry, or NULL to just start from scratch in rbtree. 
-+ */ -+static struct request *bfq_check_fifo(struct bfq_queue *bfqq, -+ struct request *last) -+{ -+ struct request *rq; -+ -+ if (bfq_bfqq_fifo_expire(bfqq)) -+ return NULL; -+ -+ bfq_mark_bfqq_fifo_expire(bfqq); -+ -+ rq = rq_entry_fifo(bfqq->fifo.next); -+ -+ if (rq == last || ktime_get_ns() < rq->fifo_time) -+ return NULL; -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq); -+ BUG_ON(RB_EMPTY_NODE(&rq->rb_node)); -+ return rq; -+} -+ -+static struct request *bfq_find_next_rq(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ struct request *last) -+{ -+ struct rb_node *rbnext = rb_next(&last->rb_node); -+ struct rb_node *rbprev = rb_prev(&last->rb_node); -+ struct request *next, *prev = NULL; -+ -+ BUG_ON(list_empty(&bfqq->fifo)); -+ -+ /* Follow expired path, else get first next available. */ -+ next = bfq_check_fifo(bfqq, last); -+ if (next) { -+ BUG_ON(next == last); -+ return next; -+ } -+ -+ BUG_ON(RB_EMPTY_NODE(&last->rb_node)); -+ -+ if (rbprev) -+ prev = rb_entry_rq(rbprev); -+ -+ if (rbnext) -+ next = rb_entry_rq(rbnext); -+ else { -+ rbnext = rb_first(&bfqq->sort_list); -+ if (rbnext && rbnext != &last->rb_node) -+ next = rb_entry_rq(rbnext); -+ } -+ -+ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last)); -+} -+ -+/* see the definition of bfq_async_charge_factor for details */ -+static unsigned long bfq_serv_to_charge(struct request *rq, -+ struct bfq_queue *bfqq) -+{ -+ if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1) -+ return blk_rq_sectors(rq); -+ -+ /* -+ * If there are no weight-raised queues, then amplify service -+ * by just the async charge factor; otherwise amplify service -+ * by twice the async charge factor, to further reduce latency -+ * for weight-raised queues. -+ */ -+ if (bfqq->bfqd->wr_busy_queues == 0) -+ return blk_rq_sectors(rq) * bfq_async_charge_factor; -+ -+ return blk_rq_sectors(rq) * 2 * bfq_async_charge_factor; -+} -+ -+/** -+ * bfq_updated_next_req - update the queue after a new next_rq selection. 
-+ * @bfqd: the device data the queue belongs to. -+ * @bfqq: the queue to update. -+ * -+ * If the first request of a queue changes we make sure that the queue -+ * has enough budget to serve at least its first request (if the -+ * request has grown). We do this because if the queue has not enough -+ * budget for its first request, it has to go through two dispatch -+ * rounds to actually get it dispatched. -+ */ -+static void bfq_updated_next_req(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity); -+ struct request *next_rq = bfqq->next_rq; -+ unsigned long new_budget; -+ -+ if (!next_rq) -+ return; -+ -+ if (bfqq == bfqd->in_service_queue) -+ /* -+ * In order not to break guarantees, budgets cannot be -+ * changed after an entity has been selected. -+ */ -+ return; -+ -+ BUG_ON(entity->tree != &st->active); -+ BUG_ON(entity == entity->sched_data->in_service_entity); -+ -+ new_budget = max_t(unsigned long, bfqq->max_budget, -+ bfq_serv_to_charge(next_rq, bfqq)); -+ if (entity->budget != new_budget) { -+ entity->budget = new_budget; -+ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", -+ new_budget); -+ bfq_requeue_bfqq(bfqd, bfqq); -+ } -+} -+ -+static unsigned int bfq_wr_duration(struct bfq_data *bfqd) -+{ -+ u64 dur; -+ -+ if (bfqd->bfq_wr_max_time > 0) -+ return bfqd->bfq_wr_max_time; -+ -+ dur = bfqd->RT_prod; -+ do_div(dur, bfqd->peak_rate); -+ -+ /* -+ * Limit duration between 3 and 13 seconds. Tests show that -+ * higher values than 13 seconds often yield the opposite of -+ * the desired result, i.e., worsen responsiveness by letting -+ * non-interactive and non-soft-real-time applications -+ * preserve weight raising for a too long time interval. -+ * -+ * On the other end, lower values than 3 seconds make it -+ * difficult for most interactive tasks to complete their jobs -+ * before weight-raising finishes. 
-+ */ -+ if (dur > msecs_to_jiffies(13000)) -+ dur = msecs_to_jiffies(13000); -+ else if (dur < msecs_to_jiffies(3000)) -+ dur = msecs_to_jiffies(3000); -+ -+ return dur; -+} -+ -+static void -+bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, -+ struct bfq_io_cq *bic, bool bfq_already_existing) -+{ -+ unsigned int old_wr_coeff; -+ bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq); -+ -+ if (bic->saved_has_short_ttime) -+ bfq_mark_bfqq_has_short_ttime(bfqq); -+ else -+ bfq_clear_bfqq_has_short_ttime(bfqq); -+ -+ if (bic->saved_IO_bound) -+ bfq_mark_bfqq_IO_bound(bfqq); -+ else -+ bfq_clear_bfqq_IO_bound(bfqq); -+ -+ if (unlikely(busy)) -+ old_wr_coeff = bfqq->wr_coeff; -+ -+ bfqq->wr_coeff = bic->saved_wr_coeff; -+ bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; -+ BUG_ON(time_is_after_jiffies(bfqq->wr_start_at_switch_to_srt)); -+ bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish; -+ bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time; -+ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); -+ -+ if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || -+ time_is_before_jiffies(bfqq->last_wr_start_finish + -+ bfqq->wr_cur_max_time))) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "resume state: switching off wr (%lu + %lu < %lu)", -+ bfqq->last_wr_start_finish, bfqq->wr_cur_max_time, -+ jiffies); -+ -+ bfqq->wr_coeff = 1; -+ } -+ -+ /* make sure weight will be updated, however we got here */ -+ bfqq->entity.prio_changed = 1; -+ -+ if (likely(!busy)) -+ return; -+ -+ if (old_wr_coeff == 1 && bfqq->wr_coeff > 1) { -+ bfqd->wr_busy_queues++; -+ BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues); -+ } else if (old_wr_coeff > 1 && bfqq->wr_coeff == 1) { -+ bfqd->wr_busy_queues--; -+ BUG_ON(bfqd->wr_busy_queues < 0); -+ } -+} -+ -+static int bfqq_process_refs(struct bfq_queue *bfqq) -+{ -+ int process_refs, io_refs; -+ -+ lockdep_assert_held(bfqq->bfqd->queue->queue_lock); -+ -+ io_refs = bfqq->allocated[READ] + 
bfqq->allocated[WRITE]; -+ process_refs = bfqq->ref - io_refs - bfqq->entity.on_st; -+ BUG_ON(process_refs < 0); -+ return process_refs; -+} -+ -+/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */ -+static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct bfq_queue *item; -+ struct hlist_node *n; -+ -+ hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node) -+ hlist_del_init(&item->burst_list_node); -+ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); -+ bfqd->burst_size = 1; -+ bfqd->burst_parent_entity = bfqq->entity.parent; -+} -+ -+/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */ -+static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ /* Increment burst size to take into account also bfqq */ -+ bfqd->burst_size++; -+ -+ bfq_log_bfqq(bfqd, bfqq, "add_to_burst %d", bfqd->burst_size); -+ -+ BUG_ON(bfqd->burst_size > bfqd->bfq_large_burst_thresh); -+ -+ if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) { -+ struct bfq_queue *pos, *bfqq_item; -+ struct hlist_node *n; -+ -+ /* -+ * Enough queues have been activated shortly after each -+ * other to consider this burst as large. -+ */ -+ bfqd->large_burst = true; -+ bfq_log_bfqq(bfqd, bfqq, "add_to_burst: large burst started"); -+ -+ /* -+ * We can now mark all queues in the burst list as -+ * belonging to a large burst. -+ */ -+ hlist_for_each_entry(bfqq_item, &bfqd->burst_list, -+ burst_list_node) { -+ bfq_mark_bfqq_in_large_burst(bfqq_item); -+ bfq_log_bfqq(bfqd, bfqq_item, "marked in large burst"); -+ } -+ bfq_mark_bfqq_in_large_burst(bfqq); -+ bfq_log_bfqq(bfqd, bfqq, "marked in large burst"); -+ -+ /* -+ * From now on, and until the current burst finishes, any -+ * new queue being activated shortly after the last queue -+ * was inserted in the burst can be immediately marked as -+ * belonging to a large burst. So the burst list is not -+ * needed any more. Remove it. 
-+ */ -+ hlist_for_each_entry_safe(pos, n, &bfqd->burst_list, -+ burst_list_node) -+ hlist_del_init(&pos->burst_list_node); -+ } else /* -+ * Burst not yet large: add bfqq to the burst list. Do -+ * not increment the ref counter for bfqq, because bfqq -+ * is removed from the burst list before freeing bfqq -+ * in put_queue. -+ */ -+ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); -+} -+ -+/* -+ * If many queues belonging to the same group happen to be created -+ * shortly after each other, then the processes associated with these -+ * queues have typically a common goal. In particular, bursts of queue -+ * creations are usually caused by services or applications that spawn -+ * many parallel threads/processes. Examples are systemd during boot, -+ * or git grep. To help these processes get their job done as soon as -+ * possible, it is usually better to not grant either weight-raising -+ * or device idling to their queues. -+ * -+ * In this comment we describe, firstly, the reasons why this fact -+ * holds, and, secondly, the next function, which implements the main -+ * steps needed to properly mark these queues so that they can then be -+ * treated in a different way. -+ * -+ * The above services or applications benefit mostly from a high -+ * throughput: the quicker the requests of the activated queues are -+ * cumulatively served, the sooner the target job of these queues gets -+ * completed. As a consequence, weight-raising any of these queues, -+ * which also implies idling the device for it, is almost always -+ * counterproductive. In most cases it just lowers throughput. -+ * -+ * On the other hand, a burst of queue creations may be caused also by -+ * the start of an application that does not consist of a lot of -+ * parallel I/O-bound threads. In fact, with a complex application, -+ * several short processes may need to be executed to start-up the -+ * application. 
In this respect, to start an application as quickly as -+ * possible, the best thing to do is in any case to privilege the I/O -+ * related to the application with respect to all other -+ * I/O. Therefore, the best strategy to start as quickly as possible -+ * an application that causes a burst of queue creations is to -+ * weight-raise all the queues created during the burst. This is the -+ * exact opposite of the best strategy for the other type of bursts. -+ * -+ * In the end, to take the best action for each of the two cases, the -+ * two types of bursts need to be distinguished. Fortunately, this -+ * seems relatively easy, by looking at the sizes of the bursts. In -+ * particular, we found a threshold such that only bursts with a -+ * larger size than that threshold are apparently caused by -+ * services or commands such as systemd or git grep. For brevity, -+ * hereafter we call just 'large' these bursts. BFQ *does not* -+ * weight-raise queues whose creation occurs in a large burst. In -+ * addition, for each of these queues BFQ performs or does not perform -+ * idling depending on which choice boosts the throughput more. The -+ * exact choice depends on the device and request pattern at -+ * hand. -+ * -+ * Unfortunately, false positives may occur while an interactive task -+ * is starting (e.g., an application is being started). The -+ * consequence is that the queues associated with the task do not -+ * enjoy weight raising as expected. Fortunately these false positives -+ * are very rare. They typically occur if some service happens to -+ * start doing I/O exactly when the interactive task starts. -+ * -+ * Turning back to the next function, it implements all the steps -+ * needed to detect the occurrence of a large burst and to properly -+ * mark all the queues belonging to it (so that they can then be -+ * treated in a different way). 
This goal is achieved by maintaining a -+ * "burst list" that holds, temporarily, the queues that belong to the -+ * burst in progress. The list is then used to mark these queues as -+ * belonging to a large burst if the burst does become large. The main -+ * steps are the following. -+ * -+ * . when the very first queue is created, the queue is inserted into the -+ * list (as it could be the first queue in a possible burst) -+ * -+ * . if the current burst has not yet become large, and a queue Q that does -+ * not yet belong to the burst is activated shortly after the last time -+ * at which a new queue entered the burst list, then the function appends -+ * Q to the burst list -+ * -+ * . if, as a consequence of the previous step, the burst size reaches -+ * the large-burst threshold, then -+ * -+ * . all the queues in the burst list are marked as belonging to a -+ * large burst -+ * -+ * . the burst list is deleted; in fact, the burst list already served -+ * its purpose (keeping temporarily track of the queues in a burst, -+ * so as to be able to mark them as belonging to a large burst in the -+ * previous sub-step), and now is not needed any more -+ * -+ * . the device enters a large-burst mode -+ * -+ * . if a queue Q that does not belong to the burst is created while -+ * the device is in large-burst mode and shortly after the last time -+ * at which a queue either entered the burst list or was marked as -+ * belonging to the current large burst, then Q is immediately marked -+ * as belonging to a large burst. -+ * -+ * . if a queue Q that does not belong to the burst is created a while -+ * later, i.e., not shortly after, than the last time at which a queue -+ * either entered the burst list or was marked as belonging to the -+ * current large burst, then the current burst is deemed as finished and: -+ * -+ * . the large-burst mode is reset if set -+ * -+ * . the burst list is emptied -+ * -+ * . 
Q is inserted in the burst list, as Q may be the first queue -+ * in a possible new burst (then the burst list contains just Q -+ * after this step). -+ */ -+static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ /* -+ * If bfqq is already in the burst list or is part of a large -+ * burst, or finally has just been split, then there is -+ * nothing else to do. -+ */ -+ if (!hlist_unhashed(&bfqq->burst_list_node) || -+ bfq_bfqq_in_large_burst(bfqq) || -+ time_is_after_eq_jiffies(bfqq->split_time + -+ msecs_to_jiffies(10))) -+ return; -+ -+ /* -+ * If bfqq's creation happens late enough, or bfqq belongs to -+ * a different group than the burst group, then the current -+ * burst is finished, and related data structures must be -+ * reset. -+ * -+ * In this respect, consider the special case where bfqq is -+ * the very first queue created after BFQ is selected for this -+ * device. In this case, last_ins_in_burst and -+ * burst_parent_entity are not yet significant when we get -+ * here. But it is easy to verify that, whether or not the -+ * following condition is true, bfqq will end up being -+ * inserted into the burst list. In particular the list will -+ * happen to contain only bfqq. And this is exactly what has -+ * to happen, as bfqq may be the first queue of the first -+ * burst. -+ */ -+ if (time_is_before_jiffies(bfqd->last_ins_in_burst + -+ bfqd->bfq_burst_interval) || -+ bfqq->entity.parent != bfqd->burst_parent_entity) { -+ bfqd->large_burst = false; -+ bfq_reset_burst_list(bfqd, bfqq); -+ bfq_log_bfqq(bfqd, bfqq, -+ "handle_burst: late activation or different group"); -+ goto end; -+ } -+ -+ /* -+ * If we get here, then bfqq is being activated shortly after the -+ * last queue. So, if the current burst is also large, we can mark -+ * bfqq as belonging to this large burst immediately. 
-+ */ -+ if (bfqd->large_burst) { -+ bfq_log_bfqq(bfqd, bfqq, "handle_burst: marked in burst"); -+ bfq_mark_bfqq_in_large_burst(bfqq); -+ goto end; -+ } -+ -+ /* -+ * If we get here, then a large-burst state has not yet been -+ * reached, but bfqq is being activated shortly after the last -+ * queue. Then we add bfqq to the burst. -+ */ -+ bfq_add_to_burst(bfqd, bfqq); -+end: -+ /* -+ * At this point, bfqq either has been added to the current -+ * burst or has caused the current burst to terminate and a -+ * possible new burst to start. In particular, in the second -+ * case, bfqq has become the first queue in the possible new -+ * burst. In both cases last_ins_in_burst needs to be moved -+ * forward. -+ */ -+ bfqd->last_ins_in_burst = jiffies; -+ -+} -+ -+static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ return entity->budget - entity->service; -+} -+ -+/* -+ * If enough samples have been computed, return the current max budget -+ * stored in bfqd, which is dynamically updated according to the -+ * estimated disk peak rate; otherwise return the default max budget -+ */ -+static int bfq_max_budget(struct bfq_data *bfqd) -+{ -+ if (bfqd->budgets_assigned < bfq_stats_min_budgets) -+ return bfq_default_max_budget; -+ else -+ return bfqd->bfq_max_budget; -+} -+ -+/* -+ * Return min budget, which is a fraction of the current or default -+ * max budget (trying with 1/32) -+ */ -+static int bfq_min_budget(struct bfq_data *bfqd) -+{ -+ if (bfqd->budgets_assigned < bfq_stats_min_budgets) -+ return bfq_default_max_budget / 32; -+ else -+ return bfqd->bfq_max_budget / 32; -+} -+ -+static void bfq_bfqq_expire(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ bool compensate, -+ enum bfqq_expiration reason); -+ -+/* -+ * The next function, invoked after the input queue bfqq switches from -+ * idle to busy, updates the budget of bfqq. 
The function also tells -+ * whether the in-service queue should be expired, by returning -+ * true. The purpose of expiring the in-service queue is to give bfqq -+ * the chance to possibly preempt the in-service queue, and the reason -+ * for preempting the in-service queue is to achieve one of the two -+ * goals below. -+ * -+ * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has -+ * expired because it has remained idle. In particular, bfqq may have -+ * expired for one of the following two reasons: -+ * -+ * - BFQ_BFQQ_NO_MORE_REQUEST bfqq did not enjoy any device idling and -+ * did not make it to issue a new request before its last request -+ * was served; -+ * -+ * - BFQ_BFQQ_TOO_IDLE bfqq did enjoy device idling, but did not issue -+ * a new request before the expiration of the idling-time. -+ * -+ * Even if bfqq has expired for one of the above reasons, the process -+ * associated with the queue may be however issuing requests greedily, -+ * and thus be sensitive to the bandwidth it receives (bfqq may have -+ * remained idle for other reasons: CPU high load, bfqq not enjoying -+ * idling, I/O throttling somewhere in the path from the process to -+ * the I/O scheduler, ...). But if, after every expiration for one of -+ * the above two reasons, bfqq has to wait for the service of at least -+ * one full budget of another queue before being served again, then -+ * bfqq is likely to get a much lower bandwidth or resource time than -+ * its reserved ones. To address this issue, two countermeasures need -+ * to be taken. -+ * -+ * First, the budget and the timestamps of bfqq need to be updated in -+ * a special way on bfqq reactivation: they need to be updated as if -+ * bfqq did not remain idle and did not expire. 
In fact, if they are -+ * computed as if bfqq expired and remained idle until reactivation, -+ * then the process associated with bfqq is treated as if, instead of -+ * being greedy, it stopped issuing requests when bfqq remained idle, -+ * and restarts issuing requests only on this reactivation. In other -+ * words, the scheduler does not help the process recover the "service -+ * hole" between bfqq expiration and reactivation. As a consequence, -+ * the process receives a lower bandwidth than its reserved one. In -+ * contrast, to recover this hole, the budget must be updated as if -+ * bfqq was not expired at all before this reactivation, i.e., it must -+ * be set to the value of the remaining budget when bfqq was -+ * expired. Along the same line, timestamps need to be assigned the -+ * value they had the last time bfqq was selected for service, i.e., -+ * before last expiration. Thus timestamps need to be back-shifted -+ * with respect to their normal computation (see [1] for more details -+ * on this tricky aspect). -+ * -+ * Secondly, to allow the process to recover the hole, the in-service -+ * queue must be expired too, to give bfqq the chance to preempt it -+ * immediately. In fact, if bfqq has to wait for a full budget of the -+ * in-service queue to be completed, then it may become impossible to -+ * let the process recover the hole, even if the back-shifted -+ * timestamps of bfqq are lower than those of the in-service queue. If -+ * this happens for most or all of the holes, then the process may not -+ * receive its reserved bandwidth. In this respect, it is worth noting -+ * that, being the service of outstanding requests unpreemptible, a -+ * little fraction of the holes may however be unrecoverable, thereby -+ * causing a little loss of bandwidth. -+ * -+ * The last important point is detecting whether bfqq does need this -+ * bandwidth recovery. 
In this respect, the next function deems the -+ * process associated with bfqq greedy, and thus allows it to recover -+ * the hole, if: 1) the process is waiting for the arrival of a new -+ * request (which implies that bfqq expired for one of the above two -+ * reasons), and 2) such a request has arrived soon. The first -+ * condition is controlled through the flag non_blocking_wait_rq, -+ * while the second through the flag arrived_in_time. If both -+ * conditions hold, then the function computes the budget in the -+ * above-described special way, and signals that the in-service queue -+ * should be expired. Timestamp back-shifting is done later in -+ * __bfq_activate_entity. -+ * -+ * 2. Reduce latency. Even if timestamps are not backshifted to let -+ * the process associated with bfqq recover a service hole, bfqq may -+ * however happen to have, after being (re)activated, a lower finish -+ * timestamp than the in-service queue. That is, the next budget of -+ * bfqq may have to be completed before the one of the in-service -+ * queue. If this is the case, then preempting the in-service queue -+ * allows this goal to be achieved, apart from the unpreemptible, -+ * outstanding requests mentioned above. -+ * -+ * Unfortunately, regardless of which of the above two goals one wants -+ * to achieve, service trees need first to be updated to know whether -+ * the in-service queue must be preempted. To have service trees -+ * correctly updated, the in-service queue must be expired and -+ * rescheduled, and bfqq must be scheduled too. This is one of the -+ * most costly operations (in future versions, the scheduling -+ * mechanism may be re-designed in such a way to make it possible to -+ * know whether preemption is needed without needing to update service -+ * trees). In addition, queue preemptions almost always cause random -+ * I/O, and thus loss of throughput. 
Because of these facts, the next -+ * function adopts the following simple scheme to avoid both costly -+ * operations and too frequent preemptions: it requests the expiration -+ * of the in-service queue (unconditionally) only for queues that need -+ * to recover a hole, or that either are weight-raised or deserve to -+ * be weight-raised. -+ */ -+static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ bool arrived_in_time, -+ bool wr_or_deserves_wr) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) { -+ /* -+ * We do not clear the flag non_blocking_wait_rq here, as -+ * the latter is used in bfq_activate_bfqq to signal -+ * that timestamps need to be back-shifted (and is -+ * cleared right after). -+ */ -+ -+ /* -+ * In next assignment we rely on that either -+ * entity->service or entity->budget are not updated -+ * on expiration if bfqq is empty (see -+ * __bfq_bfqq_recalc_budget). Thus both quantities -+ * remain unchanged after such an expiration, and the -+ * following statement therefore assigns to -+ * entity->budget the remaining budget on such an -+ * expiration. For clarity, entity->service is not -+ * updated on expiration in any case, and, in normal -+ * operation, is reset only when bfqq is selected for -+ * service (see bfq_get_next_queue). 
-+ */ -+ BUG_ON(bfqq->max_budget < 0); -+ entity->budget = min_t(unsigned long, -+ bfq_bfqq_budget_left(bfqq), -+ bfqq->max_budget); -+ -+ BUG_ON(entity->budget < 0); -+ return true; -+ } -+ -+ BUG_ON(bfqq->max_budget < 0); -+ entity->budget = max_t(unsigned long, bfqq->max_budget, -+ bfq_serv_to_charge(bfqq->next_rq, bfqq)); -+ BUG_ON(entity->budget < 0); -+ -+ bfq_clear_bfqq_non_blocking_wait_rq(bfqq); -+ return wr_or_deserves_wr; -+} -+ -+static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ unsigned int old_wr_coeff, -+ bool wr_or_deserves_wr, -+ bool interactive, -+ bool in_burst, -+ bool soft_rt) -+{ -+ if (old_wr_coeff == 1 && wr_or_deserves_wr) { -+ /* start a weight-raising period */ -+ if (interactive) { -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ } else { -+ bfqq->wr_start_at_switch_to_srt = jiffies; -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff * -+ BFQ_SOFTRT_WEIGHT_FACTOR; -+ bfqq->wr_cur_max_time = -+ bfqd->bfq_wr_rt_max_time; -+ } -+ /* -+ * If needed, further reduce budget to make sure it is -+ * close to bfqq's backlog, so as to reduce the -+ * scheduling-error component due to a too large -+ * budget. Do not care about throughput consequences, -+ * but only about latency. Finally, do not assign a -+ * too small budget either, to avoid increasing -+ * latency by causing too frequent expirations. 
-+ */ -+ bfqq->entity.budget = min_t(unsigned long, -+ bfqq->entity.budget, -+ 2 * bfq_min_budget(bfqd)); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "wrais starting at %lu, rais_max_time %u", -+ jiffies, -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } else if (old_wr_coeff > 1) { -+ if (interactive) { /* update wr coeff and duration */ -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ } else if (in_burst) { -+ bfqq->wr_coeff = 1; -+ bfq_log_bfqq(bfqd, bfqq, -+ "wrais ending at %lu, rais_max_time %u", -+ jiffies, -+ jiffies_to_msecs(bfqq-> -+ wr_cur_max_time)); -+ } else if (soft_rt) { -+ /* -+ * The application is now or still meeting the -+ * requirements for being deemed soft rt. We -+ * can then correctly and safely (re)charge -+ * the weight-raising duration for the -+ * application with the weight-raising -+ * duration for soft rt applications. -+ * -+ * In particular, doing this recharge now, i.e., -+ * before the weight-raising period for the -+ * application finishes, reduces the probability -+ * of the following negative scenario: -+ * 1) the weight of a soft rt application is -+ * raised at startup (as for any newly -+ * created application), -+ * 2) since the application is not interactive, -+ * at a certain time weight-raising is -+ * stopped for the application, -+ * 3) at that time the application happens to -+ * still have pending requests, and hence -+ * is destined to not have a chance to be -+ * deemed soft rt before these requests are -+ * completed (see the comments to the -+ * function bfq_bfqq_softrt_next_start() -+ * for details on soft rt detection), -+ * 4) these pending requests experience a high -+ * latency because the application is not -+ * weight-raised while they are pending. 
-+ */ -+ if (bfqq->wr_cur_max_time != -+ bfqd->bfq_wr_rt_max_time) { -+ bfqq->wr_start_at_switch_to_srt = -+ bfqq->last_wr_start_finish; -+ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); -+ -+ bfqq->wr_cur_max_time = -+ bfqd->bfq_wr_rt_max_time; -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff * -+ BFQ_SOFTRT_WEIGHT_FACTOR; -+ bfq_log_bfqq(bfqd, bfqq, -+ "switching to soft_rt wr"); -+ } else -+ bfq_log_bfqq(bfqd, bfqq, -+ "moving forward soft_rt wr duration"); -+ bfqq->last_wr_start_finish = jiffies; -+ } -+ } -+} -+ -+static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ return bfqq->dispatched == 0 && -+ time_is_before_jiffies( -+ bfqq->budget_timeout + -+ bfqd->bfq_wr_min_idle_time); -+} -+ -+static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ int old_wr_coeff, -+ struct request *rq, -+ bool *interactive) -+{ -+ bool soft_rt, in_burst, wr_or_deserves_wr, -+ bfqq_wants_to_preempt, -+ idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq), -+ /* -+ * See the comments on -+ * bfq_bfqq_update_budg_for_activation for -+ * details on the usage of the next variable. 
-+ */ -+ arrived_in_time = ktime_get_ns() <= -+ RQ_BIC(rq)->ttime.last_end_request + -+ bfqd->bfq_slice_idle * 3; -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "bfq_add_request non-busy: " -+ "jiffies %lu, in_time %d, idle_long %d busyw %d " -+ "wr_coeff %u", -+ jiffies, arrived_in_time, -+ idle_for_long_time, -+ bfq_bfqq_non_blocking_wait_rq(bfqq), -+ old_wr_coeff); -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ -+ BUG_ON(bfqq == bfqd->in_service_queue); -+ bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, rq->cmd_flags); -+ -+ /* -+ * bfqq deserves to be weight-raised if: -+ * - it is sync, -+ * - it does not belong to a large burst, -+ * - it has been idle for enough time or is soft real-time, -+ * - is linked to a bfq_io_cq (it is not shared in any sense) -+ */ -+ in_burst = bfq_bfqq_in_large_burst(bfqq); -+ soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && -+ !in_burst && -+ time_is_before_jiffies(bfqq->soft_rt_next_start); -+ *interactive = -+ !in_burst && -+ idle_for_long_time; -+ wr_or_deserves_wr = bfqd->low_latency && -+ (bfqq->wr_coeff > 1 || -+ (bfq_bfqq_sync(bfqq) && -+ bfqq->bic && (*interactive || soft_rt))); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "bfq_add_request: " -+ "in_burst %d, " -+ "soft_rt %d (next %lu), inter %d, bic %p", -+ bfq_bfqq_in_large_burst(bfqq), soft_rt, -+ bfqq->soft_rt_next_start, -+ *interactive, -+ bfqq->bic); -+ -+ /* -+ * Using the last flag, update budget and check whether bfqq -+ * may want to preempt the in-service queue. -+ */ -+ bfqq_wants_to_preempt = -+ bfq_bfqq_update_budg_for_activation(bfqd, bfqq, -+ arrived_in_time, -+ wr_or_deserves_wr); -+ -+ /* -+ * If bfqq happened to be activated in a burst, but has been -+ * idle for much more than an interactive queue, then we -+ * assume that, in the overall I/O initiated in the burst, the -+ * I/O associated with bfqq is finished. So bfqq does not need -+ * to be treated as a queue belonging to a burst -+ * anymore. 
Accordingly, we reset bfqq's in_large_burst flag -+ * if set, and remove bfqq from the burst list if it's -+ * there. We do not decrement burst_size, because the fact -+ * that bfqq does not need to belong to the burst list any -+ * more does not invalidate the fact that bfqq was created in -+ * a burst. -+ */ -+ if (likely(!bfq_bfqq_just_created(bfqq)) && -+ idle_for_long_time && -+ time_is_before_jiffies( -+ bfqq->budget_timeout + -+ msecs_to_jiffies(10000))) { -+ hlist_del_init(&bfqq->burst_list_node); -+ bfq_clear_bfqq_in_large_burst(bfqq); -+ } -+ -+ bfq_clear_bfqq_just_created(bfqq); -+ -+ if (!bfq_bfqq_IO_bound(bfqq)) { -+ if (arrived_in_time) { -+ bfqq->requests_within_timer++; -+ if (bfqq->requests_within_timer >= -+ bfqd->bfq_requests_within_timer) -+ bfq_mark_bfqq_IO_bound(bfqq); -+ } else -+ bfqq->requests_within_timer = 0; -+ bfq_log_bfqq(bfqd, bfqq, "requests in time %d", -+ bfqq->requests_within_timer); -+ } -+ -+ if (bfqd->low_latency) { -+ if (unlikely(time_is_after_jiffies(bfqq->split_time))) -+ /* wraparound */ -+ bfqq->split_time = -+ jiffies - bfqd->bfq_wr_min_idle_time - 1; -+ -+ if (time_is_before_jiffies(bfqq->split_time + -+ bfqd->bfq_wr_min_idle_time)) { -+ bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq, -+ old_wr_coeff, -+ wr_or_deserves_wr, -+ *interactive, -+ in_burst, -+ soft_rt); -+ -+ if (old_wr_coeff != bfqq->wr_coeff) -+ bfqq->entity.prio_changed = 1; -+ } -+ } -+ -+ bfqq->last_idle_bklogged = jiffies; -+ bfqq->service_from_backlogged = 0; -+ bfq_clear_bfqq_softrt_update(bfqq); -+ -+ bfq_add_bfqq_busy(bfqd, bfqq); -+ -+ /* -+ * Expire in-service queue only if preemption may be needed -+ * for guarantees. In this respect, the function -+ * next_queue_may_preempt just checks a simple, necessary -+ * condition, and not a sufficient condition based on -+ * timestamps. 
In fact, for the latter condition to be -+ * evaluated, timestamps would need first to be updated, and -+ * this operation is quite costly (see the comments on the -+ * function bfq_bfqq_update_budg_for_activation). -+ */ -+ if (bfqd->in_service_queue && bfqq_wants_to_preempt && -+ bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff && -+ next_queue_may_preempt(bfqd)) { -+ struct bfq_queue *in_serv = -+ bfqd->in_service_queue; -+ BUG_ON(in_serv == bfqq); -+ -+ bfq_bfqq_expire(bfqd, bfqd->in_service_queue, -+ false, BFQ_BFQQ_PREEMPTED); -+ } -+} -+ -+static void bfq_add_request(struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ struct bfq_data *bfqd = bfqq->bfqd; -+ struct request *next_rq, *prev; -+ unsigned int old_wr_coeff = bfqq->wr_coeff; -+ bool interactive = false; -+ -+ bfq_log_bfqq(bfqd, bfqq, "add_request: size %u %s", -+ blk_rq_sectors(rq), rq_is_sync(rq) ? "S" : "A"); -+ -+ if (bfqq->wr_coeff > 1) /* queue is being weight-raised */ -+ bfq_log_bfqq(bfqd, bfqq, -+ "raising period dur %u/%u msec, old coeff %u, w %d(%d)", -+ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), -+ jiffies_to_msecs(bfqq->wr_cur_max_time), -+ bfqq->wr_coeff, -+ bfqq->entity.weight, bfqq->entity.orig_weight); -+ -+ bfqq->queued[rq_is_sync(rq)]++; -+ bfqd->queued++; -+ -+ elv_rb_add(&bfqq->sort_list, rq); -+ -+ /* -+ * Check if this request is a better next-to-serve candidate. -+ */ -+ prev = bfqq->next_rq; -+ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); -+ BUG_ON(!next_rq); -+ bfqq->next_rq = next_rq; -+ -+ /* -+ * Adjust priority tree position, if next_rq changes. -+ */ -+ if (prev != bfqq->next_rq) -+ bfq_pos_tree_add_move(bfqd, bfqq); -+ -+ if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... 
*/ -+ bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff, -+ rq, &interactive); -+ else { -+ if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) && -+ time_is_before_jiffies( -+ bfqq->last_wr_start_finish + -+ bfqd->bfq_wr_min_inter_arr_async)) { -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ -+ bfqd->wr_busy_queues++; -+ BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues); -+ bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqd, bfqq, -+ "non-idle wrais starting, " -+ "wr_max_time %u wr_busy %d", -+ jiffies_to_msecs(bfqq->wr_cur_max_time), -+ bfqd->wr_busy_queues); -+ } -+ if (prev != bfqq->next_rq) -+ bfq_updated_next_req(bfqd, bfqq); -+ } -+ -+ /* -+ * Assign jiffies to last_wr_start_finish in the following -+ * cases: -+ * -+ * . if bfqq is not going to be weight-raised, because, for -+ * non weight-raised queues, last_wr_start_finish stores the -+ * arrival time of the last request; as of now, this piece -+ * of information is used only for deciding whether to -+ * weight-raise async queues -+ * -+ * . if bfqq is not weight-raised, because, if bfqq is now -+ * switching to weight-raised, then last_wr_start_finish -+ * stores the time when weight-raising starts -+ * -+ * . if bfqq is interactive, because, regardless of whether -+ * bfqq is currently weight-raised, the weight-raising -+ * period must start or restart (this case is considered -+ * separately because it is not detected by the above -+ * conditions, if bfqq is already weight-raised) -+ * -+ * last_wr_start_finish has to be updated also if bfqq is soft -+ * real-time, because the weight-raising period is constantly -+ * restarted on idle-to-busy transitions for these queues, but -+ * this is already done in bfq_bfqq_handle_idle_busy_switch if -+ * needed. 
-+ */ -+ if (bfqd->low_latency && -+ (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive)) -+ bfqq->last_wr_start_finish = jiffies; -+} -+ -+static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, -+ struct bio *bio) -+{ -+ struct task_struct *tsk = current; -+ struct bfq_io_cq *bic; -+ struct bfq_queue *bfqq; -+ -+ bic = bfq_bic_lookup(bfqd, tsk->io_context); -+ if (!bic) -+ return NULL; -+ -+ bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); -+ if (bfqq) -+ return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); -+ -+ return NULL; -+} -+ -+static sector_t get_sdist(sector_t last_pos, struct request *rq) -+{ -+ sector_t sdist = 0; -+ -+ if (last_pos) { -+ if (last_pos < blk_rq_pos(rq)) -+ sdist = blk_rq_pos(rq) - last_pos; -+ else -+ sdist = last_pos - blk_rq_pos(rq); -+ } -+ -+ return sdist; -+} -+ -+static void bfq_activate_request(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ bfqd->rq_in_driver++; -+} -+ -+static void bfq_deactivate_request(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ -+ BUG_ON(bfqd->rq_in_driver == 0); -+ bfqd->rq_in_driver--; -+} -+ -+static void bfq_remove_request(struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ struct bfq_data *bfqd = bfqq->bfqd; -+ const int sync = rq_is_sync(rq); -+ -+ BUG_ON(bfqq->entity.service > bfqq->entity.budget && -+ bfqq == bfqd->in_service_queue); -+ -+ if (bfqq->next_rq == rq) { -+ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); -+ bfq_updated_next_req(bfqd, bfqq); -+ } -+ -+ if (rq->queuelist.prev != &rq->queuelist) -+ list_del_init(&rq->queuelist); -+ BUG_ON(bfqq->queued[sync] == 0); -+ bfqq->queued[sync]--; -+ bfqd->queued--; -+ elv_rb_del(&bfqq->sort_list, rq); -+ -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { -+ bfqq->next_rq = NULL; -+ -+ BUG_ON(bfqq->entity.budget < 0); -+ -+ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) { -+ 
BUG_ON(bfqq->ref < 2); /* referred by rq and on tree */ -+ bfq_del_bfqq_busy(bfqd, bfqq, false); -+ /* -+ * bfqq emptied. In normal operation, when -+ * bfqq is empty, bfqq->entity.service and -+ * bfqq->entity.budget must contain, -+ * respectively, the service received and the -+ * budget used last time bfqq emptied. These -+ * facts do not hold in this case, as at least -+ * this last removal occurred while bfqq is -+ * not in service. To avoid inconsistencies, -+ * reset both bfqq->entity.service and -+ * bfqq->entity.budget, if bfqq has still a -+ * process that may issue I/O requests to it. -+ */ -+ bfqq->entity.budget = bfqq->entity.service = 0; -+ } -+ -+ /* -+ * Remove queue from request-position tree as it is empty. -+ */ -+ if (bfqq->pos_root) { -+ rb_erase(&bfqq->pos_node, bfqq->pos_root); -+ bfqq->pos_root = NULL; -+ } -+ } -+ -+ if (rq->cmd_flags & REQ_META) { -+ BUG_ON(bfqq->meta_pending == 0); -+ bfqq->meta_pending--; -+ } -+ bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); -+} -+ -+static enum elv_merge bfq_merge(struct request_queue *q, struct request **req, -+ struct bio *bio) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct request *__rq; -+ -+ __rq = bfq_find_rq_fmerge(bfqd, bio); -+ if (__rq && elv_bio_merge_ok(__rq, bio)) { -+ *req = __rq; -+ return ELEVATOR_FRONT_MERGE; -+ } -+ -+ return ELEVATOR_NO_MERGE; -+} -+ -+static void bfq_merged_request(struct request_queue *q, struct request *req, -+ enum elv_merge type) -+{ -+ if (type == ELEVATOR_FRONT_MERGE && -+ rb_prev(&req->rb_node) && -+ blk_rq_pos(req) < -+ blk_rq_pos(container_of(rb_prev(&req->rb_node), -+ struct request, rb_node))) { -+ struct bfq_queue *bfqq = RQ_BFQQ(req); -+ struct bfq_data *bfqd = bfqq->bfqd; -+ struct request *prev, *next_rq; -+ -+ /* Reposition request in its sort_list */ -+ elv_rb_del(&bfqq->sort_list, req); -+ elv_rb_add(&bfqq->sort_list, req); -+ /* Choose next request to be served for bfqq */ -+ prev = bfqq->next_rq; -+ next_rq = 
bfq_choose_req(bfqd, bfqq->next_rq, req, -+ bfqd->last_position); -+ BUG_ON(!next_rq); -+ bfqq->next_rq = next_rq; -+ /* -+ * If next_rq changes, update both the queue's budget to -+ * fit the new request and the queue's position in its -+ * rq_pos_tree. -+ */ -+ if (prev != bfqq->next_rq) { -+ bfq_updated_next_req(bfqd, bfqq); -+ bfq_pos_tree_add_move(bfqd, bfqq); -+ } -+ } -+} -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+static void bfq_bio_merged(struct request_queue *q, struct request *req, -+ struct bio *bio) -+{ -+ bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio->bi_opf); -+} -+#endif -+ -+static void bfq_merged_requests(struct request_queue *q, struct request *rq, -+ struct request *next) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next); -+ -+ /* -+ * If next and rq belong to the same bfq_queue and next is older -+ * than rq, then reposition rq in the fifo (by substituting next -+ * with rq). Otherwise, if next and rq belong to different -+ * bfq_queues, never reposition rq: in fact, we would have to -+ * reposition it with respect to next's position in its own fifo, -+ * which would most certainly be too expensive with respect to -+ * the benefits. 
-+ */ -+ if (bfqq == next_bfqq && -+ !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && -+ next->fifo_time < rq->fifo_time) { -+ list_del_init(&rq->queuelist); -+ list_replace_init(&next->queuelist, &rq->queuelist); -+ rq->fifo_time = next->fifo_time; -+ } -+ -+ if (bfqq->next_rq == next) -+ bfqq->next_rq = rq; -+ -+ bfq_remove_request(next); -+ bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); -+} -+ -+/* Must be called with bfqq != NULL */ -+static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) -+{ -+ BUG_ON(!bfqq); -+ -+ if (bfq_bfqq_busy(bfqq)) { -+ bfqq->bfqd->wr_busy_queues--; -+ BUG_ON(bfqq->bfqd->wr_busy_queues < 0); -+ } -+ bfqq->wr_coeff = 1; -+ bfqq->wr_cur_max_time = 0; -+ bfqq->last_wr_start_finish = jiffies; -+ /* -+ * Trigger a weight change on the next invocation of -+ * __bfq_entity_update_weight_prio. -+ */ -+ bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "end_wr: wrais ending at %lu, rais_max_time %u", -+ bfqq->last_wr_start_finish, -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "end_wr: wr_busy %d", -+ bfqq->bfqd->wr_busy_queues); -+} -+ -+static void bfq_end_wr_async_queues(struct bfq_data *bfqd, -+ struct bfq_group *bfqg) -+{ -+ int i, j; -+ -+ for (i = 0; i < 2; i++) -+ for (j = 0; j < IOPRIO_BE_NR; j++) -+ if (bfqg->async_bfqq[i][j]) -+ bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); -+ if (bfqg->async_idle_bfqq) -+ bfq_bfqq_end_wr(bfqg->async_idle_bfqq); -+} -+ -+static void bfq_end_wr(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq; -+ -+ spin_lock_irq(bfqd->queue->queue_lock); -+ -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) -+ bfq_bfqq_end_wr(bfqq); -+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) -+ bfq_bfqq_end_wr(bfqq); -+ bfq_end_wr_async(bfqd); -+ -+ spin_unlock_irq(bfqd->queue->queue_lock); -+} -+ -+static sector_t bfq_io_struct_pos(void *io_struct, bool request) -+{ -+ if (request) -+ return blk_rq_pos(io_struct); -+ else -+ return 
((struct bio *)io_struct)->bi_iter.bi_sector; -+} -+ -+static int bfq_rq_close_to_sector(void *io_struct, bool request, -+ sector_t sector) -+{ -+ return abs(bfq_io_struct_pos(io_struct, request) - sector) <= -+ BFQQ_CLOSE_THR; -+} -+ -+static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ sector_t sector) -+{ -+ struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; -+ struct rb_node *parent, *node; -+ struct bfq_queue *__bfqq; -+ -+ if (RB_EMPTY_ROOT(root)) -+ return NULL; -+ -+ /* -+ * First, if we find a request starting at the end of the last -+ * request, choose it. -+ */ -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL); -+ if (__bfqq) -+ return __bfqq; -+ -+ /* -+ * If the exact sector wasn't found, the parent of the NULL leaf -+ * will contain the closest sector (rq_pos_tree sorted by -+ * next_request position). -+ */ -+ __bfqq = rb_entry(parent, struct bfq_queue, pos_node); -+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) -+ return __bfqq; -+ -+ if (blk_rq_pos(__bfqq->next_rq) < sector) -+ node = rb_next(&__bfqq->pos_node); -+ else -+ node = rb_prev(&__bfqq->pos_node); -+ if (!node) -+ return NULL; -+ -+ __bfqq = rb_entry(node, struct bfq_queue, pos_node); -+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) -+ return __bfqq; -+ -+ return NULL; -+} -+ -+static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd, -+ struct bfq_queue *cur_bfqq, -+ sector_t sector) -+{ -+ struct bfq_queue *bfqq; -+ -+ /* -+ * We shall notice if some of the queues are cooperating, -+ * e.g., working closely on the same area of the device. In -+ * that case, we can group them together and: 1) don't waste -+ * time idling, and 2) serve the union of their requests in -+ * the best possible order for throughput. 
-+ */ -+ bfqq = bfqq_find_close(bfqd, cur_bfqq, sector); -+ if (!bfqq || bfqq == cur_bfqq) -+ return NULL; -+ -+ return bfqq; -+} -+ -+static struct bfq_queue * -+bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) -+{ -+ int process_refs, new_process_refs; -+ struct bfq_queue *__bfqq; -+ -+ /* -+ * If there are no process references on the new_bfqq, then it is -+ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain -+ * may have dropped their last reference (not just their last process -+ * reference). -+ */ -+ if (!bfqq_process_refs(new_bfqq)) -+ return NULL; -+ -+ /* Avoid a circular list and skip interim queue merges. */ -+ while ((__bfqq = new_bfqq->new_bfqq)) { -+ if (__bfqq == bfqq) -+ return NULL; -+ new_bfqq = __bfqq; -+ } -+ -+ process_refs = bfqq_process_refs(bfqq); -+ new_process_refs = bfqq_process_refs(new_bfqq); -+ /* -+ * If the process for the bfqq has gone away, there is no -+ * sense in merging the queues. -+ */ -+ if (process_refs == 0 || new_process_refs == 0) -+ return NULL; -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", -+ new_bfqq->pid); -+ -+ /* -+ * Merging is just a redirection: the requests of the process -+ * owning one of the two queues are redirected to the other queue. -+ * The latter queue, in its turn, is set as shared if this is the -+ * first time that the requests of some process are redirected to -+ * it. -+ * -+ * We redirect bfqq to new_bfqq and not the opposite, because we -+ * are in the context of the process owning bfqq, hence we have -+ * the io_cq of this process. So we can immediately configure this -+ * io_cq to redirect the requests of the process to new_bfqq. -+ * -+ * NOTE, even if new_bfqq coincides with the in-service queue, the -+ * io_cq of new_bfqq is not available, because, if the in-service -+ * queue is shared, bfqd->in_service_bic may not point to the -+ * io_cq of the in-service queue. 
-+ * Redirecting the requests of the process owning bfqq to the -+ * currently in-service queue is in any case the best option, as -+ * we feed the in-service queue with new requests close to the -+ * last request served and, by doing so, hopefully increase the -+ * throughput. -+ */ -+ bfqq->new_bfqq = new_bfqq; -+ new_bfqq->ref += process_refs; -+ return new_bfqq; -+} -+ -+static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, -+ struct bfq_queue *new_bfqq) -+{ -+ if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) || -+ (bfqq->ioprio_class != new_bfqq->ioprio_class)) -+ return false; -+ -+ /* -+ * If either of the queues has already been detected as seeky, -+ * then merging it with the other queue is unlikely to lead to -+ * sequential I/O. -+ */ -+ if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq)) -+ return false; -+ -+ /* -+ * Interleaved I/O is known to be done by (some) applications -+ * only for reads, so it does not make sense to merge async -+ * queues. -+ */ -+ if (!bfq_bfqq_sync(bfqq) || !bfq_bfqq_sync(new_bfqq)) -+ return false; -+ -+ return true; -+} -+ -+/* -+ * If this function returns true, then bfqq cannot be merged. The idea -+ * is that true cooperation happens very early after processes start -+ * to do I/O. Usually, late cooperations are just accidental false -+ * positives. In case bfqq is weight-raised, such false positives -+ * would evidently degrade latency guarantees for bfqq. -+ */ -+static bool wr_from_too_long(struct bfq_queue *bfqq) -+{ -+ return bfqq->wr_coeff > 1 && -+ time_is_before_jiffies(bfqq->last_wr_start_finish + -+ msecs_to_jiffies(100)); -+} -+ -+/* -+ * Attempt to schedule a merge of bfqq with the currently in-service -+ * queue or with a close queue among the scheduled queues. Return -+ * NULL if no merge was scheduled, a pointer to the shared bfq_queue -+ * structure otherwise. 
-+ * -+ * The OOM queue is not allowed to participate to cooperation: in fact, since -+ * the requests temporarily redirected to the OOM queue could be redirected -+ * again to dedicated queues at any time, the state needed to correctly -+ * handle merging with the OOM queue would be quite complex and expensive -+ * to maintain. Besides, in such a critical condition as an out of memory, -+ * the benefits of queue merging may be little relevant, or even negligible. -+ * -+ * Weight-raised queues can be merged only if their weight-raising -+ * period has just started. In fact cooperating processes are usually -+ * started together. Thus, with this filter we avoid false positives -+ * that would jeopardize low-latency guarantees. -+ * -+ * WARNING: queue merging may impair fairness among non-weight raised -+ * queues, for at least two reasons: 1) the original weight of a -+ * merged queue may change during the merged state, 2) even being the -+ * weight the same, a merged queue may be bloated with many more -+ * requests than the ones produced by its originally-associated -+ * process. -+ */ -+static struct bfq_queue * -+bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ void *io_struct, bool request) -+{ -+ struct bfq_queue *in_service_bfqq, *new_bfqq; -+ -+ if (bfqq->new_bfqq) -+ return bfqq->new_bfqq; -+ -+ if (io_struct && wr_from_too_long(bfqq) && -+ likely(bfqq != &bfqd->oom_bfqq)) -+ bfq_log_bfqq(bfqd, bfqq, -+ "would have looked for coop, but bfq%d wr", -+ bfqq->pid); -+ -+ if (!io_struct || -+ wr_from_too_long(bfqq) || -+ unlikely(bfqq == &bfqd->oom_bfqq)) -+ return NULL; -+ -+ /* If there is only one backlogged queue, don't search. 
*/ -+ if (bfqd->busy_queues == 1) -+ return NULL; -+ -+ in_service_bfqq = bfqd->in_service_queue; -+ -+ if (in_service_bfqq && in_service_bfqq != bfqq && -+ bfqd->in_service_bic && wr_from_too_long(in_service_bfqq) -+ && likely(in_service_bfqq == &bfqd->oom_bfqq)) -+ bfq_log_bfqq(bfqd, bfqq, -+ "would have tried merge with in-service-queue, but wr"); -+ -+ if (!in_service_bfqq || in_service_bfqq == bfqq || -+ !bfqd->in_service_bic || wr_from_too_long(in_service_bfqq) || -+ unlikely(in_service_bfqq == &bfqd->oom_bfqq)) -+ goto check_scheduled; -+ -+ if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && -+ bfqq->entity.parent == in_service_bfqq->entity.parent && -+ bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) { -+ new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); -+ if (new_bfqq) -+ return new_bfqq; -+ } -+ /* -+ * Check whether there is a cooperator among currently scheduled -+ * queues. The only thing we need is that the bio/request is not -+ * NULL, as we need it to establish whether a cooperator exists. -+ */ -+check_scheduled: -+ new_bfqq = bfq_find_close_cooperator(bfqd, bfqq, -+ bfq_io_struct_pos(io_struct, request)); -+ -+ BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent); -+ -+ if (new_bfqq && wr_from_too_long(new_bfqq) && -+ likely(new_bfqq != &bfqd->oom_bfqq) && -+ bfq_may_be_close_cooperator(bfqq, new_bfqq)) -+ bfq_log_bfqq(bfqd, bfqq, -+ "would have merged with bfq%d, but wr", -+ new_bfqq->pid); -+ -+ if (new_bfqq && !wr_from_too_long(new_bfqq) && -+ likely(new_bfqq != &bfqd->oom_bfqq) && -+ bfq_may_be_close_cooperator(bfqq, new_bfqq)) -+ return bfq_setup_merge(bfqq, new_bfqq); -+ -+ return NULL; -+} -+ -+static void bfq_bfqq_save_state(struct bfq_queue *bfqq) -+{ -+ struct bfq_io_cq *bic = bfqq->bic; -+ -+ /* -+ * If !bfqq->bic, the queue is already shared or its requests -+ * have already been redirected to a shared queue; both idle window -+ * and weight raising state have already been saved. Do nothing. 
-+ */ -+ if (!bic) -+ return; -+ -+ bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq); -+ bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); -+ bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); -+ bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); -+ bic->saved_wr_coeff = bfqq->wr_coeff; -+ bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; -+ bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; -+ bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; -+ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); -+} -+ -+static void bfq_get_bic_reference(struct bfq_queue *bfqq) -+{ -+ /* -+ * If bfqq->bic has a non-NULL value, the bic to which it belongs -+ * is about to begin using a shared bfq_queue. -+ */ -+ if (bfqq->bic) -+ atomic_long_inc(&bfqq->bic->icq.ioc->refcount); -+} -+ -+static void -+bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, -+ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) -+{ -+ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", -+ (unsigned long) new_bfqq->pid); -+ /* Save weight raising and idle window of the merged queues */ -+ bfq_bfqq_save_state(bfqq); -+ bfq_bfqq_save_state(new_bfqq); -+ if (bfq_bfqq_IO_bound(bfqq)) -+ bfq_mark_bfqq_IO_bound(new_bfqq); -+ bfq_clear_bfqq_IO_bound(bfqq); -+ -+ /* -+ * If bfqq is weight-raised, then let new_bfqq inherit -+ * weight-raising. To reduce false positives, neglect the case -+ * where bfqq has just been created, but has not yet made it -+ * to be weight-raised (which may happen because EQM may merge -+ * bfqq even before bfq_add_request is executed for the first -+ * time for bfqq). Handling this case would however be very -+ * easy, thanks to the flag just_created. 
-+ */ -+ if (new_bfqq->wr_coeff == 1 && bfqq->wr_coeff > 1) { -+ new_bfqq->wr_coeff = bfqq->wr_coeff; -+ new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time; -+ new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish; -+ new_bfqq->wr_start_at_switch_to_srt = -+ bfqq->wr_start_at_switch_to_srt; -+ if (bfq_bfqq_busy(new_bfqq)) { -+ bfqd->wr_busy_queues++; -+ BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues); -+ } -+ -+ new_bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqd, new_bfqq, -+ "wr start after merge with %d, rais_max_time %u", -+ bfqq->pid, -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } -+ -+ if (bfqq->wr_coeff > 1) { /* bfqq has given its wr to new_bfqq */ -+ bfqq->wr_coeff = 1; -+ bfqq->entity.prio_changed = 1; -+ if (bfq_bfqq_busy(bfqq)) { -+ bfqd->wr_busy_queues--; -+ BUG_ON(bfqd->wr_busy_queues < 0); -+ } -+ -+ } -+ -+ bfq_log_bfqq(bfqd, new_bfqq, "merge_bfqqs: wr_busy %d", -+ bfqd->wr_busy_queues); -+ -+ /* -+ * Grab a reference to the bic, to prevent it from being destroyed -+ * before being possibly touched by a bfq_split_bfqq(). -+ */ -+ bfq_get_bic_reference(bfqq); -+ bfq_get_bic_reference(new_bfqq); -+ /* -+ * Merge queues (that is, let bic redirect its requests to new_bfqq) -+ */ -+ bic_set_bfqq(bic, new_bfqq, 1); -+ bfq_mark_bfqq_coop(new_bfqq); -+ /* -+ * new_bfqq now belongs to at least two bics (it is a shared queue): -+ * set new_bfqq->bic to NULL. bfqq either: -+ * - does not belong to any bic any more, and hence bfqq->bic must -+ * be set to NULL, or -+ * - is a queue whose owning bics have already been redirected to a -+ * different queue, hence the queue is destined to not belong to -+ * any bic soon and bfqq->bic is already NULL (therefore the next -+ * assignment causes no harm). 
-+ */ -+ new_bfqq->bic = NULL; -+ bfqq->bic = NULL; -+ /* release process reference to bfqq */ -+ bfq_put_queue(bfqq); -+} -+ -+static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq, -+ struct bio *bio) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ bool is_sync = op_is_sync(bio->bi_opf); -+ struct bfq_io_cq *bic; -+ struct bfq_queue *bfqq, *new_bfqq; -+ -+ /* -+ * Disallow merge of a sync bio into an async request. -+ */ -+ if (is_sync && !rq_is_sync(rq)) -+ return false; -+ -+ /* -+ * Lookup the bfqq that this bio will be queued with. Allow -+ * merge only if rq is queued there. -+ * Queue lock is held here. -+ */ -+ bic = bfq_bic_lookup(bfqd, current->io_context); -+ if (!bic) -+ return false; -+ -+ bfqq = bic_to_bfqq(bic, is_sync); -+ /* -+ * We take advantage of this function to perform an early merge -+ * of the queues of possible cooperating processes. -+ */ -+ if (bfqq) { -+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); -+ if (new_bfqq) { -+ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); -+ /* -+ * If we get here, the bio will be queued in the -+ * shared queue, i.e., new_bfqq, so use new_bfqq -+ * to decide whether bio and rq can be merged. -+ */ -+ bfqq = new_bfqq; -+ } -+ } -+ -+ return bfqq == RQ_BFQQ(rq); -+} -+ -+static int bfq_allow_rq_merge(struct request_queue *q, struct request *rq, -+ struct request *next) -+{ -+ return RQ_BFQQ(rq) == RQ_BFQQ(next); -+} -+ -+/* -+ * Set the maximum time for the in-service queue to consume its -+ * budget. This prevents seeky processes from lowering the throughput. -+ * In practice, a time-slice service scheme is used with seeky -+ * processes. 
-+ */ -+static void bfq_set_budget_timeout(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ unsigned int timeout_coeff; -+ -+ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time) -+ timeout_coeff = 1; -+ else -+ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; -+ -+ bfqd->last_budget_start = ktime_get(); -+ -+ bfqq->budget_timeout = jiffies + -+ bfqd->bfq_timeout * timeout_coeff; -+ -+ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", -+ jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff)); -+} -+ -+static void __bfq_set_in_service_queue(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ if (bfqq) { -+ bfqg_stats_update_avg_queue_size(bfqq_group(bfqq)); -+ bfq_mark_bfqq_must_alloc(bfqq); -+ bfq_clear_bfqq_fifo_expire(bfqq); -+ -+ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; -+ -+ BUG_ON(bfqq == bfqd->in_service_queue); -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ -+ if (time_is_before_jiffies(bfqq->last_wr_start_finish) && -+ bfqq->wr_coeff > 1 && -+ bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && -+ time_is_before_jiffies(bfqq->budget_timeout)) { -+ /* -+ * For soft real-time queues, move the start -+ * of the weight-raising period forward by the -+ * time the queue has not received any -+ * service. Otherwise, a relatively long -+ * service delay is likely to cause the -+ * weight-raising period of the queue to end, -+ * because of the short duration of the -+ * weight-raising period of a soft real-time -+ * queue. It is worth noting that this move -+ * is not so dangerous for the other queues, -+ * because soft real-time queues are not -+ * greedy. -+ * -+ * To not add a further variable, we use the -+ * overloaded field budget_timeout to -+ * determine for how long the queue has not -+ * received service, i.e., how much time has -+ * elapsed since the queue expired. 
However, -+ * this is a little imprecise, because -+ * budget_timeout is set to jiffies if bfqq -+ * not only expires, but also remains with no -+ * request. -+ */ -+ if (time_after(bfqq->budget_timeout, -+ bfqq->last_wr_start_finish)) -+ bfqq->last_wr_start_finish += -+ jiffies - bfqq->budget_timeout; -+ else -+ bfqq->last_wr_start_finish = jiffies; -+ -+ if (time_is_after_jiffies(bfqq->last_wr_start_finish)) { -+ pr_crit( -+ "BFQ WARNING:last %lu budget %lu jiffies %lu", -+ bfqq->last_wr_start_finish, -+ bfqq->budget_timeout, -+ jiffies); -+ pr_crit("diff %lu", jiffies - -+ max_t(unsigned long, -+ bfqq->last_wr_start_finish, -+ bfqq->budget_timeout)); -+ bfqq->last_wr_start_finish = jiffies; -+ } -+ } -+ -+ bfq_set_budget_timeout(bfqd, bfqq); -+ bfq_log_bfqq(bfqd, bfqq, -+ "set_in_service_queue, cur-budget = %d", -+ bfqq->entity.budget); -+ } else -+ bfq_log(bfqd, "set_in_service_queue: NULL"); -+ -+ bfqd->in_service_queue = bfqq; -+} -+ -+/* -+ * Get and set a new queue for service. -+ */ -+static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd); -+ -+ __bfq_set_in_service_queue(bfqd, bfqq); -+ return bfqq; -+} -+ -+static void bfq_arm_slice_timer(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq = bfqd->in_service_queue; -+ struct bfq_io_cq *bic; -+ u32 sl; -+ -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); -+ -+ /* Processes have exited, don't wait. */ -+ bic = bfqd->in_service_bic; -+ if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0) -+ return; -+ -+ bfq_mark_bfqq_wait_request(bfqq); -+ -+ /* -+ * We don't want to idle for seeks, but we do want to allow -+ * fair distribution of slice time for a process doing back-to-back -+ * seeks. So allow a little bit of time for him to submit a new rq. 
-+ * -+ * To prevent processes with (partly) seeky workloads from -+ * being too ill-treated, grant them a small fraction of the -+ * assigned budget before reducing the waiting time to -+ * BFQ_MIN_TT. This happened to help reduce latency. -+ */ -+ sl = bfqd->bfq_slice_idle; -+ /* -+ * Unless the queue is being weight-raised or the scenario is -+ * asymmetric, grant only minimum idle time if the queue -+ * is seeky. A long idling is preserved for a weight-raised -+ * queue, or, more in general, in an asymemtric scenario, -+ * because a long idling is needed for guaranteeing to a queue -+ * its reserved share of the throughput (in particular, it is -+ * needed if the queue has a higher weight than some other -+ * queue). -+ */ -+ if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 && -+ bfq_symmetric_scenario(bfqd)) -+ sl = min_t(u32, sl, BFQ_MIN_TT); -+ -+ bfqd->last_idling_start = ktime_get(); -+ hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl), -+ HRTIMER_MODE_REL); -+ bfqg_stats_set_start_idle_time(bfqq_group(bfqq)); -+ bfq_log(bfqd, "arm idle: %ld/%ld ms", -+ sl / NSEC_PER_MSEC, bfqd->bfq_slice_idle / NSEC_PER_MSEC); -+} -+ -+/* -+ * In autotuning mode, max_budget is dynamically recomputed as the -+ * amount of sectors transferred in timeout at the estimated peak -+ * rate. This enables BFQ to utilize a full timeslice with a full -+ * budget, even if the in-service queue is served at peak rate. And -+ * this maximises throughput with sequential workloads. -+ */ -+static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd) -+{ -+ return (u64)bfqd->peak_rate * USEC_PER_MSEC * -+ jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT; -+} -+ -+/* -+ * Update parameters related to throughput and responsiveness, as a -+ * function of the estimated peak rate. See comments on -+ * bfq_calc_max_budget(), and on T_slow and T_fast arrays. 
-+ */ -+static void update_thr_responsiveness_params(struct bfq_data *bfqd) -+{ -+ int dev_type = blk_queue_nonrot(bfqd->queue); -+ -+ if (bfqd->bfq_user_max_budget == 0) { -+ bfqd->bfq_max_budget = -+ bfq_calc_max_budget(bfqd); -+ BUG_ON(bfqd->bfq_max_budget < 0); -+ bfq_log(bfqd, "new max_budget = %d", -+ bfqd->bfq_max_budget); -+ } -+ -+ if (bfqd->device_speed == BFQ_BFQD_FAST && -+ bfqd->peak_rate < device_speed_thresh[dev_type]) { -+ bfqd->device_speed = BFQ_BFQD_SLOW; -+ bfqd->RT_prod = R_slow[dev_type] * -+ T_slow[dev_type]; -+ } else if (bfqd->device_speed == BFQ_BFQD_SLOW && -+ bfqd->peak_rate > device_speed_thresh[dev_type]) { -+ bfqd->device_speed = BFQ_BFQD_FAST; -+ bfqd->RT_prod = R_fast[dev_type] * -+ T_fast[dev_type]; -+ } -+ -+ bfq_log(bfqd, -+"dev_type %s dev_speed_class = %s (%llu sects/sec), thresh %llu setcs/sec", -+ dev_type == 0 ? "ROT" : "NONROT", -+ bfqd->device_speed == BFQ_BFQD_FAST ? "FAST" : "SLOW", -+ bfqd->device_speed == BFQ_BFQD_FAST ? -+ (USEC_PER_SEC*(u64)R_fast[dev_type])>>BFQ_RATE_SHIFT : -+ (USEC_PER_SEC*(u64)R_slow[dev_type])>>BFQ_RATE_SHIFT, -+ (USEC_PER_SEC*(u64)device_speed_thresh[dev_type])>> -+ BFQ_RATE_SHIFT); -+} -+ -+static void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq) -+{ -+ if (rq != NULL) { /* new rq dispatch now, reset accordingly */ -+ bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns() ; -+ bfqd->peak_rate_samples = 1; -+ bfqd->sequential_samples = 0; -+ bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size = -+ blk_rq_sectors(rq); -+ } else /* no new rq dispatched, just reset the number of samples */ -+ bfqd->peak_rate_samples = 0; /* full re-init on next disp. 
*/ -+ -+ bfq_log(bfqd, -+ "reset_rate_computation at end, sample %u/%u tot_sects %llu", -+ bfqd->peak_rate_samples, bfqd->sequential_samples, -+ bfqd->tot_sectors_dispatched); -+} -+ -+static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) -+{ -+ u32 rate, weight, divisor; -+ -+ /* -+ * For the convergence property to hold (see comments on -+ * bfq_update_peak_rate()) and for the assessment to be -+ * reliable, a minimum number of samples must be present, and -+ * a minimum amount of time must have elapsed. If not so, do -+ * not compute new rate. Just reset parameters, to get ready -+ * for a new evaluation attempt. -+ */ -+ if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES || -+ bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL) { -+ bfq_log(bfqd, -+ "update_rate_reset: only resetting, delta_first %lluus samples %d", -+ bfqd->delta_from_first>>10, bfqd->peak_rate_samples); -+ goto reset_computation; -+ } -+ -+ /* -+ * If a new request completion has occurred after last -+ * dispatch, then, to approximate the rate at which requests -+ * have been served by the device, it is more precise to -+ * extend the observation interval to the last completion. -+ */ -+ bfqd->delta_from_first = -+ max_t(u64, bfqd->delta_from_first, -+ bfqd->last_completion - bfqd->first_dispatch); -+ -+ BUG_ON(bfqd->delta_from_first == 0); -+ /* -+ * Rate computed in sects/usec, and not sects/nsec, for -+ * precision issues. 
-+ */ -+ rate = div64_ul(bfqd->tot_sectors_dispatched<<BFQ_RATE_SHIFT, -+ div_u64(bfqd->delta_from_first, NSEC_PER_USEC)); -+ -+ bfq_log(bfqd, -+"update_rate_reset: tot_sects %llu delta_first %lluus rate %llu sects/s (%d)", -+ bfqd->tot_sectors_dispatched, bfqd->delta_from_first>>10, -+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), -+ rate > 20<<BFQ_RATE_SHIFT); -+ -+ /* -+ * Peak rate not updated if: -+ * - the percentage of sequential dispatches is below 3/4 of the -+ * total, and rate is below the current estimated peak rate -+ * - rate is unreasonably high (> 20M sectors/sec) -+ */ -+ if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 && -+ rate <= bfqd->peak_rate) || -+ rate > 20<<BFQ_RATE_SHIFT) { -+ bfq_log(bfqd, -+ "update_rate_reset: goto reset, samples %u/%u rate/peak %llu/%llu", -+ bfqd->peak_rate_samples, bfqd->sequential_samples, -+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), -+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); -+ goto reset_computation; -+ } else { -+ bfq_log(bfqd, -+ "update_rate_reset: do update, samples %u/%u rate/peak %llu/%llu", -+ bfqd->peak_rate_samples, bfqd->sequential_samples, -+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), -+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); -+ } -+ -+ /* -+ * We have to update the peak rate, at last! To this purpose, -+ * we use a low-pass filter. We compute the smoothing constant -+ * of the filter as a function of the 'weight' of the new -+ * measured rate. -+ * -+ * As can be seen in next formulas, we define this weight as a -+ * quantity proportional to how sequential the workload is, -+ * and to how long the observation time interval is. -+ * -+ * The weight runs from 0 to 8. The maximum value of the -+ * weight, 8, yields the minimum value for the smoothing -+ * constant. At this minimum value for the smoothing constant, -+ * the measured rate contributes for half of the next value of -+ * the estimated peak rate. 
-+ * -+ * So, the first step is to compute the weight as a function -+ * of how sequential the workload is. Note that the weight -+ * cannot reach 9, because bfqd->sequential_samples cannot -+ * become equal to bfqd->peak_rate_samples, which, in its -+ * turn, holds true because bfqd->sequential_samples is not -+ * incremented for the first sample. -+ */ -+ weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples; -+ -+ /* -+ * Second step: further refine the weight as a function of the -+ * duration of the observation interval. -+ */ -+ weight = min_t(u32, 8, -+ div_u64(weight * bfqd->delta_from_first, -+ BFQ_RATE_REF_INTERVAL)); -+ -+ /* -+ * Divisor ranging from 10, for minimum weight, to 2, for -+ * maximum weight. -+ */ -+ divisor = 10 - weight; -+ BUG_ON(divisor == 0); -+ -+ /* -+ * Finally, update peak rate: -+ * -+ * peak_rate = peak_rate * (divisor-1) / divisor + rate / divisor -+ */ -+ bfqd->peak_rate *= divisor-1; -+ bfqd->peak_rate /= divisor; -+ rate /= divisor; /* smoothing constant alpha = 1/divisor */ -+ -+ bfq_log(bfqd, -+ "update_rate_reset: divisor %d tmp_peak_rate %llu tmp_rate %u", -+ divisor, -+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT), -+ (u32)((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT)); -+ -+ BUG_ON(bfqd->peak_rate == 0); -+ BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT); -+ -+ bfqd->peak_rate += rate; -+ update_thr_responsiveness_params(bfqd); -+ BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT); -+ -+reset_computation: -+ bfq_reset_rate_computation(bfqd, rq); -+} -+ -+/* -+ * Update the read/write peak rate (the main quantity used for -+ * auto-tuning, see update_thr_responsiveness_params()). -+ * -+ * It is not trivial to estimate the peak rate (correctly): because of -+ * the presence of sw and hw queues between the scheduler and the -+ * device components that finally serve I/O requests, it is hard to -+ * say exactly when a given dispatched request is served inside the -+ * device, and for how long. 
As a consequence, it is hard to know -+ * precisely at what rate a given set of requests is actually served -+ * by the device. -+ * -+ * On the opposite end, the dispatch time of any request is trivially -+ * available, and, from this piece of information, the "dispatch rate" -+ * of requests can be immediately computed. So, the idea in the next -+ * function is to use what is known, namely request dispatch times -+ * (plus, when useful, request completion times), to estimate what is -+ * unknown, namely in-device request service rate. -+ * -+ * The main issue is that, because of the above facts, the rate at -+ * which a certain set of requests is dispatched over a certain time -+ * interval can vary greatly with respect to the rate at which the -+ * same requests are then served. But, since the size of any -+ * intermediate queue is limited, and the service scheme is lossless -+ * (no request is silently dropped), the following obvious convergence -+ * property holds: the number of requests dispatched MUST become -+ * closer and closer to the number of requests completed as the -+ * observation interval grows. This is the key property used in -+ * the next function to estimate the peak service rate as a function -+ * of the observed dispatch rate. The function assumes to be invoked -+ * on every request dispatch. -+ */ -+static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) -+{ -+ u64 now_ns = ktime_get_ns(); -+ -+ if (bfqd->peak_rate_samples == 0) { /* first dispatch */ -+ bfq_log(bfqd, -+ "update_peak_rate: goto reset, samples %d", -+ bfqd->peak_rate_samples) ; -+ bfq_reset_rate_computation(bfqd, rq); -+ goto update_last_values; /* will add one sample */ -+ } -+ -+ /* -+ * Device idle for very long: the observation interval lasting -+ * up to this dispatch cannot be a valid observation interval -+ * for computing a new peak rate (similarly to the late- -+ * completion event in bfq_completed_request()). 
Go to -+ * update_rate_and_reset to have the following three steps -+ * taken: -+ * - close the observation interval at the last (previous) -+ * request dispatch or completion -+ * - compute rate, if possible, for that observation interval -+ * - start a new observation interval with this dispatch -+ */ -+ if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC && -+ bfqd->rq_in_driver == 0) { -+ bfq_log(bfqd, -+"update_peak_rate: jumping to updating&resetting delta_last %lluus samples %d", -+ (now_ns - bfqd->last_dispatch)>>10, -+ bfqd->peak_rate_samples) ; -+ goto update_rate_and_reset; -+ } -+ -+ /* Update sampling information */ -+ bfqd->peak_rate_samples++; -+ -+ if ((bfqd->rq_in_driver > 0 || -+ now_ns - bfqd->last_completion < BFQ_MIN_TT) -+ && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR) -+ bfqd->sequential_samples++; -+ -+ bfqd->tot_sectors_dispatched += blk_rq_sectors(rq); -+ -+ /* Reset max observed rq size every 32 dispatches */ -+ if (likely(bfqd->peak_rate_samples % 32)) -+ bfqd->last_rq_max_size = -+ max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size); -+ else -+ bfqd->last_rq_max_size = blk_rq_sectors(rq); -+ -+ bfqd->delta_from_first = now_ns - bfqd->first_dispatch; -+ -+ bfq_log(bfqd, -+ "update_peak_rate: added samples %u/%u tot_sects %llu delta_first %lluus", -+ bfqd->peak_rate_samples, bfqd->sequential_samples, -+ bfqd->tot_sectors_dispatched, -+ bfqd->delta_from_first>>10); -+ -+ /* Target observation interval not yet reached, go on sampling */ -+ if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL) -+ goto update_last_values; -+ -+update_rate_and_reset: -+ bfq_update_rate_reset(bfqd, rq); -+update_last_values: -+ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); -+ bfqd->last_dispatch = now_ns; -+ -+ bfq_log(bfqd, -+ "update_peak_rate: delta_first %lluus last_pos %llu peak_rate %llu", -+ (now_ns - bfqd->first_dispatch)>>10, -+ (unsigned long long) bfqd->last_position, -+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); 
-+ bfq_log(bfqd, -+ "update_peak_rate: samples at end %d", bfqd->peak_rate_samples); -+} -+ -+/* -+ * Move request from internal lists to the dispatch list of the request queue -+ */ -+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ -+ /* -+ * For consistency, the next instruction should have been executed -+ * after removing the request from the queue and dispatching it. -+ * We execute instead this instruction before bfq_remove_request() -+ * (and hence introduce a temporary inconsistency), for efficiency. -+ * In fact, in a forced_dispatch, this prevents two counters related -+ * to bfqq->dispatched to risk to be uselessly decremented if bfqq -+ * is not in service, and then to be incremented again after -+ * incrementing bfqq->dispatched. -+ */ -+ bfqq->dispatched++; -+ bfq_update_peak_rate(q->elevator->elevator_data, rq); -+ -+ bfq_remove_request(rq); -+ elv_dispatch_sort(q, rq); -+} -+ -+static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ BUG_ON(bfqq != bfqd->in_service_queue); -+ -+ /* -+ * If this bfqq is shared between multiple processes, check -+ * to make sure that those processes are still issuing I/Os -+ * within the mean seek distance. If not, it may be time to -+ * break the queues apart again. -+ */ -+ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq)) -+ bfq_mark_bfqq_split_coop(bfqq); -+ -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { -+ if (bfqq->dispatched == 0) -+ /* -+ * Overloading budget_timeout field to store -+ * the time at which the queue remains with no -+ * backlog and no outstanding request; used by -+ * the weight-raising mechanism. -+ */ -+ bfqq->budget_timeout = jiffies; -+ -+ bfq_del_bfqq_busy(bfqd, bfqq, true); -+ } else { -+ bfq_requeue_bfqq(bfqd, bfqq); -+ /* -+ * Resort priority tree of potential close cooperators. 
-+ */ -+ bfq_pos_tree_add_move(bfqd, bfqq); -+ } -+ -+ /* -+ * All in-service entities must have been properly deactivated -+ * or requeued before executing the next function, which -+ * resets all in-service entites as no more in service. -+ */ -+ __bfq_bfqd_reset_in_service(bfqd); -+} -+ -+/** -+ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior. -+ * @bfqd: device data. -+ * @bfqq: queue to update. -+ * @reason: reason for expiration. -+ * -+ * Handle the feedback on @bfqq budget at queue expiration. -+ * See the body for detailed comments. -+ */ -+static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ enum bfqq_expiration reason) -+{ -+ struct request *next_rq; -+ int budget, min_budget; -+ -+ BUG_ON(bfqq != bfqd->in_service_queue); -+ -+ min_budget = bfq_min_budget(bfqd); -+ -+ if (bfqq->wr_coeff == 1) -+ budget = bfqq->max_budget; -+ else /* -+ * Use a constant, low budget for weight-raised queues, -+ * to help achieve a low latency. Keep it slightly higher -+ * than the minimum possible budget, to cause a little -+ * bit fewer expirations. -+ */ -+ budget = 2 * min_budget; -+ -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d", -+ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d", -+ budget, bfq_min_budget(bfqd)); -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", -+ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); -+ -+ if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) { -+ switch (reason) { -+ /* -+ * Caveat: in all the following cases we trade latency -+ * for throughput. 
-+ */ -+ case BFQ_BFQQ_TOO_IDLE: -+ /* -+ * This is the only case where we may reduce -+ * the budget: if there is no request of the -+ * process still waiting for completion, then -+ * we assume (tentatively) that the timer has -+ * expired because the batch of requests of -+ * the process could have been served with a -+ * smaller budget. Hence, betting that -+ * process will behave in the same way when it -+ * becomes backlogged again, we reduce its -+ * next budget. As long as we guess right, -+ * this budget cut reduces the latency -+ * experienced by the process. -+ * -+ * However, if there are still outstanding -+ * requests, then the process may have not yet -+ * issued its next request just because it is -+ * still waiting for the completion of some of -+ * the still outstanding ones. So in this -+ * subcase we do not reduce its budget, on the -+ * contrary we increase it to possibly boost -+ * the throughput, as discussed in the -+ * comments to the BUDGET_TIMEOUT case. -+ */ -+ if (bfqq->dispatched > 0) /* still outstanding reqs */ -+ budget = min(budget * 2, bfqd->bfq_max_budget); -+ else { -+ if (budget > 5 * min_budget) -+ budget -= 4 * min_budget; -+ else -+ budget = min_budget; -+ } -+ break; -+ case BFQ_BFQQ_BUDGET_TIMEOUT: -+ /* -+ * We double the budget here because it gives -+ * the chance to boost the throughput if this -+ * is not a seeky process (and has bumped into -+ * this timeout because of, e.g., ZBR). -+ */ -+ budget = min(budget * 2, bfqd->bfq_max_budget); -+ break; -+ case BFQ_BFQQ_BUDGET_EXHAUSTED: -+ /* -+ * The process still has backlog, and did not -+ * let either the budget timeout or the disk -+ * idling timeout expire. Hence it is not -+ * seeky, has a short thinktime and may be -+ * happy with a higher budget too. So -+ * definitely increase the budget of this good -+ * candidate to boost the disk throughput. 
-+ */ -+ budget = min(budget * 4, bfqd->bfq_max_budget); -+ break; -+ case BFQ_BFQQ_NO_MORE_REQUESTS: -+ /* -+ * For queues that expire for this reason, it -+ * is particularly important to keep the -+ * budget close to the actual service they -+ * need. Doing so reduces the timestamp -+ * misalignment problem described in the -+ * comments in the body of -+ * __bfq_activate_entity. In fact, suppose -+ * that a queue systematically expires for -+ * BFQ_BFQQ_NO_MORE_REQUESTS and presents a -+ * new request in time to enjoy timestamp -+ * back-shifting. The larger the budget of the -+ * queue is with respect to the service the -+ * queue actually requests in each service -+ * slot, the more times the queue can be -+ * reactivated with the same virtual finish -+ * time. It follows that, even if this finish -+ * time is pushed to the system virtual time -+ * to reduce the consequent timestamp -+ * misalignment, the queue unjustly enjoys for -+ * many re-activations a lower finish time -+ * than all newly activated queues. -+ * -+ * The service needed by bfqq is measured -+ * quite precisely by bfqq->entity.service. -+ * Since bfqq does not enjoy device idling, -+ * bfqq->entity.service is equal to the number -+ * of sectors that the process associated with -+ * bfqq requested to read/write before waiting -+ * for request completions, or blocking for -+ * other reasons. -+ */ -+ budget = max_t(int, bfqq->entity.service, min_budget); -+ break; -+ default: -+ return; -+ } -+ } else if (!bfq_bfqq_sync(bfqq)) -+ /* -+ * Async queues get always the maximum possible -+ * budget, as for them we do not care about latency -+ * (in addition, their ability to dispatch is limited -+ * by the charging factor). 
-+ */ -+ budget = bfqd->bfq_max_budget; -+ -+ bfqq->max_budget = budget; -+ -+ if (bfqd->budgets_assigned >= bfq_stats_min_budgets && -+ !bfqd->bfq_user_max_budget) -+ bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget); -+ -+ /* -+ * If there is still backlog, then assign a new budget, making -+ * sure that it is large enough for the next request. Since -+ * the finish time of bfqq must be kept in sync with the -+ * budget, be sure to call __bfq_bfqq_expire() *after* this -+ * update. -+ * -+ * If there is no backlog, then no need to update the budget; -+ * it will be updated on the arrival of a new request. -+ */ -+ next_rq = bfqq->next_rq; -+ if (next_rq) { -+ BUG_ON(reason == BFQ_BFQQ_TOO_IDLE || -+ reason == BFQ_BFQQ_NO_MORE_REQUESTS); -+ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, -+ bfq_serv_to_charge(next_rq, bfqq)); -+ BUG_ON(!bfq_bfqq_busy(bfqq)); -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ } -+ -+ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d", -+ next_rq ? blk_rq_sectors(next_rq) : 0, -+ bfqq->entity.budget); -+} -+ -+/* -+ * Return true if the process associated with bfqq is "slow". The slow -+ * flag is used, in addition to the budget timeout, to reduce the -+ * amount of service provided to seeky processes, and thus reduce -+ * their chances to lower the throughput. More details in the comments -+ * on the function bfq_bfqq_expire(). -+ * -+ * An important observation is in order: as discussed in the comments -+ * on the function bfq_update_peak_rate(), with devices with internal -+ * queues, it is hard if ever possible to know when and for how long -+ * an I/O request is processed by the device (apart from the trivial -+ * I/O pattern where a new request is dispatched only after the -+ * previous one has been completed). This makes it hard to evaluate -+ * the real rate at which the I/O requests of each bfq_queue are -+ * served. 
In fact, for an I/O scheduler like BFQ, serving a -+ * bfq_queue means just dispatching its requests during its service -+ * slot (i.e., until the budget of the queue is exhausted, or the -+ * queue remains idle, or, finally, a timeout fires). But, during the -+ * service slot of a bfq_queue, around 100 ms at most, the device may -+ * be even still processing requests of bfq_queues served in previous -+ * service slots. On the opposite end, the requests of the in-service -+ * bfq_queue may be completed after the service slot of the queue -+ * finishes. -+ * -+ * Anyway, unless more sophisticated solutions are used -+ * (where possible), the sum of the sizes of the requests dispatched -+ * during the service slot of a bfq_queue is probably the only -+ * approximation available for the service received by the bfq_queue -+ * during its service slot. And this sum is the quantity used in this -+ * function to evaluate the I/O speed of a process. -+ */ -+static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ bool compensate, enum bfqq_expiration reason, -+ unsigned long *delta_ms) -+{ -+ ktime_t delta_ktime; -+ u32 delta_usecs; -+ bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */ -+ -+ if (!bfq_bfqq_sync(bfqq)) -+ return false; -+ -+ if (compensate) -+ delta_ktime = bfqd->last_idling_start; -+ else -+ delta_ktime = ktime_get(); -+ delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start); -+ delta_usecs = ktime_to_us(delta_ktime); -+ -+ /* don't use too short time intervals */ -+ if (delta_usecs < 1000) { -+ if (blk_queue_nonrot(bfqd->queue)) -+ /* -+ * give same worst-case guarantees as idling -+ * for seeky -+ */ -+ *delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC; -+ else /* charge at least one seek */ -+ *delta_ms = bfq_slice_idle / NSEC_PER_MSEC; -+ -+ bfq_log(bfqd, "bfq_bfqq_is_slow: too short %u", delta_usecs); -+ -+ return slow; -+ } -+ -+ *delta_ms = delta_usecs / USEC_PER_MSEC; -+ -+ /* -+ * Use only long (> 20ms) intervals 
to filter out excessive -+ * spikes in service rate estimation. -+ */ -+ if (delta_usecs > 20000) { -+ /* -+ * Caveat for rotational devices: processes doing I/O -+ * in the slower disk zones tend to be slow(er) even -+ * if not seeky. In this respect, the estimated peak -+ * rate is likely to be an average over the disk -+ * surface. Accordingly, to not be too harsh with -+ * unlucky processes, a process is deemed slow only if -+ * its rate has been lower than half of the estimated -+ * peak rate. -+ */ -+ slow = bfqq->entity.service < bfqd->bfq_max_budget / 2; -+ bfq_log(bfqd, "bfq_bfqq_is_slow: relative rate %d/%d", -+ bfqq->entity.service, bfqd->bfq_max_budget); -+ } -+ -+ bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow); -+ -+ return slow; -+} -+ -+/* -+ * To be deemed as soft real-time, an application must meet two -+ * requirements. First, the application must not require an average -+ * bandwidth higher than the approximate bandwidth required to playback or -+ * record a compressed high-definition video. -+ * The next function is invoked on the completion of the last request of a -+ * batch, to compute the next-start time instant, soft_rt_next_start, such -+ * that, if the next request of the application does not arrive before -+ * soft_rt_next_start, then the above requirement on the bandwidth is met. -+ * -+ * The second requirement is that the request pattern of the application is -+ * isochronous, i.e., that, after issuing a request or a batch of requests, -+ * the application stops issuing new requests until all its pending requests -+ * have been completed. After that, the application may issue a new batch, -+ * and so on. -+ * For this reason the next function is invoked to compute -+ * soft_rt_next_start only for applications that meet this requirement, -+ * whereas soft_rt_next_start is set to infinity for applications that do -+ * not. 
-+ * -+ * Unfortunately, even a greedy application may happen to behave in an -+ * isochronous way if the CPU load is high. In fact, the application may -+ * stop issuing requests while the CPUs are busy serving other processes, -+ * then restart, then stop again for a while, and so on. In addition, if -+ * the disk achieves a low enough throughput with the request pattern -+ * issued by the application (e.g., because the request pattern is random -+ * and/or the device is slow), then the application may meet the above -+ * bandwidth requirement too. To prevent such a greedy application to be -+ * deemed as soft real-time, a further rule is used in the computation of -+ * soft_rt_next_start: soft_rt_next_start must be higher than the current -+ * time plus the maximum time for which the arrival of a request is waited -+ * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. -+ * This filters out greedy applications, as the latter issue instead their -+ * next request as soon as possible after the last one has been completed -+ * (in contrast, when a batch of requests is completed, a soft real-time -+ * application spends some time processing data). -+ * -+ * Unfortunately, the last filter may easily generate false positives if -+ * only bfqd->bfq_slice_idle is used as a reference time interval and one -+ * or both the following cases occur: -+ * 1) HZ is so low that the duration of a jiffy is comparable to or higher -+ * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with -+ * HZ=100. -+ * 2) jiffies, instead of increasing at a constant rate, may stop increasing -+ * for a while, then suddenly 'jump' by several units to recover the lost -+ * increments. This seems to happen, e.g., inside virtual machines. -+ * To address this issue, we do not use as a reference time interval just -+ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. 
In -+ * particular we add the minimum number of jiffies for which the filter -+ * seems to be quite precise also in embedded systems and KVM/QEMU virtual -+ * machines. -+ */ -+static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ bfq_log_bfqq(bfqd, bfqq, -+"softrt_next_start: service_blkg %lu soft_rate %u sects/sec interval %u", -+ bfqq->service_from_backlogged, -+ bfqd->bfq_wr_max_softrt_rate, -+ jiffies_to_msecs(HZ * bfqq->service_from_backlogged / -+ bfqd->bfq_wr_max_softrt_rate)); -+ -+ return max(bfqq->last_idle_bklogged + -+ HZ * bfqq->service_from_backlogged / -+ bfqd->bfq_wr_max_softrt_rate, -+ jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); -+} -+ -+/* -+ * Return the farthest future time instant according to jiffies -+ * macros. -+ */ -+static unsigned long bfq_greatest_from_now(void) -+{ -+ return jiffies + MAX_JIFFY_OFFSET; -+} -+ -+/* -+ * Return the farthest past time instant according to jiffies -+ * macros. -+ */ -+static unsigned long bfq_smallest_from_now(void) -+{ -+ return jiffies - MAX_JIFFY_OFFSET; -+} -+ -+/** -+ * bfq_bfqq_expire - expire a queue. -+ * @bfqd: device owning the queue. -+ * @bfqq: the queue to expire. -+ * @compensate: if true, compensate for the time spent idling. -+ * @reason: the reason causing the expiration. -+ * -+ * If the process associated with bfqq does slow I/O (e.g., because it -+ * issues random requests), we charge bfqq with the time it has been -+ * in service instead of the service it has received (see -+ * bfq_bfqq_charge_time for details on how this goal is achieved). As -+ * a consequence, bfqq will typically get higher timestamps upon -+ * reactivation, and hence it will be rescheduled as if it had -+ * received more service than what it has actually received. In the -+ * end, bfqq receives less service in proportion to how slowly its -+ * associated process consumes its budgets (and hence how seriously it -+ * tends to lower the throughput). 
In addition, this time-charging -+ * strategy guarantees time fairness among slow processes. In -+ * contrast, if the process associated with bfqq is not slow, we -+ * charge bfqq exactly with the service it has received. -+ * -+ * Charging time to the first type of queues and the exact service to -+ * the other has the effect of using the WF2Q+ policy to schedule the -+ * former on a timeslice basis, without violating service domain -+ * guarantees among the latter. -+ */ -+static void bfq_bfqq_expire(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ bool compensate, -+ enum bfqq_expiration reason) -+{ -+ bool slow; -+ unsigned long delta = 0; -+ struct bfq_entity *entity = &bfqq->entity; -+ int ref; -+ -+ BUG_ON(bfqq != bfqd->in_service_queue); -+ -+ /* -+ * Check whether the process is slow (see bfq_bfqq_is_slow). -+ */ -+ slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); -+ -+ /* -+ * Increase service_from_backlogged before next statement, -+ * because the possible next invocation of -+ * bfq_bfqq_charge_time would likely inflate -+ * entity->service. In contrast, service_from_backlogged must -+ * contain real service, to enable the soft real-time -+ * heuristic to correctly compute the bandwidth consumed by -+ * bfqq. -+ */ -+ bfqq->service_from_backlogged += entity->service; -+ -+ /* -+ * As above explained, charge slow (typically seeky) and -+ * timed-out queues with the time and not the service -+ * received, to favor sequential workloads. -+ * -+ * Processes doing I/O in the slower disk zones will tend to -+ * be slow(er) even if not seeky. Therefore, since the -+ * estimated peak rate is actually an average over the disk -+ * surface, these processes may timeout just for bad luck. To -+ * avoid punishing them, do not charge time to processes that -+ * succeeded in consuming at least 2/3 of their budget. 
This -+ * allows BFQ to preserve enough elasticity to still perform -+ * bandwidth, and not time, distribution with little unlucky -+ * or quasi-sequential processes. -+ */ -+ if (bfqq->wr_coeff == 1 && -+ (slow || -+ (reason == BFQ_BFQQ_BUDGET_TIMEOUT && -+ bfq_bfqq_budget_left(bfqq) >= entity->budget / 3))) -+ bfq_bfqq_charge_time(bfqd, bfqq, delta); -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ -+ if (reason == BFQ_BFQQ_TOO_IDLE && -+ entity->service <= 2 * entity->budget / 10) -+ bfq_clear_bfqq_IO_bound(bfqq); -+ -+ if (bfqd->low_latency && bfqq->wr_coeff == 1) -+ bfqq->last_wr_start_finish = jiffies; -+ -+ if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 && -+ RB_EMPTY_ROOT(&bfqq->sort_list)) { -+ /* -+ * If we get here, and there are no outstanding -+ * requests, then the request pattern is isochronous -+ * (see the comments on the function -+ * bfq_bfqq_softrt_next_start()). Thus we can compute -+ * soft_rt_next_start. If, instead, the queue still -+ * has outstanding requests, then we have to wait for -+ * the completion of all the outstanding requests to -+ * discover whether the request pattern is actually -+ * isochronous. -+ */ -+ BUG_ON(bfqd->busy_queues < 1); -+ if (bfqq->dispatched == 0) { -+ bfqq->soft_rt_next_start = -+ bfq_bfqq_softrt_next_start(bfqd, bfqq); -+ bfq_log_bfqq(bfqd, bfqq, "new soft_rt_next %lu", -+ bfqq->soft_rt_next_start); -+ } else { -+ /* -+ * The application is still waiting for the -+ * completion of one or more requests: -+ * prevent it from possibly being incorrectly -+ * deemed as soft real-time by setting its -+ * soft_rt_next_start to infinity. In fact, -+ * without this assignment, the application -+ * would be incorrectly deemed as soft -+ * real-time if: -+ * 1) it issued a new request before the -+ * completion of all its in-flight -+ * requests, and -+ * 2) at that time, its soft_rt_next_start -+ * happened to be in the past. 
-+ */ -+ bfqq->soft_rt_next_start = -+ bfq_greatest_from_now(); -+ /* -+ * Schedule an update of soft_rt_next_start to when -+ * the task may be discovered to be isochronous. -+ */ -+ bfq_mark_bfqq_softrt_update(bfqq); -+ } -+ } -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "expire (%d, slow %d, num_disp %d, short_ttime %d, weight %d)", -+ reason, slow, bfqq->dispatched, -+ bfq_bfqq_has_short_ttime(bfqq), entity->weight); -+ -+ /* -+ * Increase, decrease or leave budget unchanged according to -+ * reason. -+ */ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); -+ BUG_ON(bfqq->next_rq == NULL && -+ bfqq->entity.budget < bfqq->entity.service); -+ ref = bfqq->ref; -+ __bfq_bfqq_expire(bfqd, bfqq); -+ -+ BUG_ON(ref > 1 && -+ !bfq_bfqq_busy(bfqq) && reason == BFQ_BFQQ_BUDGET_EXHAUSTED && -+ !bfq_class_idle(bfqq)); -+ -+ /* mark bfqq as waiting a request only if a bic still points to it */ -+ if (ref > 1 && !bfq_bfqq_busy(bfqq) && -+ reason != BFQ_BFQQ_BUDGET_TIMEOUT && -+ reason != BFQ_BFQQ_BUDGET_EXHAUSTED) -+ bfq_mark_bfqq_non_blocking_wait_rq(bfqq); -+} -+ -+/* -+ * Budget timeout is not implemented through a dedicated timer, but -+ * just checked on request arrivals and completions, as well as on -+ * idle timer expirations. -+ */ -+static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) -+{ -+ return time_is_before_eq_jiffies(bfqq->budget_timeout); -+} -+ -+/* -+ * If we expire a queue that is actively waiting (i.e., with the -+ * device idled) for the arrival of a new request, then we may incur -+ * the timestamp misalignment problem described in the body of the -+ * function __bfq_activate_entity. Hence we return true only if this -+ * condition does not hold, or if the queue is slow enough to deserve -+ * only to be kicked off for preserving a high throughput. 
-+ */ -+static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) -+{ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "may_budget_timeout: wait_request %d left %d timeout %d", -+ bfq_bfqq_wait_request(bfqq), -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, -+ bfq_bfqq_budget_timeout(bfqq)); -+ -+ return (!bfq_bfqq_wait_request(bfqq) || -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3) -+ && -+ bfq_bfqq_budget_timeout(bfqq); -+} -+ -+/* -+ * For a queue that becomes empty, device idling is allowed only if -+ * this function returns true for that queue. As a consequence, since -+ * device idling plays a critical role for both throughput boosting -+ * and service guarantees, the return value of this function plays a -+ * critical role as well. -+ * -+ * In a nutshell, this function returns true only if idling is -+ * beneficial for throughput or, even if detrimental for throughput, -+ * idling is however necessary to preserve service guarantees (low -+ * latency, desired throughput distribution, ...). In particular, on -+ * NCQ-capable devices, this function tries to return false, so as to -+ * help keep the drives' internal queues full, whenever this helps the -+ * device boost the throughput without causing any service-guarantee -+ * issue. -+ * -+ * In more detail, the return value of this function is obtained by, -+ * first, computing a number of boolean variables that take into -+ * account throughput and service-guarantee issues, and, then, -+ * combining these variables in a logical expression. Most of the -+ * issues taken into account are not trivial. We discuss these issues -+ * while introducing the variables. 
-+ */ -+static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) -+{ -+ struct bfq_data *bfqd = bfqq->bfqd; -+ bool rot_without_queueing = -+ !blk_queue_nonrot(bfqd->queue) && !bfqd->hw_tag, -+ bfqq_sequential_and_IO_bound, -+ idling_boosts_thr, idling_boosts_thr_without_issues, -+ idling_needed_for_service_guarantees, -+ asymmetric_scenario; -+ -+ if (bfqd->strict_guarantees) -+ return true; -+ -+ /* -+ * Idling is performed only if slice_idle > 0. In addition, we -+ * do not idle if -+ * (a) bfqq is async -+ * (b) bfqq is in the idle io prio class: in this case we do -+ * not idle because we want to minimize the bandwidth that -+ * queues in this class can steal to higher-priority queues -+ */ -+ if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) || -+ bfq_class_idle(bfqq)) -+ return false; -+ -+ bfqq_sequential_and_IO_bound = !BFQQ_SEEKY(bfqq) && -+ bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_has_short_ttime(bfqq); -+ /* -+ * The next variable takes into account the cases where idling -+ * boosts the throughput. -+ * -+ * The value of the variable is computed considering, first, that -+ * idling is virtually always beneficial for the throughput if: -+ * (a) the device is not NCQ-capable and rotational, or -+ * (b) regardless of the presence of NCQ, the device is rotational and -+ * the request pattern for bfqq is I/O-bound and sequential, or -+ * (c) regardless of whether it is rotational, the device is -+ * not NCQ-capable and the request pattern for bfqq is -+ * I/O-bound and sequential. -+ * -+ * Secondly, and in contrast to the above item (b), idling an -+ * NCQ-capable flash-based device would not boost the -+ * throughput even with sequential I/O; rather it would lower -+ * the throughput in proportion to how fast the device -+ * is. Accordingly, the next variable is true if any of the -+ * above conditions (a), (b) or (c) is true, and, in -+ * particular, happens to be false if bfqd is an NCQ-capable -+ * flash-based device. 
-+ */ -+ idling_boosts_thr = rot_without_queueing || -+ ((!blk_queue_nonrot(bfqd->queue) || !bfqd->hw_tag) && -+ bfqq_sequential_and_IO_bound); -+ -+ /* -+ * The value of the next variable, -+ * idling_boosts_thr_without_issues, is equal to that of -+ * idling_boosts_thr, unless a special case holds. In this -+ * special case, described below, idling may cause problems to -+ * weight-raised queues. -+ * -+ * When the request pool is saturated (e.g., in the presence -+ * of write hogs), if the processes associated with -+ * non-weight-raised queues ask for requests at a lower rate, -+ * then processes associated with weight-raised queues have a -+ * higher probability to get a request from the pool -+ * immediately (or at least soon) when they need one. Thus -+ * they have a higher probability to actually get a fraction -+ * of the device throughput proportional to their high -+ * weight. This is especially true with NCQ-capable drives, -+ * which enqueue several requests in advance, and further -+ * reorder internally-queued requests. -+ * -+ * For this reason, we force to false the value of -+ * idling_boosts_thr_without_issues if there are weight-raised -+ * busy queues. In this case, and if bfqq is not weight-raised, -+ * this guarantees that the device is not idled for bfqq (if, -+ * instead, bfqq is weight-raised, then idling will be -+ * guaranteed by another variable, see below). Combined with -+ * the timestamping rules of BFQ (see [1] for details), this -+ * behavior causes bfqq, and hence any sync non-weight-raised -+ * queue, to get a lower number of requests served, and thus -+ * to ask for a lower number of requests from the request -+ * pool, before the busy weight-raised queues get served -+ * again. This often mitigates starvation problems in the -+ * presence of heavy write workloads and NCQ, thereby -+ * guaranteeing a higher application and system responsiveness -+ * in these hostile scenarios. 
-+ */ -+ idling_boosts_thr_without_issues = idling_boosts_thr && -+ bfqd->wr_busy_queues == 0; -+ -+ /* -+ * There is then a case where idling must be performed not -+ * for throughput concerns, but to preserve service -+ * guarantees. -+ * -+ * To introduce this case, we can note that allowing the drive -+ * to enqueue more than one request at a time, and hence -+ * delegating de facto final scheduling decisions to the -+ * drive's internal scheduler, entails loss of control on the -+ * actual request service order. In particular, the critical -+ * situation is when requests from different processes happen -+ * to be present, at the same time, in the internal queue(s) -+ * of the drive. In such a situation, the drive, by deciding -+ * the service order of the internally-queued requests, does -+ * determine also the actual throughput distribution among -+ * these processes. But the drive typically has no notion or -+ * concern about per-process throughput distribution, and -+ * makes its decisions only on a per-request basis. Therefore, -+ * the service distribution enforced by the drive's internal -+ * scheduler is likely to coincide with the desired -+ * device-throughput distribution only in a completely -+ * symmetric scenario where: -+ * (i) each of these processes must get the same throughput as -+ * the others; -+ * (ii) all these processes have the same I/O pattern -+ * (either sequential or random). -+ * In fact, in such a scenario, the drive will tend to treat -+ * the requests of each of these processes in about the same -+ * way as the requests of the others, and thus to provide -+ * each of these processes with about the same throughput -+ * (which is exactly the desired throughput distribution). In -+ * contrast, in any asymmetric scenario, device idling is -+ * certainly needed to guarantee that bfqq receives its -+ * assigned fraction of the device throughput (see [1] for -+ * details). 
-+ * -+ * We address this issue by controlling, actually, only the -+ * symmetry sub-condition (i), i.e., provided that -+ * sub-condition (i) holds, idling is not performed, -+ * regardless of whether sub-condition (ii) holds. In other -+ * words, only if sub-condition (i) holds, then idling is -+ * allowed, and the device tends to be prevented from queueing -+ * many requests, possibly of several processes. The reason -+ * for not controlling also sub-condition (ii) is that we -+ * exploit preemption to preserve guarantees in case of -+ * symmetric scenarios, even if (ii) does not hold, as -+ * explained in the next two paragraphs. -+ * -+ * Even if a queue, say Q, is expired when it remains idle, Q -+ * can still preempt the new in-service queue if the next -+ * request of Q arrives soon (see the comments on -+ * bfq_bfqq_update_budg_for_activation). If all queues and -+ * groups have the same weight, this form of preemption, -+ * combined with the hole-recovery heuristic described in the -+ * comments on function bfq_bfqq_update_budg_for_activation, -+ * are enough to preserve a correct bandwidth distribution in -+ * the mid term, even without idling. In fact, even if not -+ * idling allows the internal queues of the device to contain -+ * many requests, and thus to reorder requests, we can rather -+ * safely assume that the internal scheduler still preserves a -+ * minimum of mid-term fairness. The motivation for using -+ * preemption instead of idling is that, by not idling, -+ * service guarantees are preserved without minimally -+ * sacrificing throughput. In other words, both a high -+ * throughput and its desired distribution are obtained. -+ * -+ * More precisely, this preemption-based, idleless approach -+ * provides fairness in terms of IOPS, and not sectors per -+ * second. This can be seen with a simple example. 
Suppose -+ * that there are two queues with the same weight, but that -+ * the first queue receives requests of 8 sectors, while the -+ * second queue receives requests of 1024 sectors. In -+ * addition, suppose that each of the two queues contains at -+ * most one request at a time, which implies that each queue -+ * always remains idle after it is served. Finally, after -+ * remaining idle, each queue receives very quickly a new -+ * request. It follows that the two queues are served -+ * alternatively, preempting each other if needed. This -+ * implies that, although both queues have the same weight, -+ * the queue with large requests receives a service that is -+ * 1024/8 times as high as the service received by the other -+ * queue. -+ * -+ * On the other hand, device idling is performed, and thus -+ * pure sector-domain guarantees are provided, for the -+ * following queues, which are likely to need stronger -+ * throughput guarantees: weight-raised queues, and queues -+ * with a higher weight than other queues. When such queues -+ * are active, sub-condition (i) is false, which triggers -+ * device idling. -+ * -+ * According to the above considerations, the next variable is -+ * true (only) if sub-condition (i) holds. To compute the -+ * value of this variable, we not only use the return value of -+ * the function bfq_symmetric_scenario(), but also check -+ * whether bfqq is being weight-raised, because -+ * bfq_symmetric_scenario() does not take into account also -+ * weight-raised queues (see comments on -+ * bfq_weights_tree_add()). 
-+ * -+ * As a side note, it is worth considering that the above -+ * device-idling countermeasures may however fail in the -+ * following unlucky scenario: if idling is (correctly) -+ * disabled in a time period during which all symmetry -+ * sub-conditions hold, and hence the device is allowed to -+ * enqueue many requests, but at some later point in time some -+ * sub-condition stops to hold, then it may become impossible -+ * to let requests be served in the desired order until all -+ * the requests already queued in the device have been served. -+ */ -+ asymmetric_scenario = bfqq->wr_coeff > 1 || -+ !bfq_symmetric_scenario(bfqd); -+ -+ /* -+ * Finally, there is a case where maximizing throughput is the -+ * best choice even if it may cause unfairness toward -+ * bfqq. Such a case is when bfqq became active in a burst of -+ * queue activations. Queues that became active during a large -+ * burst benefit only from throughput, as discussed in the -+ * comments on bfq_handle_burst. Thus, if bfqq became active -+ * in a burst and not idling the device maximizes throughput, -+ * then the device must no be idled, because not idling the -+ * device provides bfqq and all other queues in the burst with -+ * maximum benefit. Combining this and the above case, we can -+ * now establish when idling is actually needed to preserve -+ * service guarantees. -+ */ -+ idling_needed_for_service_guarantees = -+ asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq); -+ -+ /* -+ * We have now all the components we need to compute the -+ * return value of the function, which is true only if idling -+ * either boosts the throughput (without issues), or is -+ * necessary to preserve service guarantees. 
-+ */ -+ bfq_log_bfqq(bfqd, bfqq, "may_idle: sync %d idling_boosts_thr %d", -+ bfq_bfqq_sync(bfqq), idling_boosts_thr); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "may_idle: wr_busy %d boosts %d IO-bound %d guar %d", -+ bfqd->wr_busy_queues, -+ idling_boosts_thr_without_issues, -+ bfq_bfqq_IO_bound(bfqq), -+ idling_needed_for_service_guarantees); -+ -+ return idling_boosts_thr_without_issues || -+ idling_needed_for_service_guarantees; -+} -+ -+/* -+ * If the in-service queue is empty but the function bfq_bfqq_may_idle -+ * returns true, then: -+ * 1) the queue must remain in service and cannot be expired, and -+ * 2) the device must be idled to wait for the possible arrival of a new -+ * request for the queue. -+ * See the comments on the function bfq_bfqq_may_idle for the reasons -+ * why performing device idling is the best choice to boost the throughput -+ * and preserve service guarantees when bfq_bfqq_may_idle itself -+ * returns true. -+ */ -+static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) -+{ -+ return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_may_idle(bfqq); -+} -+ -+/* -+ * Select a queue for service. If we have a current queue in service, -+ * check whether to continue servicing it, or retrieve and set a new one. -+ */ -+static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq; -+ struct request *next_rq; -+ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT; -+ -+ bfqq = bfqd->in_service_queue; -+ if (!bfqq) -+ goto new_queue; -+ -+ bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); -+ -+ if (bfq_may_expire_for_budg_timeout(bfqq) && -+ !hrtimer_active(&bfqd->idle_slice_timer) && -+ !bfq_bfqq_must_idle(bfqq)) -+ goto expire; -+ -+check_queue: -+ /* -+ * This loop is rarely executed more than once. Even when it -+ * happens, it is much more convenient to re-execute this loop -+ * than to return NULL and trigger a new dispatch to get a -+ * request served. 
-+ */ -+ next_rq = bfqq->next_rq; -+ /* -+ * If bfqq has requests queued and it has enough budget left to -+ * serve them, keep the queue, otherwise expire it. -+ */ -+ if (next_rq) { -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ -+ if (bfq_serv_to_charge(next_rq, bfqq) > -+ bfq_bfqq_budget_left(bfqq)) { -+ /* -+ * Expire the queue for budget exhaustion, -+ * which makes sure that the next budget is -+ * enough to serve the next request, even if -+ * it comes from the fifo expired path. -+ */ -+ reason = BFQ_BFQQ_BUDGET_EXHAUSTED; -+ goto expire; -+ } else { -+ /* -+ * The idle timer may be pending because we may -+ * not disable disk idling even when a new request -+ * arrives. -+ */ -+ if (bfq_bfqq_wait_request(bfqq)) { -+ BUG_ON(!hrtimer_active(&bfqd->idle_slice_timer)); -+ /* -+ * If we get here: 1) at least a new request -+ * has arrived but we have not disabled the -+ * timer because the request was too small, -+ * 2) then the block layer has unplugged -+ * the device, causing the dispatch to be -+ * invoked. -+ * -+ * Since the device is unplugged, now the -+ * requests are probably large enough to -+ * provide a reasonable throughput. -+ * So we disable idling. -+ */ -+ bfq_clear_bfqq_wait_request(bfqq); -+ hrtimer_try_to_cancel(&bfqd->idle_slice_timer); -+ bfqg_stats_update_idle_time(bfqq_group(bfqq)); -+ } -+ goto keep_queue; -+ } -+ } -+ -+ /* -+ * No requests pending. However, if the in-service queue is idling -+ * for a new request, or has requests waiting for a completion and -+ * may idle after their completion, then keep it anyway. 
-+ */ -+ if (hrtimer_active(&bfqd->idle_slice_timer) || -+ (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) { -+ bfqq = NULL; -+ goto keep_queue; -+ } -+ -+ reason = BFQ_BFQQ_NO_MORE_REQUESTS; -+expire: -+ bfq_bfqq_expire(bfqd, bfqq, false, reason); -+new_queue: -+ bfqq = bfq_set_in_service_queue(bfqd); -+ if (bfqq) { -+ bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue"); -+ goto check_queue; -+ } -+keep_queue: -+ if (bfqq) -+ bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue"); -+ else -+ bfq_log(bfqd, "select_queue: no queue returned"); -+ -+ return bfqq; -+} -+ -+static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */ -+ BUG_ON(bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && -+ time_is_after_jiffies(bfqq->last_wr_start_finish)); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "raising period dur %u/%u msec, old coeff %u, w %d(%d)", -+ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), -+ jiffies_to_msecs(bfqq->wr_cur_max_time), -+ bfqq->wr_coeff, -+ bfqq->entity.weight, bfqq->entity.orig_weight); -+ -+ BUG_ON(bfqq != bfqd->in_service_queue && entity->weight != -+ entity->orig_weight * bfqq->wr_coeff); -+ if (entity->prio_changed) -+ bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change"); -+ -+ /* -+ * If the queue was activated in a burst, or too much -+ * time has elapsed from the beginning of this -+ * weight-raising period, then end weight raising. 
-+ */ -+ if (bfq_bfqq_in_large_burst(bfqq)) -+ bfq_bfqq_end_wr(bfqq); -+ else if (time_is_before_jiffies(bfqq->last_wr_start_finish + -+ bfqq->wr_cur_max_time)) { -+ if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time || -+ time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt + -+ bfq_wr_duration(bfqd))) -+ bfq_bfqq_end_wr(bfqq); -+ else { -+ /* switch back to interactive wr */ -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ bfqq->last_wr_start_finish = -+ bfqq->wr_start_at_switch_to_srt; -+ BUG_ON(time_is_after_jiffies( -+ bfqq->last_wr_start_finish)); -+ bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqd, bfqq, -+ "back to interactive wr"); -+ } -+ } -+ } -+ /* -+ * To improve latency (for this or other queues), immediately -+ * update weight both if it must be raised and if it must be -+ * lowered. Since, entity may be on some active tree here, and -+ * might have a pending change of its ioprio class, invoke -+ * next function with the last parameter unset (see the -+ * comments on the function). -+ */ -+ if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1)) -+ __bfq_entity_update_weight_prio(bfq_entity_service_tree(entity), -+ entity, false); -+} -+ -+/* -+ * Dispatch one request from bfqq, moving it to the request queue -+ * dispatch list. 
-+ */ -+static int bfq_dispatch_request(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ int dispatched = 0; -+ struct request *rq = bfqq->next_rq; -+ unsigned long service_to_charge; -+ -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ BUG_ON(!rq); -+ service_to_charge = bfq_serv_to_charge(rq, bfqq); -+ -+ BUG_ON(service_to_charge > bfq_bfqq_budget_left(bfqq)); -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ -+ bfq_bfqq_served(bfqq, service_to_charge); -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ -+ bfq_dispatch_insert(bfqd->queue, rq); -+ -+ /* -+ * If weight raising has to terminate for bfqq, then next -+ * function causes an immediate update of bfqq's weight, -+ * without waiting for next activation. As a consequence, on -+ * expiration, bfqq will be timestamped as if has never been -+ * weight-raised during this service slot, even if it has -+ * received part or even most of the service as a -+ * weight-raised queue. This inflates bfqq's timestamps, which -+ * is beneficial, as bfqq is then more willing to leave the -+ * device immediately to possible other weight-raised queues. 
-+ */ -+ bfq_update_wr_data(bfqd, bfqq); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "dispatched %u sec req (%llu), budg left %d", -+ blk_rq_sectors(rq), -+ (unsigned long long) blk_rq_pos(rq), -+ bfq_bfqq_budget_left(bfqq)); -+ -+ dispatched++; -+ -+ if (!bfqd->in_service_bic) { -+ atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); -+ bfqd->in_service_bic = RQ_BIC(rq); -+ } -+ -+ if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq)) -+ goto expire; -+ -+ return dispatched; -+ -+expire: -+ bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_BUDGET_EXHAUSTED); -+ return dispatched; -+} -+ -+static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq) -+{ -+ int dispatched = 0; -+ -+ while (bfqq->next_rq) { -+ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq); -+ dispatched++; -+ } -+ -+ BUG_ON(!list_empty(&bfqq->fifo)); -+ return dispatched; -+} -+ -+/* -+ * Drain our current requests. -+ * Used for barriers and when switching io schedulers on-the-fly. -+ */ -+static int bfq_forced_dispatch(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq, *n; -+ struct bfq_service_tree *st; -+ int dispatched = 0; -+ -+ bfqq = bfqd->in_service_queue; -+ if (bfqq) -+ __bfq_bfqq_expire(bfqd, bfqq); -+ -+ /* -+ * Loop through classes, and be careful to leave the scheduler -+ * in a consistent state, as feedback mechanisms and vtime -+ * updates cannot be disabled during the process. 
-+ */ -+ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) { -+ st = bfq_entity_service_tree(&bfqq->entity); -+ -+ dispatched += __bfq_forced_dispatch_bfqq(bfqq); -+ -+ bfqq->max_budget = bfq_max_budget(bfqd); -+ bfq_forget_idle(st); -+ } -+ -+ BUG_ON(bfqd->busy_queues != 0); -+ -+ return dispatched; -+} -+ -+static int bfq_dispatch_requests(struct request_queue *q, int force) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct bfq_queue *bfqq; -+ -+ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); -+ -+ if (bfqd->busy_queues == 0) -+ return 0; -+ -+ if (unlikely(force)) -+ return bfq_forced_dispatch(bfqd); -+ -+ /* -+ * Force device to serve one request at a time if -+ * strict_guarantees is true. Forcing this service scheme is -+ * currently the ONLY way to guarantee that the request -+ * service order enforced by the scheduler is respected by a -+ * queueing device. Otherwise the device is free even to make -+ * some unlucky request wait for as long as the device -+ * wishes. -+ * -+ * Of course, serving one request at at time may cause loss of -+ * throughput. -+ */ -+ if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0) -+ return 0; -+ -+ bfqq = bfq_select_queue(bfqd); -+ if (!bfqq) -+ return 0; -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ -+ BUG_ON(bfq_bfqq_wait_request(bfqq)); -+ -+ if (!bfq_dispatch_request(bfqd, bfqq)) -+ return 0; -+ -+ bfq_log_bfqq(bfqd, bfqq, "dispatched %s request", -+ bfq_bfqq_sync(bfqq) ? "sync" : "async"); -+ -+ BUG_ON(bfqq->next_rq == NULL && -+ bfqq->entity.budget < bfqq->entity.service); -+ return 1; -+} -+ -+/* -+ * Task holds one reference to the queue, dropped when task exits. Each rq -+ * in-flight on this queue also holds a reference, dropped when rq is freed. -+ * -+ * Queue lock must be held here. Recall not to use bfqq after calling -+ * this function on it. 
-+ */ -+static void bfq_put_queue(struct bfq_queue *bfqq) -+{ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ struct bfq_group *bfqg = bfqq_group(bfqq); -+#endif -+ -+ BUG_ON(bfqq->ref <= 0); -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref); -+ bfqq->ref--; -+ if (bfqq->ref) -+ return; -+ -+ BUG_ON(rb_first(&bfqq->sort_list)); -+ BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); -+ BUG_ON(bfqq->entity.tree); -+ BUG_ON(bfq_bfqq_busy(bfqq)); -+ -+ if (bfq_bfqq_sync(bfqq)) -+ /* -+ * The fact that this queue is being destroyed does not -+ * invalidate the fact that this queue may have been -+ * activated during the current burst. As a consequence, -+ * although the queue does not exist anymore, and hence -+ * needs to be removed from the burst list if there, -+ * the burst size has not to be decremented. -+ */ -+ hlist_del_init(&bfqq->burst_list_node); -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); -+ -+ kmem_cache_free(bfq_pool, bfqq); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ bfqg_put(bfqg); -+#endif -+} -+ -+static void bfq_put_cooperator(struct bfq_queue *bfqq) -+{ -+ struct bfq_queue *__bfqq, *next; -+ -+ /* -+ * If this queue was scheduled to merge with another queue, be -+ * sure to drop the reference taken on that queue (and others in -+ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs. 
-+ */ -+ __bfqq = bfqq->new_bfqq; -+ while (__bfqq) { -+ if (__bfqq == bfqq) -+ break; -+ next = __bfqq->new_bfqq; -+ bfq_put_queue(__bfqq); -+ __bfqq = next; -+ } -+} -+ -+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ if (bfqq == bfqd->in_service_queue) { -+ __bfq_bfqq_expire(bfqd, bfqq); -+ bfq_schedule_dispatch(bfqd); -+ } -+ -+ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref); -+ -+ bfq_put_cooperator(bfqq); -+ -+ bfq_put_queue(bfqq); /* release process reference */ -+} -+ -+static void bfq_init_icq(struct io_cq *icq) -+{ -+ icq_to_bic(icq)->ttime.last_end_request = ktime_get_ns() - (1ULL<<32); -+} -+ -+static void bfq_exit_icq(struct io_cq *icq) -+{ -+ struct bfq_io_cq *bic = icq_to_bic(icq); -+ struct bfq_data *bfqd = bic_to_bfqd(bic); -+ -+ if (bic_to_bfqq(bic, false)) { -+ bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, false)); -+ bic_set_bfqq(bic, NULL, false); -+ } -+ -+ if (bic_to_bfqq(bic, true)) { -+ /* -+ * If the bic is using a shared queue, put the reference -+ * taken on the io_context when the bic started using a -+ * shared bfq_queue. -+ */ -+ if (bfq_bfqq_coop(bic_to_bfqq(bic, true))) -+ put_io_context(icq->ioc); -+ bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, true)); -+ bic_set_bfqq(bic, NULL, true); -+ } -+} -+ -+/* -+ * Update the entity prio values; note that the new values will not -+ * be used until the next (re)activation. -+ */ -+static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, -+ struct bfq_io_cq *bic) -+{ -+ struct task_struct *tsk = current; -+ int ioprio_class; -+ -+ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); -+ switch (ioprio_class) { -+ default: -+ dev_err(bfqq->bfqd->queue->backing_dev_info->dev, -+ "bfq: bad prio class %d\n", ioprio_class); -+ case IOPRIO_CLASS_NONE: -+ /* -+ * No prio set, inherit CPU scheduling settings. 
-+ */ -+ bfqq->new_ioprio = task_nice_ioprio(tsk); -+ bfqq->new_ioprio_class = task_nice_ioclass(tsk); -+ break; -+ case IOPRIO_CLASS_RT: -+ bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); -+ bfqq->new_ioprio_class = IOPRIO_CLASS_RT; -+ break; -+ case IOPRIO_CLASS_BE: -+ bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); -+ bfqq->new_ioprio_class = IOPRIO_CLASS_BE; -+ break; -+ case IOPRIO_CLASS_IDLE: -+ bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE; -+ bfqq->new_ioprio = 7; -+ break; -+ } -+ -+ if (bfqq->new_ioprio >= IOPRIO_BE_NR) { -+ pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n", -+ bfqq->new_ioprio); -+ BUG(); -+ } -+ -+ bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); -+ bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "set_next_ioprio_data: bic_class %d prio %d class %d", -+ ioprio_class, bfqq->new_ioprio, bfqq->new_ioprio_class); -+} -+ -+static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) -+{ -+ struct bfq_data *bfqd = bic_to_bfqd(bic); -+ struct bfq_queue *bfqq; -+ unsigned long uninitialized_var(flags); -+ int ioprio = bic->icq.ioc->ioprio; -+ -+ /* -+ * This condition may trigger on a newly created bic, be sure to -+ * drop the lock before returning. 
-+ */ -+ if (unlikely(!bfqd) || likely(bic->ioprio == ioprio)) -+ return; -+ -+ bic->ioprio = ioprio; -+ -+ bfqq = bic_to_bfqq(bic, false); -+ if (bfqq) { -+ /* release process reference on this queue */ -+ bfq_put_queue(bfqq); -+ bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); -+ bic_set_bfqq(bic, bfqq, false); -+ bfq_log_bfqq(bfqd, bfqq, -+ "check_ioprio_change: bfqq %p %d", -+ bfqq, bfqq->ref); -+ } -+ -+ bfqq = bic_to_bfqq(bic, true); -+ if (bfqq) -+ bfq_set_next_ioprio_data(bfqq, bic); -+} -+ -+static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct bfq_io_cq *bic, pid_t pid, int is_sync) -+{ -+ RB_CLEAR_NODE(&bfqq->entity.rb_node); -+ INIT_LIST_HEAD(&bfqq->fifo); -+ INIT_HLIST_NODE(&bfqq->burst_list_node); -+ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); -+ -+ bfqq->ref = 0; -+ bfqq->bfqd = bfqd; -+ -+ if (bic) -+ bfq_set_next_ioprio_data(bfqq, bic); -+ -+ if (is_sync) { -+ /* -+ * No need to mark as has_short_ttime if in -+ * idle_class, because no device idling is performed -+ * for queues in idle class -+ */ -+ if (!bfq_class_idle(bfqq)) -+ /* tentatively mark as has_short_ttime */ -+ bfq_mark_bfqq_has_short_ttime(bfqq); -+ bfq_mark_bfqq_sync(bfqq); -+ bfq_mark_bfqq_just_created(bfqq); -+ } else -+ bfq_clear_bfqq_sync(bfqq); -+ bfq_mark_bfqq_IO_bound(bfqq); -+ -+ /* Tentative initial value to trade off between thr and lat */ -+ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3; -+ bfqq->pid = pid; -+ -+ bfqq->wr_coeff = 1; -+ bfqq->last_wr_start_finish = jiffies; -+ bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now(); -+ bfqq->budget_timeout = bfq_smallest_from_now(); -+ bfqq->split_time = bfq_smallest_from_now(); -+ -+ /* -+ * Set to the value for which bfqq will not be deemed as -+ * soft rt when it becomes backlogged. 
-+	 */
-+	bfqq->soft_rt_next_start = bfq_greatest_from_now();
-+
-+	/* first request is almost certainly seeky */
-+	bfqq->seek_history = 1;
-+}
-+
-+/*
-+ * Return the address of the slot, inside @bfqg, that holds the shared
-+ * async queue for the given priority class and level: row 0 of
-+ * async_bfqq for RT, row 1 for BE (IOPRIO_CLASS_NONE is treated as BE
-+ * at level IOPRIO_NORM), and async_idle_bfqq for the idle class.
-+ */
-+static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
-+					       struct bfq_group *bfqg,
-+					       int ioprio_class, int ioprio)
-+{
-+	switch (ioprio_class) {
-+	case IOPRIO_CLASS_RT:
-+		return &bfqg->async_bfqq[0][ioprio];
-+	case IOPRIO_CLASS_NONE:
-+		ioprio = IOPRIO_NORM;
-+		/* fall through */
-+	case IOPRIO_CLASS_BE:
-+		return &bfqg->async_bfqq[1][ioprio];
-+	case IOPRIO_CLASS_IDLE:
-+		return &bfqg->async_idle_bfqq;
-+	default:
-+		BUG();
-+	}
-+}
-+
-+/*
-+ * Look up or allocate the queue to be used by @bic, and take a process
-+ * reference on it.
-+ *
-+ * For async I/O the queue already stored in the group slot for the
-+ * bic's priority is reused if present. Otherwise a new queue is
-+ * allocated with GFP_NOWAIT; if the group lookup or the allocation
-+ * fails, the embedded bfqd->oom_bfqq is returned instead. The returned
-+ * pointer is therefore never NULL.
-+ */
-+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
-+				       struct bio *bio, bool is_sync,
-+				       struct bfq_io_cq *bic)
-+{
-+	const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
-+	const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
-+	struct bfq_queue **async_bfqq = NULL;
-+	struct bfq_queue *bfqq;
-+	struct bfq_group *bfqg;
-+
-+	rcu_read_lock();
-+
-+	bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio));
-+	if (!bfqg) {
-+		bfqq = &bfqd->oom_bfqq;
-+		goto out;
-+	}
-+
-+	if (!is_sync) {
-+		async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
-+						  ioprio);
-+		bfqq = *async_bfqq;
-+		if (bfqq)
-+			goto out;
-+	}
-+
-+	bfqq = kmem_cache_alloc_node(bfq_pool,
-+				     GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN,
-+				     bfqd->queue->node);
-+
-+	if (bfqq) {
-+		bfq_init_bfqq(bfqd, bfqq, bic, current->pid,
-+			      is_sync);
-+		bfq_init_entity(&bfqq->entity, bfqg);
-+		bfq_log_bfqq(bfqd, bfqq, "allocated");
-+	} else {
-+		bfqq = &bfqd->oom_bfqq;
-+		bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
-+		goto out;
-+	}
-+
-+	/*
-+	 * Pin the queue now that it's allocated, scheduler exit will
-+	 * prune it.
-+	 */
-+	if (async_bfqq) {
-+		bfqq->ref++; /*
-+			      * Extra group reference, w.r.t. sync
-+			      * queue. This extra reference is removed
-+			      * only if bfqq->bfqg disappears, to
-+			      * guarantee that this queue is not freed
-+			      * until its group goes away.
-+			      */
-+		bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
-+			     bfqq, bfqq->ref);
-+		*async_bfqq = bfqq;
-+	}
-+
-+out:
-+	bfqq->ref++; /* get a process reference to this queue */
-+	bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref);
-+	rcu_read_unlock();
-+	return bfqq;
-+}
-+
-+/*
-+ * Update the think-time statistics of @bic with the time elapsed since
-+ * its last request completion, capped at twice bfq_slice_idle. Samples
-+ * and total are decayed so that history keeps a 7/8 weight.
-+ */
-+static void bfq_update_io_thinktime(struct bfq_data *bfqd,
-+				    struct bfq_io_cq *bic)
-+{
-+	struct bfq_ttime *ttime = &bic->ttime;
-+	u64 elapsed = ktime_get_ns() - bic->ttime.last_end_request;
-+
-+	elapsed = min_t(u64, elapsed, 2 * bfqd->bfq_slice_idle);
-+
-+	ttime->ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8;
-+	ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed,  8);
-+	ttime->ttime_mean = div64_ul(ttime->ttime_total + 128,
-+				     ttime->ttime_samples);
-+}
-+
-+/*
-+ * Shift a new bit into bfqq->seek_history: the bit is set when @rq is
-+ * farther than BFQQ_SEEK_THR from the last request position and,
-+ * on non-rotational queues, also smaller than BFQQ_SECT_THR_NONROT.
-+ */
-+static void
-+bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq,
-+		       struct request *rq)
-+{
-+	bfqq->seek_history <<= 1;
-+	bfqq->seek_history |=
-+		get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR &&
-+		(!blk_queue_nonrot(bfqd->queue) ||
-+		 blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT);
-+}
-+
-+static void bfq_update_has_short_ttime(struct bfq_data *bfqd,
-+				       struct bfq_queue *bfqq,
-+				       struct bfq_io_cq *bic)
-+{
-+	bool has_short_ttime = true;
-+
-+	/*
-+	 * No need to update has_short_ttime if bfqq is async or in
-+	 * idle io prio class, or if bfq_slice_idle is zero, because
-+	 * no device idling is performed for bfqq in this case.
-+	 */
-+	if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq) ||
-+	    bfqd->bfq_slice_idle == 0)
-+		return;
-+
-+	/* Idle window just restored, statistics are meaningless. */
-+	if (time_is_after_eq_jiffies(bfqq->split_time +
-+				     bfqd->bfq_wr_min_idle_time))
-+		return;
-+
-+	/* Think time is infinite if no process is linked to
-+	 * bfqq.
Otherwise check average think time to
-+	 * decide whether to mark as has_short_ttime
-+	 */
-+	if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
-+	    (bfq_sample_valid(bic->ttime.ttime_samples) &&
-+	     bic->ttime.ttime_mean > bfqd->bfq_slice_idle))
-+		has_short_ttime = false;
-+
-+	bfq_log_bfqq(bfqd, bfqq, "update_has_short_ttime: has_short_ttime %d",
-+		has_short_ttime);
-+
-+	if (has_short_ttime)
-+		bfq_mark_bfqq_has_short_ttime(bfqq);
-+	else
-+		bfq_clear_bfqq_has_short_ttime(bfqq);
-+}
-+
-+/*
-+ * Called when a new fs request (rq) is added to bfqq.  Check if there's
-+ * something we should do about it.
-+ *
-+ * Updates the think-time, has_short_ttime and seek statistics, records
-+ * the end position of @rq and, if bfqq is the in-service queue and the
-+ * device is being idled for it, decides whether to stop idling.
-+ */
-+static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
-+			    struct request *rq)
-+{
-+	struct bfq_io_cq *bic = RQ_BIC(rq);
-+
-+	if (rq->cmd_flags & REQ_META)
-+		bfqq->meta_pending++;
-+
-+	bfq_update_io_thinktime(bfqd, bic);
-+	bfq_update_has_short_ttime(bfqd, bfqq, bic);
-+	bfq_update_io_seektime(bfqd, bfqq, rq);
-+
-+	bfq_log_bfqq(bfqd, bfqq,
-+		     "rq_enqueued: has_short_ttime=%d (seeky %d)",
-+		     bfq_bfqq_has_short_ttime(bfqq), BFQQ_SEEKY(bfqq));
-+
-+	bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
-+
-+	if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
-+		/* "small" means: the only queued request, under 32 sectors */
-+		bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
-+				 blk_rq_sectors(rq) < 32;
-+		bool budget_timeout = bfq_bfqq_budget_timeout(bfqq);
-+
-+		/*
-+		 * There is just this request queued: if the request
-+		 * is small and the queue is not to be expired, then
-+		 * just exit.
-+		 *
-+		 * In this way, if the device is being idled to wait
-+		 * for a new request from the in-service queue, we
-+		 * avoid unplugging the device and committing the
-+		 * device to serve just a small request. On the
-+		 * contrary, we wait for the block layer to decide
-+		 * when to unplug the device: hopefully, new requests
-+		 * will be merged to this one quickly, then the device
-+		 * will be unplugged and larger requests will be
-+		 * dispatched.
-+		 */
-+		if (small_req && !budget_timeout)
-+			return;
-+
-+		/*
-+		 * A large enough request arrived, or the queue is to
-+		 * be expired: in both cases disk idling is to be
-+		 * stopped, so clear wait_request flag and reset
-+		 * timer.
-+		 */
-+		bfq_clear_bfqq_wait_request(bfqq);
-+		hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
-+		bfqg_stats_update_idle_time(bfqq_group(bfqq));
-+
-+		/*
-+		 * The queue is not empty, because a new request just
-+		 * arrived. Hence we can safely expire the queue, in
-+		 * case of budget timeout, without risking that the
-+		 * timestamps of the queue are not updated correctly.
-+		 * See [1] for more details.
-+		 */
-+		if (budget_timeout)
-+			bfq_bfqq_expire(bfqd, bfqq, false,
-+					BFQ_BFQQ_BUDGET_TIMEOUT);
-+
-+		/*
-+		 * Let the request rip immediately, or let a new queue be
-+		 * selected if bfqq has just been expired.
-+		 */
-+		__blk_run_queue(bfqd->queue);
-+	}
-+}
-+
-+/*
-+ * Add @rq to the scheduler. The queue lock must be held (asserted
-+ * below). Outside interrupt context the request's queue may first be
-+ * merged with a cooperating queue, in which case the request's
-+ * references are moved to the shared queue before the request is
-+ * added, time-stamped and appended to the queue's fifo.
-+ */
-+static void bfq_insert_request(struct request_queue *q, struct request *rq)
-+{
-+	struct bfq_data *bfqd = q->elevator->elevator_data;
-+	struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq;
-+
-+	assert_spin_locked(bfqd->queue->queue_lock);
-+
-+	/*
-+	 * An unplug may trigger a requeue of a request from the device
-+	 * driver: make sure we are in process context while trying to
-+	 * merge two bfq_queues.
-+	 */
-+	if (!in_interrupt()) {
-+		new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);
-+		if (new_bfqq) {
-+			if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
-+				new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
-+			/*
-+			 * Release the request's reference to the old bfqq
-+			 * and make sure one is taken to the shared queue.
-+			 */
-+			new_bfqq->allocated[rq_data_dir(rq)]++;
-+			bfqq->allocated[rq_data_dir(rq)]--;
-+			new_bfqq->ref++;
-+			bfq_clear_bfqq_just_created(bfqq);
-+			if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
-+				bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
-+						bfqq, new_bfqq);
-+			/*
-+			 * rq is about to be enqueued into new_bfqq,
-+			 * release rq reference on bfqq
-+			 */
-+			bfq_put_queue(bfqq);
-+			rq->elv.priv[1] = new_bfqq;
-+			bfqq = new_bfqq;
-+		}
-+	}
-+
-+	bfq_add_request(rq);
-+
-+	/* fifo deadline: now plus the per-direction expire time */
-+	rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
-+	list_add_tail(&rq->queuelist, &bfqq->fifo);
-+
-+	bfq_rq_enqueued(bfqd, bfqq, rq);
-+}
-+
-+static void bfq_update_hw_tag(struct bfq_data *bfqd)
-+{
-+	bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver,
-+				       bfqd->rq_in_driver);
-+
-+	if (bfqd->hw_tag == 1)
-+		return;
-+
-+	/*
-+	 * This sample is valid if the number of outstanding requests
-+	 * is large enough to allow a queueing behavior.  Note that the
-+	 * sum is not exact, as it's not taking into account deactivated
-+	 * requests.
-+	 */
-+	if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
-+		return;
-+
-+	if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
-+		return;
-+
-+	bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
-+	bfqd->max_rq_in_driver = 0;
-+	bfqd->hw_tag_samples = 0;
-+}
-+
-+/*
-+ * Completion hook: account for the completion of @rq on its queue and
-+ * on the device, refresh the statistics that depend on completion
-+ * times and, if the completed request belonged to the in-service
-+ * queue, decide whether to idle or to expire that queue.
-+ *
-+ * The queue lock must be held (asserted below).
-+ */
-+static void bfq_completed_request(struct request_queue *q, struct request *rq)
-+{
-+	struct bfq_queue *bfqq = RQ_BFQQ(rq);
-+	struct bfq_data *bfqd = bfqq->bfqd;
-+	u64 now_ns;
-+	u32 delta_us;
-+
-+	bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left",
-+		     blk_rq_sectors(rq));
-+
-+	assert_spin_locked(bfqd->queue->queue_lock);
-+	bfq_update_hw_tag(bfqd);
-+
-+	BUG_ON(!bfqd->rq_in_driver);
-+	BUG_ON(!bfqq->dispatched);
-+	bfqd->rq_in_driver--;
-+	bfqq->dispatched--;
-+	bfqg_stats_update_completion(bfqq_group(bfqq),
-+				     rq_start_time_ns(rq),
-+				     rq_io_start_time_ns(rq),
-+				     rq->cmd_flags);
-+
-+	if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) {
-+		BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
-+		/*
-+		 * Set budget_timeout (which we overload to store the
-+		 * time at which the queue remains with no backlog and
-+		 * no outstanding request; used by the weight-raising
-+		 * mechanism).
-+		 */
-+		bfqq->budget_timeout = jiffies;
-+
-+		bfq_weights_tree_remove(bfqd, &bfqq->entity,
-+					&bfqd->queue_weights_tree);
-+	}
-+
-+	now_ns = ktime_get_ns();
-+
-+	RQ_BIC(rq)->ttime.last_end_request = now_ns;
-+
-+	/*
-+	 * Using us instead of ns, to get a reasonable precision in
-+	 * computing rate in next check.
-+	 */
-+	delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC);
-+
-+	/*
-+	 * delta_us is 0 when two completions fall within the same
-+	 * microsecond: report a rate of 0 in that case rather than
-+	 * dividing by zero while building the log arguments.
-+	 */
-+	bfq_log(bfqd, "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu",
-+		delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size,
-+		delta_us ?
-+		(USEC_PER_SEC*
-+		(u64)((bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us))
-+			>>BFQ_RATE_SHIFT : (u64)0,
-+		(USEC_PER_SEC*(u64)(1UL<<(BFQ_RATE_SHIFT-10)))>>BFQ_RATE_SHIFT);
-+
-+	/*
-+	 * If the request took rather long to complete, and, according
-+	 * to the maximum request size recorded, this completion latency
-+	 * implies that the request was certainly served at a very low
-+	 * rate (less than 1M sectors/sec), then the whole observation
-+	 * interval that lasts up to this time instant cannot be a
-+	 * valid time interval for computing a new peak rate.  Invoke
-+	 * bfq_update_rate_reset to have the following three steps
-+	 * taken:
-+	 * - close the observation interval at the last (previous)
-+	 *   request dispatch or completion
-+	 * - compute rate, if possible, for that observation interval
-+	 * - reset to zero samples, which will trigger a proper
-+	 *   re-initialization of the observation interval on next
-+	 *   dispatch
-+	 *
-+	 * The first test also guarantees delta_us != 0 in the division.
-+	 */
-+	if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC &&
-+	   (bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us <
-+			1UL<<(BFQ_RATE_SHIFT - 10))
-+		bfq_update_rate_reset(bfqd, NULL);
-+	bfqd->last_completion = now_ns;
-+
-+	/*
-+	 * If we are waiting to discover whether the request pattern
-+	 * of the task associated with the queue is actually
-+	 * isochronous, and both requisites for this condition to hold
-+	 * are now satisfied, then compute soft_rt_next_start (see the
-+	 * comments on the function bfq_bfqq_softrt_next_start()). We
-+	 * schedule this delayed check when bfqq expires, if it still
-+	 * has in-flight requests.
-+	 */
-+	if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&
-+	    RB_EMPTY_ROOT(&bfqq->sort_list))
-+		bfqq->soft_rt_next_start =
-+			bfq_bfqq_softrt_next_start(bfqd, bfqq);
-+
-+	/*
-+	 * If this is the in-service queue, check if it needs to be expired,
-+	 * or if we want to idle in case it has no pending requests.
-+	 */
-+	if (bfqd->in_service_queue == bfqq) {
-+		if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) {
-+			bfq_arm_slice_timer(bfqd);
-+			goto out;
-+		} else if (bfq_may_expire_for_budg_timeout(bfqq))
-+			bfq_bfqq_expire(bfqd, bfqq, false,
-+					BFQ_BFQQ_BUDGET_TIMEOUT);
-+		else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
-+			 (bfqq->dispatched == 0 ||
-+			  !bfq_bfqq_may_idle(bfqq)))
-+			bfq_bfqq_expire(bfqd, bfqq, false,
-+					BFQ_BFQQ_NO_MORE_REQUESTS);
-+	}
-+
-+	if (!bfqd->rq_in_driver)
-+		bfq_schedule_dispatch(bfqd);
-+
-+out:
-+	return;
-+}
-+
-+/*
-+ * Return ELV_MQUEUE_MUST, clearing the must_alloc flag, if @bfqq has
-+ * both the wait_request and must_alloc flags set; ELV_MQUEUE_MAY
-+ * otherwise.
-+ */
-+static int __bfq_may_queue(struct bfq_queue *bfqq)
-+{
-+	if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {
-+		bfq_clear_bfqq_must_alloc(bfqq);
-+		return ELV_MQUEUE_MUST;
-+	}
-+
-+	return ELV_MQUEUE_MAY;
-+}
-+
-+/*
-+ * may_queue hook: answer for the current task's existing queue, if
-+ * any, for the direction implied by @op; ELV_MQUEUE_MAY otherwise.
-+ */
-+static int bfq_may_queue(struct request_queue *q, unsigned int op)
-+{
-+	struct bfq_data *bfqd = q->elevator->elevator_data;
-+	struct task_struct *tsk = current;
-+	struct bfq_io_cq *bic;
-+	struct bfq_queue *bfqq;
-+
-+	/*
-+	 * Don't force setup of a queue from here, as a call to may_queue
-+	 * does not necessarily imply that a request actually will be
-+	 * queued. So just lookup a possibly existing queue, or return
-+	 * 'may queue' if that fails.
-+	 */
-+	bic = bfq_bic_lookup(bfqd, tsk->io_context);
-+	if (!bic)
-+		return ELV_MQUEUE_MAY;
-+
-+	bfqq = bic_to_bfqq(bic, op_is_sync(op));
-+	if (bfqq)
-+		return __bfq_may_queue(bfqq);
-+
-+	return ELV_MQUEUE_MAY;
-+}
-+
-+/*
-+ * Queue lock held here.
-+ */
-+static void bfq_put_request(struct request *rq)
-+{
-+	struct bfq_queue *bfqq = RQ_BFQQ(rq);
-+	int dir;
-+
-+	/* Request never got a queue attached: nothing to release. */
-+	if (!bfqq)
-+		return;
-+
-+	dir = rq_data_dir(rq);
-+
-+	/* Drop the per-direction allocation count taken for rq. */
-+	BUG_ON(!bfqq->allocated[dir]);
-+	bfqq->allocated[dir]--;
-+
-+	/* Detach the scheduler-private pointers from the request. */
-+	rq->elv.priv[0] = NULL;
-+	rq->elv.priv[1] = NULL;
-+
-+	bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",
-+		     bfqq, bfqq->ref);
-+	bfq_put_queue(bfqq);
-+}
-+
-+/*
-+ * Detach bic from the shared queue bfqq. If bfqq is left with a single
-+ * process reference it is handed back, re-labelled with the current
-+ * pid and with its coop flags cleared, so that the caller can keep
-+ * using it. Otherwise the bic's sync slot is emptied, the references
-+ * held on bfqq are dropped, and NULL is returned to tell the caller
-+ * that a new bfqq must be allocated.
-+ */
-+static struct bfq_queue *
-+bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
-+{
-+	bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
-+
-+	put_io_context(bic->icq.ioc);
-+
-+	if (bfqq_process_refs(bfqq) != 1) {
-+		/* Still shared with other processes: leave it to them. */
-+		bic_set_bfqq(bic, NULL, 1);
-+
-+		bfq_put_cooperator(bfqq);
-+
-+		bfq_put_queue(bfqq);
-+		return NULL;
-+	}
-+
-+	/* Last process on this queue: take it over as a private queue. */
-+	bfqq->pid = current->pid;
-+	bfq_clear_bfqq_coop(bfqq);
-+	bfq_clear_bfqq_split_coop(bfqq);
-+	return bfqq;
-+}
-+
-+/*
-+ * Allocate bfq data structures associated with this request.
-+ *
-+ * Takes the queue lock for the whole operation. Returns 0 after
-+ * storing the bic and the queue in rq->elv.priv[0] and [1], or 1 if
-+ * the request has no io_context/queue association (rq->elv.icq is
-+ * NULL).
-+ */
-+static int bfq_set_request(struct request_queue *q, struct request *rq,
-+			   struct bio *bio, gfp_t gfp_mask)
-+{
-+	struct bfq_data *bfqd = q->elevator->elevator_data;
-+	struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
-+	const int rw = rq_data_dir(rq);
-+	const int is_sync = rq_is_sync(rq);
-+	struct bfq_queue *bfqq;
-+	unsigned long flags;
-+	bool bfqq_already_existing = false, split = false;
-+
-+	spin_lock_irqsave(q->queue_lock, flags);
-+
-+	if (!bic)
-+		goto queue_fail;
-+
-+	bfq_check_ioprio_change(bic, bio);
-+
-+	bfq_bic_update_cgroup(bic, bio);
-+
-+new_queue:
-+	bfqq = bic_to_bfqq(bic, is_sync);
-+	if (!bfqq || bfqq == &bfqd->oom_bfqq) {
-+		/* no usable queue yet (or only the oom fallback): get one */
-+		if (bfqq)
-+			bfq_put_queue(bfqq);
-+		bfqq = bfq_get_queue(bfqd, bio, is_sync, bic);
-+		BUG_ON(!hlist_unhashed(&bfqq->burst_list_node));
-+
-+		bic_set_bfqq(bic, bfqq, is_sync);
-+		if (split && is_sync) {
-+			bfq_log_bfqq(bfqd, bfqq,
-+				     "set_request: was_in_list %d "
-+				     "was_in_large_burst %d "
-+				     "large burst in progress %d",
-+				     bic->was_in_burst_list,
-+				     bic->saved_in_large_burst,
-+				     bfqd->large_burst);
-+
-+			if ((bic->was_in_burst_list && bfqd->large_burst) ||
-+			    bic->saved_in_large_burst) {
-+				bfq_log_bfqq(bfqd, bfqq,
-+					     "set_request: marking in "
-+					     "large burst");
-+				bfq_mark_bfqq_in_large_burst(bfqq);
-+			} else {
-+				bfq_log_bfqq(bfqd, bfqq,
-+					     "set_request: clearing in "
-+					     "large burst");
-+				bfq_clear_bfqq_in_large_burst(bfqq);
-+				if (bic->was_in_burst_list)
-+					hlist_add_head(&bfqq->burst_list_node,
-+						       &bfqd->burst_list);
-+			}
-+			bfqq->split_time = jiffies;
-+		}
-+	} else {
-+		/* If the queue was seeky for too long, break it apart. */
-+		if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
-+			bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
-+
-+			/* Update bic before losing reference to bfqq */
-+			if (bfq_bfqq_in_large_burst(bfqq))
-+				bic->saved_in_large_burst = true;
-+
-+			bfqq = bfq_split_bfqq(bic, bfqq);
-+			split = true;
-+			/* NULL: bic slot was cleared, retry to get a new queue */
-+			if (!bfqq)
-+				goto new_queue;
-+			else
-+				bfqq_already_existing = true;
-+		}
-+	}
-+
-+	/* account for rq: per-direction count plus a queue reference */
-+	bfqq->allocated[rw]++;
-+	bfqq->ref++;
-+	bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, bfqq->ref);
-+
-+	rq->elv.priv[0] = bic;
-+	rq->elv.priv[1] = bfqq;
-+
-+	/*
-+	 * If a bfq_queue has only one process reference, it is owned
-+	 * by only one bfq_io_cq: we can set the bic field of the
-+	 * bfq_queue to the address of that structure. Also, if the
-+	 * queue has just been split, mark a flag so that the
-+	 * information is available to the other scheduler hooks.
-+	 */
-+	if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) {
-+		bfqq->bic = bic;
-+		if (split) {
-+			/*
-+			 * If the queue has just been split from a shared
-+			 * queue, restore the idle window and the possible
-+			 * weight raising period.
-+			 */
-+			bfq_bfqq_resume_state(bfqq, bfqd, bic,
-+					      bfqq_already_existing);
-+		}
-+	}
-+
-+	if (unlikely(bfq_bfqq_just_created(bfqq)))
-+		bfq_handle_burst(bfqd, bfqq);
-+
-+	spin_unlock_irqrestore(q->queue_lock, flags);
-+
-+	return 0;
-+
-+queue_fail:
-+	bfq_schedule_dispatch(bfqd);
-+	spin_unlock_irqrestore(q->queue_lock, flags);
-+
-+	return 1;
-+}
-+
-+/*
-+ * Work handler for bfqd->unplug_work: run the request queue with the
-+ * queue lock held.
-+ */
-+static void bfq_kick_queue(struct work_struct *work)
-+{
-+	struct bfq_data *bfqd =
-+		container_of(work, struct bfq_data, unplug_work);
-+	struct request_queue *q = bfqd->queue;
-+
-+	spin_lock_irq(q->queue_lock);
-+	__blk_run_queue(q);
-+	spin_unlock_irq(q->queue_lock);
-+}
-+
-+/*
-+ * Handler of the expiration of the timer running if the in-service queue
-+ * is idling inside its time slice.
-+ *
-+ * Runs with the queue lock taken here; never re-arms the timer
-+ * (always returns HRTIMER_NORESTART).
-+ */
-+static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer)
-+{
-+	struct bfq_data *bfqd = container_of(timer, struct bfq_data,
-+					     idle_slice_timer);
-+	struct bfq_queue *bfqq;
-+	unsigned long flags;
-+	enum bfqq_expiration reason;
-+
-+	spin_lock_irqsave(bfqd->queue->queue_lock, flags);
-+
-+	bfqq = bfqd->in_service_queue;
-+	/*
-+	 * Theoretical race here: the in-service queue can be NULL or
-+	 * different from the queue that was idling if the timer handler
-+	 * spins on the queue_lock and a new request arrives for the
-+	 * current queue and there is a full dispatch cycle that changes
-+	 * the in-service queue.  This can hardly happen, but in the worst
-+	 * case we just expire a queue too early.
-+	 */
-+	if (bfqq) {
-+		bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");
-+		bfq_clear_bfqq_wait_request(bfqq);
-+
-+		if (bfq_bfqq_budget_timeout(bfqq))
-+			/*
-+			 * Also here the queue can be safely expired
-+			 * for budget timeout without wasting
-+			 * guarantees
-+			 */
-+			reason = BFQ_BFQQ_BUDGET_TIMEOUT;
-+		else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
-+			/*
-+			 * The queue may not be empty upon timer expiration,
-+			 * because we may not disable the timer when the
-+			 * first request of the in-service queue arrives
-+			 * during disk idling.
-+			 */
-+			reason = BFQ_BFQQ_TOO_IDLE;
-+		else
-+			/* requests are queued and no timeout: do not expire */
-+			goto schedule_dispatch;
-+
-+		bfq_bfqq_expire(bfqd, bfqq, true, reason);
-+	}
-+
-+schedule_dispatch:
-+	bfq_schedule_dispatch(bfqd);
-+
-+	spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);
-+	return HRTIMER_NORESTART;
-+}
-+
-+/* Cancel the idle-slice timer and wait for pending unplug work. */
-+static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)
-+{
-+	hrtimer_cancel(&bfqd->idle_slice_timer);
-+	cancel_work_sync(&bfqd->unplug_work);
-+}
-+
-+/*
-+ * If *bfqq_ptr holds a queue, move it to the root group, drop one
-+ * reference on it and clear the slot.
-+ */
-+static void __bfq_put_async_bfqq(struct bfq_data *bfqd,
-+					struct bfq_queue **bfqq_ptr)
-+{
-+	struct bfq_group *root_group = bfqd->root_group;
-+	struct bfq_queue *bfqq = *bfqq_ptr;
-+
-+	bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
-+	if (bfqq) {
-+		bfq_bfqq_move(bfqd, bfqq, root_group);
-+		bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
-+			     bfqq, bfqq->ref);
-+		bfq_put_queue(bfqq);
-+		*bfqq_ptr = NULL;
-+	}
-+}
-+
-+/*
-+ * Release all the bfqg references to its async queues.  If we are
-+ * deallocating the group these queues may still contain requests, so
-+ * we reparent them to the root cgroup (i.e., the only one that will
-+ * exist for sure until all the requests on a device are gone).
-+ */
-+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
-+{
-+	int i, j;
-+
-+	/* rows 0 and 1 of async_bfqq, one slot per priority level */
-+	for (i = 0; i < 2; i++)
-+		for (j = 0; j < IOPRIO_BE_NR; j++)
-+			__bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
-+
-+	__bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
-+}
-+
-+/*
-+ * Elevator exit hook: stop the timer and the unplug work, deactivate
-+ * every queue still on the idle list, release the group data (through
-+ * the blkcg policy when group scheduling is configured, directly
-+ * otherwise) and free the bfq_data.
-+ */
-+static void bfq_exit_queue(struct elevator_queue *e)
-+{
-+	struct bfq_data *bfqd = e->elevator_data;
-+	struct request_queue *q = bfqd->queue;
-+	struct bfq_queue *bfqq, *n;
-+
-+	bfq_shutdown_timer_wq(bfqd);
-+
-+	spin_lock_irq(q->queue_lock);
-+
-+	BUG_ON(bfqd->in_service_queue);
-+	list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
-+		bfq_deactivate_bfqq(bfqd, bfqq, false, false);
-+
-+	spin_unlock_irq(q->queue_lock);
-+
-+	/*
-+	 * NOTE(review): second shutdown call; presumably covers a timer
-+	 * or work item re-armed while the lock was held above - confirm.
-+	 */
-+	bfq_shutdown_timer_wq(bfqd);
-+
-+	BUG_ON(hrtimer_active(&bfqd->idle_slice_timer));
-+
-+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED
-+	blkcg_deactivate_policy(q, &blkcg_policy_bfq);
-+#else
-+	bfq_put_async_queues(bfqd, bfqd->root_group);
-+	kfree(bfqd->root_group);
-+#endif
-+
-+	kfree(bfqd);
-+}
-+
-+/*
-+ * Initialize the scheduling data of the root group: empty rq_pos_tree,
-+ * one empty service tree per priority class, and the idle-class
-+ * last-service timestamp set to the current jiffies.
-+ */
-+static void bfq_init_root_group(struct bfq_group *root_group,
-+				struct bfq_data *bfqd)
-+{
-+	int i;
-+
-+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED
-+	root_group->entity.parent = NULL;
-+	root_group->my_entity = NULL;
-+	root_group->bfqd = bfqd;
-+#endif
-+	root_group->rq_pos_tree = RB_ROOT;
-+	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
-+		root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
-+	root_group->sched_data.bfq_class_idle_last_service = jiffies;
-+}
-+
-+/*
-+ * Elevator init hook: allocate and set up the per-device bfq_data and
-+ * attach it to the request queue. Returns 0 on success or -ENOMEM.
-+ */
-+static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
-+{
-+	struct bfq_data *bfqd;
-+	struct elevator_queue *eq;
-+
-+	eq = elevator_alloc(q, e);
-+	if (!eq)
-+		return -ENOMEM;
-+
-+	bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
-+	if (!bfqd) {
-+		kobject_put(&eq->kobj);
-+		return -ENOMEM;
-+	}
-+	eq->elevator_data = bfqd;
-+
-+	/*
-+	 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
-+	 * Grab a permanent reference to it, so that the normal code flow
-+	 * will not attempt to free it.
-+	 */
-+	bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0);
-+	bfqd->oom_bfqq.ref++;
-+	bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
-+	bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE;
-+	bfqd->oom_bfqq.entity.new_weight =
-+		bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio);
-+
-+	/* oom_bfqq does not participate to bursts */
-+	bfq_clear_bfqq_just_created(&bfqd->oom_bfqq);
-+	/*
-+	 * Trigger weight initialization, according to ioprio, at the
-+	 * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio
-+	 * class won't be changed any more.
-+	 */
-+	bfqd->oom_bfqq.entity.prio_changed = 1;
-+
-+	bfqd->queue = q;
-+
-+	spin_lock_irq(q->queue_lock);
-+	q->elevator = eq;
-+	spin_unlock_irq(q->queue_lock);
-+
-+	bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node);
-+	if (!bfqd->root_group)
-+		goto out_free;
-+	bfq_init_root_group(bfqd->root_group, bfqd);
-+	bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);
-+
-+	hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC,
-+		     HRTIMER_MODE_REL);
-+	bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
-+
-+	bfqd->queue_weights_tree = RB_ROOT;
-+	bfqd->group_weights_tree = RB_ROOT;
-+
-+	INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);
-+
-+	INIT_LIST_HEAD(&bfqd->active_list);
-+	INIT_LIST_HEAD(&bfqd->idle_list);
-+	INIT_HLIST_HEAD(&bfqd->burst_list);
-+
-+	bfqd->hw_tag = -1;
-+
-+	bfqd->bfq_max_budget = bfq_default_max_budget;
-+
-+	/* tunables start from the module-level defaults */
-+	bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
-+	bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
-+	bfqd->bfq_back_max = bfq_back_max;
-+	bfqd->bfq_back_penalty = bfq_back_penalty;
-+	bfqd->bfq_slice_idle = bfq_slice_idle;
-+	bfqd->bfq_timeout = bfq_timeout;
-+
-+	bfqd->bfq_requests_within_timer = 120;
-+
-+	bfqd->bfq_large_burst_thresh = 8;
-+	bfqd->bfq_burst_interval = msecs_to_jiffies(180);
-+
-+	bfqd->low_latency = true;
-+
-+	/*
-+	 * Trade-off between
responsiveness and fairness.
-+	 */
-+	bfqd->bfq_wr_coeff = 30;
-+	bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300);
-+	bfqd->bfq_wr_max_time = 0;
-+	bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000);
-+	bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500);
-+	bfqd->bfq_wr_max_softrt_rate = 7000; /*
-+					      * Approximate rate required
-+					      * to playback or record a
-+					      * high-definition compressed
-+					      * video.
-+					      */
-+	bfqd->wr_busy_queues = 0;
-+
-+	/*
-+	 * Begin by assuming, optimistically, that the device is a
-+	 * high-speed one, and that its peak rate is equal to 2/3 of
-+	 * the highest reference rate.
-+	 */
-+	bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] *
-+			T_fast[blk_queue_nonrot(bfqd->queue)];
-+	bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3;
-+	bfqd->device_speed = BFQ_BFQD_FAST;
-+
-+	return 0;
-+
-+out_free:
-+	kfree(bfqd);
-+	kobject_put(&eq->kobj);
-+	return -ENOMEM;
-+}
-+
-+/* Destroy the slab cache used for bfq_queue objects. */
-+static void bfq_slab_kill(void)
-+{
-+	kmem_cache_destroy(bfq_pool);
-+}
-+
-+/* Create the slab cache for bfq_queue objects; -ENOMEM on failure. */
-+static int __init bfq_slab_setup(void)
-+{
-+	bfq_pool = KMEM_CACHE(bfq_queue, 0);
-+	if (!bfq_pool)
-+		return -ENOMEM;
-+	return 0;
-+}
-+
-+/* Print @var in decimal, followed by a newline, into @page. */
-+static ssize_t bfq_var_show(unsigned int var, char *page)
-+{
-+	return sprintf(page, "%u\n", var);
-+}
-+
-+/*
-+ * Parse @page as an unsigned decimal number.
-+ *
-+ * On success the value is stored in *@var and @count is returned, so
-+ * that a sysfs store method can return the result directly. On a
-+ * parse error *@var is left untouched and the negative error code of
-+ * kstrtoul() is returned; callers must not use *@var in that case.
-+ */
-+static ssize_t bfq_var_store(unsigned long *var, const char *page,
-+			     size_t count)
-+{
-+	unsigned long new_val;
-+	int ret = kstrtoul(page, 10, &new_val);
-+
-+	if (ret)
-+		return ret;
-+
-+	*var = new_val;
-+	return count;
-+}
-+
-+/*
-+ * Show the maximum weight-raising duration in milliseconds: the
-+ * user-set value if positive, the automatically computed one otherwise.
-+ */
-+static ssize_t bfq_wr_max_time_show(struct elevator_queue *e, char *page)
-+{
-+	struct bfq_data *bfqd = e->elevator_data;
-+
-+	return sprintf(page, "%d\n", bfqd->bfq_wr_max_time > 0 ?
-+		       jiffies_to_msecs(bfqd->bfq_wr_max_time) :
-+		       jiffies_to_msecs(bfq_wr_duration(bfqd)));
-+}
-+
-+/*
-+ * Dump, for every queue on the active and idle lists, its pid, weight
-+ * and weight-raising timing.
-+ *
-+ * The output buffer of a sysfs show method is one page long, while the
-+ * number of queues is unbounded: every write is therefore bounded with
-+ * scnprintf(), which truncates the dump instead of overrunning @page.
-+ */
-+static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)
-+{
-+	struct bfq_queue *bfqq;
-+	struct bfq_data *bfqd = e->elevator_data;
-+	ssize_t num_char = 0;
-+
-+	num_char += scnprintf(page + num_char, PAGE_SIZE - num_char,
-+			      "Tot reqs queued %d\n\n",
-+			      bfqd->queued);
-+
-+	spin_lock_irq(bfqd->queue->queue_lock);
-+
-+	num_char += scnprintf(page + num_char, PAGE_SIZE - num_char,
-+			      "Active:\n");
-+	list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) {
-+		num_char += scnprintf(page + num_char, PAGE_SIZE - num_char,
-+				      "pid%d: weight %hu, nr_queued %d %d, ",
-+				      bfqq->pid,
-+				      bfqq->entity.weight,
-+				      bfqq->queued[0],
-+				      bfqq->queued[1]);
-+		num_char += scnprintf(page + num_char, PAGE_SIZE - num_char,
-+				      "dur %d/%u\n",
-+				      jiffies_to_msecs(
-+					jiffies -
-+					bfqq->last_wr_start_finish),
-+				      jiffies_to_msecs(bfqq->wr_cur_max_time));
-+	}
-+
-+	num_char += scnprintf(page + num_char, PAGE_SIZE - num_char,
-+			      "Idle:\n");
-+	list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) {
-+		num_char += scnprintf(page + num_char, PAGE_SIZE - num_char,
-+				      "pid%d: weight %hu, dur %d/%u\n",
-+				      bfqq->pid,
-+				      bfqq->entity.weight,
-+				      jiffies_to_msecs(jiffies -
-+					bfqq->last_wr_start_finish),
-+				      jiffies_to_msecs(bfqq->wr_cur_max_time));
-+	}
-+
-+	spin_unlock_irq(bfqd->queue->queue_lock);
-+
-+	return num_char;
-+}
-+
-+/*
-+ * Generate a sysfs show method for the tunable __VAR.
-+ * __CONV selects the unit conversion applied before printing:
-+ * 0 = none, 1 = jiffies to milliseconds, 2 = nanoseconds to milliseconds.
-+ */
-+#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
-+static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
-+{									\
-+	struct bfq_data *bfqd = e->elevator_data;			\
-+	u64 __data = __VAR;						\
-+	if (__CONV == 1)						\
-+		__data = jiffies_to_msecs(__data);			\
-+	else if (__CONV == 2)						\
-+		__data = div_u64(__data, NSEC_PER_MSEC);		\
-+	return bfq_var_show(__data, (page));				\
-+}
-+SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2);
-+SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2);
-+SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
-+SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
-+SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2);
-+SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
-+SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1);
-+SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0);
-+SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
-+SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0);
-+SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1);
-+SHOW_FUNCTION(bfq_wr_min_idle_time_show, bfqd->bfq_wr_min_idle_time, 1);
-+SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async,
-+	1);
-+SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0);
-+#undef SHOW_FUNCTION
-+
-+/* Same as SHOW_FUNCTION, but converting nanoseconds to microseconds. */
-+#define USEC_SHOW_FUNCTION(__FUNC, __VAR)				\
-+static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
-+{									\
-+	struct bfq_data *bfqd = e->elevator_data;			\
-+	u64 __data = __VAR;						\
-+	__data = div_u64(__data, NSEC_PER_USEC);			\
-+	return bfq_var_show(__data, (page));				\
-+}
-+USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle);
-+#undef USEC_SHOW_FUNCTION
-+
-+/*
-+ * Generate a sysfs store method for the tunable pointed to by __PTR.
-+ * The parsed value is clamped to [MIN, MAX] and converted according to
-+ * __CONV: 0 = none, 1 = milliseconds to jiffies, 2 = milliseconds to
-+ * nanoseconds. Unparsable input leaves the tunable unchanged and makes
-+ * the method return the parse error.
-+ */
-+#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
-+static ssize_t								\
-+__FUNC(struct elevator_queue *e, const char *page, size_t count)	\
-+{									\
-+	struct bfq_data *bfqd = e->elevator_data;			\
-+	unsigned long __data = 0;					\
-+	ssize_t ret = bfq_var_store(&__data, (page), count);		\
-+	if (ret < 0)							\
-+		return ret;						\
-+	if (__data < (MIN))						\
-+		__data = (MIN);						\
-+	else if (__data > (MAX))					\
-+		__data = (MAX);						\
-+	if (__CONV == 1)						\
-+		*(__PTR) = msecs_to_jiffies(__data);			\
-+	else if (__CONV == 2)						\
-+		*(__PTR) = (u64)__data * NSEC_PER_MSEC;			\
-+	else								\
-+		*(__PTR) = __data;					\
-+	return ret;							\
-+}
-+STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
-+		INT_MAX, 2);
-+STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
-+		INT_MAX, 2);
-+STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
-+STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
-+		INT_MAX, 0);
-+STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2);
-+STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0);
-+STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1);
-+STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX,
-+		1);
-+STORE_FUNCTION(bfq_wr_min_idle_time_store, &bfqd->bfq_wr_min_idle_time, 0,
-+		INT_MAX, 1);
-+STORE_FUNCTION(bfq_wr_min_inter_arr_async_store,
-+		&bfqd->bfq_wr_min_inter_arr_async, 0, INT_MAX, 1);
-+STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0,
-+		INT_MAX, 0);
-+#undef STORE_FUNCTION
-+
-+/* Same as STORE_FUNCTION, but converting microseconds to nanoseconds. */
-+#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX)			\
-+static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\
-+{									\
-+	struct bfq_data *bfqd = e->elevator_data;			\
-+	unsigned long __data = 0;					\
-+	ssize_t ret = bfq_var_store(&__data, (page), count);		\
-+	if (ret < 0)							\
-+		return ret;						\
-+	if (__data < (MIN))						\
-+		__data = (MIN);						\
-+	else if (__data > (MAX))					\
-+		__data = (MAX);						\
-+	*(__PTR) = (u64)__data * NSEC_PER_USEC;				\
-+	return ret;							\
-+}
-+USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0,
-+		    UINT_MAX);
-+#undef USEC_STORE_FUNCTION
-+
-+/* do nothing for the moment */
-+static ssize_t bfq_weights_store(struct elevator_queue *e,
-+				    const char *page, size_t count)
-+{
-+	return count;
-+}
-+
-+/*
-+ * Set the maximum budget. Writing 0 selects the automatically computed
-+ * budget; any other value is used as is, capped at INT_MAX. The value
-+ * written is remembered in bfq_user_max_budget. Unparsable input is
-+ * rejected without changing anything.
-+ */
-+static ssize_t bfq_max_budget_store(struct elevator_queue *e,
-+				    const char *page, size_t count)
-+{
-+	struct bfq_data *bfqd = e->elevator_data;
-+	unsigned long __data = 0;
-+	ssize_t ret = bfq_var_store(&__data, (page), count);
-+
-+	if (ret < 0)
-+		return ret;
-+
-+	if (__data == 0)
-+		bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);
-+	else {
-+		if (__data > INT_MAX)
-+			__data = INT_MAX;
-+		bfqd->bfq_max_budget = __data;
-+	}
-+
-+	bfqd->bfq_user_max_budget = __data;
-+
-+	return ret;
-+}
-+
-+/*
-+ * Leaving this name to preserve name compatibility with cfq
-+ * parameters, but this timeout is used for both sync and async.
-+ *
-+ * The value is given in milliseconds and clamped to [1, INT_MAX]. If
-+ * the user has not set a maximum budget, the automatic one is
-+ * recomputed. Unparsable input is rejected without changing anything.
-+ */
-+static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
-+				      const char *page, size_t count)
-+{
-+	struct bfq_data *bfqd = e->elevator_data;
-+	unsigned long __data = 0;
-+	ssize_t ret = bfq_var_store(&__data, (page), count);
-+
-+	if (ret < 0)
-+		return ret;
-+
-+	if (__data < 1)
-+		__data = 1;
-+	else if (__data > INT_MAX)
-+		__data = INT_MAX;
-+
-+	bfqd->bfq_timeout = msecs_to_jiffies(__data);
-+	if (bfqd->bfq_user_max_budget == 0)
-+		bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);
-+
-+	return ret;
-+}
-+
-+/*
-+ * Enable (non-zero) or disable (0) strict guarantees. When switching
-+ * them on, bfq_slice_idle is raised to at least 8 ms. Unparsable input
-+ * is rejected without changing anything.
-+ */
-+static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e,
-+				     const char *page, size_t count)
-+{
-+	struct bfq_data *bfqd = e->elevator_data;
-+	unsigned long __data = 0;
-+	ssize_t ret = bfq_var_store(&__data, (page), count);
-+
-+	if (ret < 0)
-+		return ret;
-+
-+	if (__data > 1)
-+		__data = 1;
-+	if (!bfqd->strict_guarantees && __data == 1
-+	    && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC)
-+		bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC;
-+
-+	bfqd->strict_guarantees = __data;
-+
-+	return ret;
-+}
-+
-+/*
-+ * Enable (non-zero) or disable (0) low-latency mode. Switching it off
-+ * ends weight raising through bfq_end_wr(). Unparsable input is
-+ * rejected without changing anything.
-+ */
-+static ssize_t bfq_low_latency_store(struct elevator_queue *e,
-+				     const char *page, size_t count)
-+{
-+	struct bfq_data *bfqd = e->elevator_data;
-+	unsigned long __data = 0;
-+	ssize_t ret = bfq_var_store(&__data, (page), count);
-+
-+	if (ret < 0)
-+		return ret;
-+
-+	if (__data > 1)
-+		__data = 1;
-+	if (__data == 0 && bfqd->low_latency != 0)
-+		bfq_end_wr(bfqd);
-+	bfqd->low_latency = __data;
-+
-+	return ret;
-+}
-+
-+#define BFQ_ATTR(name) \
-+	__ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)
-+
-+static struct elv_fs_entry bfq_attrs[] = {
-+	BFQ_ATTR(fifo_expire_sync),
-+	BFQ_ATTR(fifo_expire_async),
-+	BFQ_ATTR(back_seek_max),
-+	BFQ_ATTR(back_seek_penalty),
-+	BFQ_ATTR(slice_idle),
-+	BFQ_ATTR(slice_idle_us),
-+	BFQ_ATTR(max_budget),
-+	BFQ_ATTR(timeout_sync),
-+	BFQ_ATTR(strict_guarantees),
-+	BFQ_ATTR(low_latency),
-+	BFQ_ATTR(wr_coeff),
-+	BFQ_ATTR(wr_max_time),
-+	BFQ_ATTR(wr_rt_max_time),
-+	BFQ_ATTR(wr_min_idle_time),
-+	BFQ_ATTR(wr_min_inter_arr_async),
-+
BFQ_ATTR(wr_max_softrt_rate), -+ BFQ_ATTR(weights), -+ __ATTR_NULL -+}; -+ -+static struct elevator_type iosched_bfq = { -+ .ops.sq = { -+ .elevator_merge_fn = bfq_merge, -+ .elevator_merged_fn = bfq_merged_request, -+ .elevator_merge_req_fn = bfq_merged_requests, -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ .elevator_bio_merged_fn = bfq_bio_merged, -+#endif -+ .elevator_allow_bio_merge_fn = bfq_allow_bio_merge, -+ .elevator_allow_rq_merge_fn = bfq_allow_rq_merge, -+ .elevator_dispatch_fn = bfq_dispatch_requests, -+ .elevator_add_req_fn = bfq_insert_request, -+ .elevator_activate_req_fn = bfq_activate_request, -+ .elevator_deactivate_req_fn = bfq_deactivate_request, -+ .elevator_completed_req_fn = bfq_completed_request, -+ .elevator_former_req_fn = elv_rb_former_request, -+ .elevator_latter_req_fn = elv_rb_latter_request, -+ .elevator_init_icq_fn = bfq_init_icq, -+ .elevator_exit_icq_fn = bfq_exit_icq, -+ .elevator_set_req_fn = bfq_set_request, -+ .elevator_put_req_fn = bfq_put_request, -+ .elevator_may_queue_fn = bfq_may_queue, -+ .elevator_init_fn = bfq_init_queue, -+ .elevator_exit_fn = bfq_exit_queue, -+ }, -+ .icq_size = sizeof(struct bfq_io_cq), -+ .icq_align = __alignof__(struct bfq_io_cq), -+ .elevator_attrs = bfq_attrs, -+ .elevator_name = "bfq-sq", -+ .elevator_owner = THIS_MODULE, -+}; -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+static struct blkcg_policy blkcg_policy_bfq = { -+ .dfl_cftypes = bfq_blkg_files, -+ .legacy_cftypes = bfq_blkcg_legacy_files, -+ -+ .cpd_alloc_fn = bfq_cpd_alloc, -+ .cpd_init_fn = bfq_cpd_init, -+ .cpd_bind_fn = bfq_cpd_init, -+ .cpd_free_fn = bfq_cpd_free, -+ -+ .pd_alloc_fn = bfq_pd_alloc, -+ .pd_init_fn = bfq_pd_init, -+ .pd_offline_fn = bfq_pd_offline, -+ .pd_free_fn = bfq_pd_free, -+ .pd_reset_stats_fn = bfq_pd_reset_stats, -+}; -+#endif -+ -+static int __init bfq_init(void) -+{ -+ int ret; -+ char msg[60] = "BFQ I/O-scheduler: v8r12"; -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ ret = blkcg_policy_register(&blkcg_policy_bfq); -+ 
if (ret) -+ return ret; -+#endif -+ -+ ret = -ENOMEM; -+ if (bfq_slab_setup()) -+ goto err_pol_unreg; -+ -+ /* -+ * Times to load large popular applications for the typical -+ * systems installed on the reference devices (see the -+ * comments before the definitions of the next two -+ * arrays). Actually, we use slightly slower values, as the -+ * estimated peak rate tends to be smaller than the actual -+ * peak rate. The reason for this last fact is that estimates -+ * are computed over much shorter time intervals than the long -+ * intervals typically used for benchmarking. Why? First, to -+ * adapt more quickly to variations. Second, because an I/O -+ * scheduler cannot rely on a peak-rate-evaluation workload to -+ * be run for a long time. -+ */ -+ T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */ -+ T_slow[1] = msecs_to_jiffies(6000); /* actually 6.5 sec */ -+ T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */ -+ T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */ -+ -+ /* -+ * Thresholds that determine the switch between speed classes -+ * (see the comments before the definition of the array -+ * device_speed_thresh). These thresholds are biased towards -+ * transitions to the fast class. This is safer than the -+ * opposite bias. In fact, a wrong transition to the slow -+ * class results in short weight-raising periods, because the -+ * speed of the device then tends to be higher that the -+ * reference peak rate. On the opposite end, a wrong -+ * transition to the fast class tends to increase -+ * weight-raising periods, because of the opposite reason. 
-+ */ -+ device_speed_thresh[0] = (4 * R_slow[0]) / 3; -+ device_speed_thresh[1] = (4 * R_slow[1]) / 3; -+ -+ ret = elv_register(&iosched_bfq); -+ if (ret) -+ goto err_pol_unreg; -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ strcat(msg, " (with cgroups support)"); -+#endif -+ pr_info("%s", msg); -+ -+ return 0; -+ -+err_pol_unreg: -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ blkcg_policy_unregister(&blkcg_policy_bfq); -+#endif -+ return ret; -+} -+ -+static void __exit bfq_exit(void) -+{ -+ elv_unregister(&iosched_bfq); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ blkcg_policy_unregister(&blkcg_policy_bfq); -+#endif -+ bfq_slab_kill(); -+} -+ -+module_init(bfq_init); -+module_exit(bfq_exit); -+ -+MODULE_AUTHOR("Arianna Avanzini, Fabio Checconi, Paolo Valente"); -+MODULE_LICENSE("GPL"); - -From e24d2e6461479dbd13d58be2dc44b23b5e24487c Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Mon, 19 Dec 2016 17:13:39 +0100 -Subject: [PATCH 07/51] Add config and build bits for bfq-mq-iosched - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/Kconfig.iosched | 10 +++++++++ - block/Makefile | 1 + - block/bfq-cgroup-included.c | 4 ++-- - block/bfq-mq-iosched.c | 25 ++++++++++++----------- - block/bfq-sched.c | 50 ++++++++++++++++++++++----------------------- - block/bfq-sq-iosched.c | 24 +++++++++++----------- - block/bfq.h | 36 +++++++++++++++++++++----------- - 8 files changed, 88 insertions(+), 64 deletions(-) - -diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched -index 9e3f4c2f7390..2d94af3d8b0a 100644 ---- a/block/Kconfig.iosched -+++ b/block/Kconfig.iosched -@@ -96,6 +96,16 @@ config DEFAULT_IOSCHED - default "bfq-sq" if DEFAULT_BFQ_SQ - default "noop" if DEFAULT_NOOP - -+config MQ_IOSCHED_BFQ -+ tristate "BFQ-MQ I/O Scheduler" -+ default y -+ ---help--- -+ BFQ I/O scheduler for BLK-MQ. BFQ-MQ distributes bandwidth -+ among all processes according to their weights, regardless of -+ the device parameters and with any workload. 
It also -+ guarantees a low latency to interactive and soft real-time -+ applications. Details in Documentation/block/bfq-iosched.txt -+ - config MQ_IOSCHED_DEADLINE - tristate "MQ deadline I/O scheduler" - default y -diff --git a/block/Makefile b/block/Makefile -index 59026b425791..a571329c23f0 100644 ---- a/block/Makefile -+++ b/block/Makefile -@@ -25,6 +25,7 @@ obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o - bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o - obj-$(CONFIG_IOSCHED_BFQ) += bfq.o - obj-$(CONFIG_IOSCHED_BFQ_SQ) += bfq-sq-iosched.o -+obj-$(CONFIG_MQ_IOSCHED_BFQ) += bfq-mq-iosched.o - - obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o - obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o -diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c -index af7c216a3540..9c483b658179 100644 ---- a/block/bfq-cgroup-included.c -+++ b/block/bfq-cgroup-included.c -@@ -15,7 +15,7 @@ - * file. - */ - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - - /* bfqg stats flags */ - enum bfqg_stats_flags { -@@ -1116,7 +1116,7 @@ static struct cftype bfq_blkg_files[] = { - {} /* terminate */ - }; - --#else /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ -+#else /* BFQ_GROUP_IOSCHED_ENABLED */ - - static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg, - struct bfq_queue *bfqq, unsigned int op) { } -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 30d019fc67e0..e88e00f1e0a7 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -82,6 +82,7 @@ - #include <linux/rbtree.h> - #include <linux/ioprio.h> - #include "blk.h" -+#undef CONFIG_BFQ_GROUP_IOSCHED /* cgroups support not yet functional */ - #include "bfq.h" - - /* Expiration time of sync (0) and async (1) requests, in ns. 
*/ -@@ -387,7 +388,7 @@ static bool bfq_differentiated_weights(struct bfq_data *bfqd) - return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && - (bfqd->queue_weights_tree.rb_node->rb_left || - bfqd->queue_weights_tree.rb_node->rb_right) --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - ) || - (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) && - (bfqd->group_weights_tree.rb_node->rb_left || -@@ -1672,7 +1673,7 @@ static void bfq_merged_request(struct request_queue *q, struct request *req, - } - } - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - static void bfq_bio_merged(struct request_queue *q, struct request *req, - struct bio *bio) - { -@@ -3879,7 +3880,7 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) - */ - static void bfq_put_queue(struct bfq_queue *bfqq) - { --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - struct bfq_group *bfqg = bfqq_group(bfqq); - #endif - -@@ -3909,7 +3910,7 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); - - kmem_cache_free(bfq_pool, bfqq); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - bfqg_put(bfqg); - #endif - } -@@ -4835,7 +4836,7 @@ static void bfq_exit_queue(struct elevator_queue *e) - - BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - blkcg_deactivate_policy(q, &blkcg_policy_bfq); - #else - bfq_put_async_queues(bfqd, bfqd->root_group); -@@ -4850,7 +4851,7 @@ static void bfq_init_root_group(struct bfq_group *root_group, - { - int i; - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - root_group->entity.parent = NULL; - root_group->my_entity = NULL; - root_group->bfqd = bfqd; -@@ -5265,7 +5266,7 @@ static struct elevator_type iosched_bfq = { - .elevator_merge_fn = bfq_merge, - .elevator_merged_fn = bfq_merged_request, - .elevator_merge_req_fn = 
bfq_merged_requests, --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - .elevator_bio_merged_fn = bfq_bio_merged, - #endif - .elevator_allow_bio_merge_fn = bfq_allow_bio_merge, -@@ -5292,7 +5293,7 @@ static struct elevator_type iosched_bfq = { - .elevator_owner = THIS_MODULE, - }; - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - static struct blkcg_policy blkcg_policy_bfq = { - .dfl_cftypes = bfq_blkg_files, - .legacy_cftypes = bfq_blkcg_legacy_files, -@@ -5315,7 +5316,7 @@ static int __init bfq_init(void) - int ret; - char msg[60] = "BFQ I/O-scheduler: v8r12"; - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - ret = blkcg_policy_register(&blkcg_policy_bfq); - if (ret) - return ret; -@@ -5362,7 +5363,7 @@ static int __init bfq_init(void) - if (ret) - goto err_pol_unreg; - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - strcat(msg, " (with cgroups support)"); - #endif - pr_info("%s", msg); -@@ -5370,7 +5371,7 @@ static int __init bfq_init(void) - return 0; - - err_pol_unreg: --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - blkcg_policy_unregister(&blkcg_policy_bfq); - #endif - return ret; -@@ -5379,7 +5380,7 @@ static int __init bfq_init(void) - static void __exit bfq_exit(void) - { - elv_unregister(&iosched_bfq); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - blkcg_policy_unregister(&blkcg_policy_bfq); - #endif - bfq_slab_kill(); -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index 5c0f9290a79c..b54a638186e3 100644 ---- a/block/bfq-sched.c -+++ b/block/bfq-sched.c -@@ -136,7 +136,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, - if (bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, - "update_next_in_service: chosen this queue"); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - else { - struct bfq_group *bfqg = - container_of(next_in_service, -@@ -149,7 +149,7 @@ static 
bool bfq_update_next_in_service(struct bfq_sched_data *sd, - return parent_sched_may_change; - } - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - /* both next loops stop at one of the child entities of the root group */ - #define for_each_entity(entity) \ - for (; entity ; entity = entity->parent) -@@ -243,7 +243,7 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) - return false; - } - --#else /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ -+#else /* BFQ_GROUP_IOSCHED_ENABLED */ - #define for_each_entity(entity) \ - for (; entity ; entity = NULL) - -@@ -260,7 +260,7 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) - return true; - } - --#endif /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ -+#endif /* BFQ_GROUP_IOSCHED_ENABLED */ - - /* - * Shift for timestamp calculations. This actually limits the maximum -@@ -323,7 +323,7 @@ static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service) - bfq_log_bfqq(bfqq->bfqd, bfqq, - "calc_finish: start %llu, finish %llu, delta %llu", - start, finish, delta); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - } else { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); -@@ -473,7 +473,7 @@ static void bfq_update_active_node(struct rb_node *node) - bfq_log_bfqq(bfqq->bfqd, bfqq, - "update_active_node: new min_start %llu", - ((entity->min_start>>10)*1000)>>12); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - } else { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); -@@ -540,7 +540,7 @@ static void bfq_active_insert(struct bfq_service_tree *st, - { - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - struct rb_node *node = &entity->rb_node; --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - struct bfq_sched_data *sd = NULL; - struct bfq_group *bfqg = NULL; - struct bfq_data *bfqd = NULL; -@@ -555,7 +555,7 @@ static void 
bfq_active_insert(struct bfq_service_tree *st, - - bfq_update_active_tree(node); - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - sd = entity->sched_data; - bfqg = container_of(sd, struct bfq_group, sched_data); - BUG_ON(!bfqg); -@@ -563,7 +563,7 @@ static void bfq_active_insert(struct bfq_service_tree *st, - #endif - if (bfqq) - list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - else { /* bfq_group */ - BUG_ON(!bfqd); - bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree); -@@ -652,7 +652,7 @@ static void bfq_active_extract(struct bfq_service_tree *st, - { - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - struct rb_node *node; --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - struct bfq_sched_data *sd = NULL; - struct bfq_group *bfqg = NULL; - struct bfq_data *bfqd = NULL; -@@ -664,7 +664,7 @@ static void bfq_active_extract(struct bfq_service_tree *st, - if (node) - bfq_update_active_tree(node); - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - sd = entity->sched_data; - bfqg = container_of(sd, struct bfq_group, sched_data); - BUG_ON(!bfqg); -@@ -672,7 +672,7 @@ static void bfq_active_extract(struct bfq_service_tree *st, - #endif - if (bfqq) - list_del(&bfqq->bfqq_list); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - else { /* bfq_group */ - BUG_ON(!bfqd); - bfq_weights_tree_remove(bfqd, entity, -@@ -809,14 +809,14 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, - unsigned int prev_weight, new_weight; - struct bfq_data *bfqd = NULL; - struct rb_root *root; --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - struct bfq_sched_data *sd; - struct bfq_group *bfqg; - #endif - - if (bfqq) - bfqd = bfqq->bfqd; --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - else { - sd = entity->my_sched_data; - bfqg = 
container_of(sd, struct bfq_group, sched_data); -@@ -907,7 +907,7 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, - return new_st; - } - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg); - #endif - -@@ -936,7 +936,7 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) - st->vtime += bfq_delta(served, st->wsum); - bfq_forget_idle(st); - } --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - bfqg_stats_set_start_empty_time(bfqq_group(bfqq)); - #endif - st = bfq_entity_service_tree(&bfqq->entity); -@@ -1060,7 +1060,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, - bfq_log_bfqq(bfqq->bfqd, bfqq, - "__activate_entity: new queue finish %llu", - ((entity->finish>>10)*1000)>>12); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - } else { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); -@@ -1078,7 +1078,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, - bfq_log_bfqq(bfqq->bfqd, bfqq, - "__activate_entity: queue %seligible in st %p", - entity->start <= st->vtime ? 
"" : "non ", st); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - } else { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); -@@ -1153,7 +1153,7 @@ static void __bfq_activate_entity(struct bfq_entity *entity, - - BUG_ON(entity->on_st && bfqq); - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - if (entity->on_st && !bfqq) { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, -@@ -1485,7 +1485,7 @@ static void bfq_deactivate_entity(struct bfq_entity *entity, - if (bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, - "invoking udpdate_next for this queue"); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - else { - struct bfq_group *bfqg = - container_of(entity, -@@ -1525,7 +1525,7 @@ static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st) - bfq_log_bfqq(bfqq->bfqd, bfqq, - "calc_vtime_jump: new value %llu", - root_entity->min_start); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - else { - struct bfq_group *bfqg = - container_of(root_entity, struct bfq_group, -@@ -1661,7 +1661,7 @@ __bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service - "__lookup_next: start %llu vtime %llu st %p", - ((entity->start>>10)*1000)>>12, - ((new_vtime>>10)*1000)>>12, st); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - else { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); -@@ -1735,7 +1735,7 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd) - if (bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, "chosen from st %p %d", - st + class_idx, class_idx); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - else { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); -@@ -1777,7 +1777,7 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) - */ - sd = &bfqd->root_group->sched_data; - for (; sd ; sd = 
entity->my_sched_data) { --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - if (entity) { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); -@@ -1867,7 +1867,7 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) - bfq_log_bfqq(bfqd, bfqq, - "get_next_queue: this queue, finish %llu", - (((entity->finish>>10)*1000)>>10)>>2); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - else { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index 30d019fc67e0..25da0d1c0622 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -387,7 +387,7 @@ static bool bfq_differentiated_weights(struct bfq_data *bfqd) - return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && - (bfqd->queue_weights_tree.rb_node->rb_left || - bfqd->queue_weights_tree.rb_node->rb_right) --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - ) || - (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) && - (bfqd->group_weights_tree.rb_node->rb_left || -@@ -1672,7 +1672,7 @@ static void bfq_merged_request(struct request_queue *q, struct request *req, - } - } - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - static void bfq_bio_merged(struct request_queue *q, struct request *req, - struct bio *bio) - { -@@ -3879,7 +3879,7 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) - */ - static void bfq_put_queue(struct bfq_queue *bfqq) - { --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - struct bfq_group *bfqg = bfqq_group(bfqq); - #endif - -@@ -3909,7 +3909,7 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); - - kmem_cache_free(bfq_pool, bfqq); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - bfqg_put(bfqg); - #endif - } -@@ -4835,7 +4835,7 @@ static void 
bfq_exit_queue(struct elevator_queue *e) - - BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - blkcg_deactivate_policy(q, &blkcg_policy_bfq); - #else - bfq_put_async_queues(bfqd, bfqd->root_group); -@@ -4850,7 +4850,7 @@ static void bfq_init_root_group(struct bfq_group *root_group, - { - int i; - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - root_group->entity.parent = NULL; - root_group->my_entity = NULL; - root_group->bfqd = bfqd; -@@ -5265,7 +5265,7 @@ static struct elevator_type iosched_bfq = { - .elevator_merge_fn = bfq_merge, - .elevator_merged_fn = bfq_merged_request, - .elevator_merge_req_fn = bfq_merged_requests, --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - .elevator_bio_merged_fn = bfq_bio_merged, - #endif - .elevator_allow_bio_merge_fn = bfq_allow_bio_merge, -@@ -5292,7 +5292,7 @@ static struct elevator_type iosched_bfq = { - .elevator_owner = THIS_MODULE, - }; - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - static struct blkcg_policy blkcg_policy_bfq = { - .dfl_cftypes = bfq_blkg_files, - .legacy_cftypes = bfq_blkcg_legacy_files, -@@ -5315,7 +5315,7 @@ static int __init bfq_init(void) - int ret; - char msg[60] = "BFQ I/O-scheduler: v8r12"; - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - ret = blkcg_policy_register(&blkcg_policy_bfq); - if (ret) - return ret; -@@ -5362,7 +5362,7 @@ static int __init bfq_init(void) - if (ret) - goto err_pol_unreg; - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - strcat(msg, " (with cgroups support)"); - #endif - pr_info("%s", msg); -@@ -5370,7 +5370,7 @@ static int __init bfq_init(void) - return 0; - - err_pol_unreg: --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - blkcg_policy_unregister(&blkcg_policy_bfq); - #endif - return ret; -@@ -5379,7 +5379,7 @@ static int __init bfq_init(void) 
- static void __exit bfq_exit(void) - { - elv_unregister(&iosched_bfq); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - blkcg_policy_unregister(&blkcg_policy_bfq); - #endif - bfq_slab_kill(); -diff --git a/block/bfq.h b/block/bfq.h -index 34fc4697fd89..53954d1b87f8 100644 ---- a/block/bfq.h -+++ b/block/bfq.h -@@ -19,6 +19,18 @@ - #include <linux/hrtimer.h> - #include <linux/blk-cgroup.h> - -+/* -+ * Define an alternative macro to compile cgroups support. This is one -+ * of the steps needed to let bfq-mq share the files bfq-sched.c and -+ * bfq-cgroup.c with bfq-sq. For bfq-mq, the macro -+ * BFQ_GROUP_IOSCHED_ENABLED will be defined as a function of whether -+ * the configuration option CONFIG_BFQ_MQ_GROUP_IOSCHED, and not -+ * CONFIG_BFQ_GROUP_IOSCHED, is defined. -+ */ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#define BFQ_GROUP_IOSCHED_ENABLED -+#endif -+ - #define BFQ_IOPRIO_CLASSES 3 - #define BFQ_CL_IDLE_TIMEOUT (HZ/5) - -@@ -344,7 +356,7 @@ struct bfq_io_cq { - struct bfq_ttime ttime; - /* per (request_queue, blkcg) ioprio */ - int ioprio; --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - uint64_t blkcg_serial_nr; /* the current blkcg serial */ - #endif - -@@ -671,7 +683,7 @@ static const char *checked_dev_name(const struct device *dev) - return nodev; - } - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); - static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - -@@ -696,7 +708,7 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - __pbuf, ##args); \ - } while (0) - --#else /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ -+#else /* BFQ_GROUP_IOSCHED_ENABLED */ - - #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ - pr_crit("%s bfq%d%c " fmt "\n", \ -@@ -705,7 +717,7 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - ##args) - #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) 
do {} while (0) - --#endif /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ -+#endif /* BFQ_GROUP_IOSCHED_ENABLED */ - - #define bfq_log(bfqd, fmt, args...) \ - pr_crit("%s bfq " fmt "\n", \ -@@ -713,7 +725,7 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - ##args) - - #else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); - static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - -@@ -735,7 +747,7 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \ - } while (0) - --#else /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ -+#else /* BFQ_GROUP_IOSCHED_ENABLED */ - - #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ - blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \ -@@ -743,7 +755,7 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - ##args) - #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) - --#endif /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ -+#endif /* BFQ_GROUP_IOSCHED_ENABLED */ - - #define bfq_log(bfqd, fmt, args...) \ - blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) -@@ -763,7 +775,7 @@ enum bfqq_expiration { - - - struct bfqg_stats { --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - /* number of ios merged */ - struct blkg_rwstat merged; - /* total time spent on device in ns, may not be accurate w/ queueing */ -@@ -794,7 +806,7 @@ struct bfqg_stats { - #endif - }; - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - /* - * struct bfq_group_data - per-blkcg storage for the blkio subsystem. 
- * -@@ -895,7 +907,7 @@ bfq_entity_service_tree(struct bfq_entity *entity) - bfq_log_bfqq(bfqq->bfqd, bfqq, - "entity_service_tree %p %d", - sched_data->service_tree + idx, idx); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - else { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); -@@ -924,7 +936,7 @@ static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) - return bic->icq.q->elevator->elevator_data; - } - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - - static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) - { -@@ -953,7 +965,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, - struct bfq_io_cq *bic); - static void bfq_end_wr_async_queues(struct bfq_data *bfqd, - struct bfq_group *bfqg); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); - #endif - static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); - -From add91dbd756cf8ca3aa3add9a19eef742d5fca6b Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Fri, 20 Jan 2017 09:18:25 +0100 -Subject: [PATCH 08/51] Increase max policies for io controller - -To let bfq-mq policy be plugged too (however cgroups -support is not yet functional in bfq-mq). - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - include/linux/blkdev.h | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h -index bf000c58644b..10f892ca585d 100644 ---- a/include/linux/blkdev.h -+++ b/include/linux/blkdev.h -@@ -54,7 +54,7 @@ struct blk_stat_callback; - * Maximum number of blkcg policies allowed to be registered concurrently. - * Defined here to simplify include dependency. 
- */ --#define BLKCG_MAX_POLS 4 -+#define BLKCG_MAX_POLS 5 - - typedef void (rq_end_io_fn)(struct request *, blk_status_t); - - -From 2c39a1d9ab4516d44e01e96f19f578b927e7f2e9 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Mon, 19 Dec 2016 18:11:33 +0100 -Subject: [PATCH 09/51] Copy header file bfq.h as bfq-mq.h - -This commit introduces the header file bfq-mq.h, that will play -for bfq-mq-iosched.c the same role that bfq.h plays for bfq-iosched.c. - -For the moment, the file bfq-mq.h is just a copy of bfq.h. - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 2 +- - block/bfq-mq.h | 973 +++++++++++++++++++++++++++++++++++++++++++++++++ - 2 files changed, 974 insertions(+), 1 deletion(-) - create mode 100644 block/bfq-mq.h - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index e88e00f1e0a7..d1125aee658c 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -83,7 +83,7 @@ - #include <linux/ioprio.h> - #include "blk.h" - #undef CONFIG_BFQ_GROUP_IOSCHED /* cgroups support not yet functional */ --#include "bfq.h" -+#include "bfq-mq.h" - - /* Expiration time of sync (0) and async (1) requests, in ns. */ - static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; -diff --git a/block/bfq-mq.h b/block/bfq-mq.h -new file mode 100644 -index 000000000000..53954d1b87f8 ---- /dev/null -+++ b/block/bfq-mq.h -@@ -0,0 +1,973 @@ -+/* -+ * BFQ v8r12 for 4.11.0: data structures and common functions prototypes. 
-+ * -+ * Based on ideas and code from CFQ: -+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> -+ * -+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> -+ * Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org> -+ */ -+ -+#ifndef _BFQ_H -+#define _BFQ_H -+ -+#include <linux/blktrace_api.h> -+#include <linux/hrtimer.h> -+#include <linux/blk-cgroup.h> -+ -+/* -+ * Define an alternative macro to compile cgroups support. This is one -+ * of the steps needed to let bfq-mq share the files bfq-sched.c and -+ * bfq-cgroup.c with bfq-sq. For bfq-mq, the macro -+ * BFQ_GROUP_IOSCHED_ENABLED will be defined as a function of whether -+ * the configuration option CONFIG_BFQ_MQ_GROUP_IOSCHED, and not -+ * CONFIG_BFQ_GROUP_IOSCHED, is defined. -+ */ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#define BFQ_GROUP_IOSCHED_ENABLED -+#endif -+ -+#define BFQ_IOPRIO_CLASSES 3 -+#define BFQ_CL_IDLE_TIMEOUT (HZ/5) -+ -+#define BFQ_MIN_WEIGHT 1 -+#define BFQ_MAX_WEIGHT 1000 -+#define BFQ_WEIGHT_CONVERSION_COEFF 10 -+ -+#define BFQ_DEFAULT_QUEUE_IOPRIO 4 -+ -+#define BFQ_WEIGHT_LEGACY_DFL 100 -+#define BFQ_DEFAULT_GRP_IOPRIO 0 -+#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE -+ -+/* -+ * Soft real-time applications are extremely more latency sensitive -+ * than interactive ones. Over-raise the weight of the former to -+ * privilege them against the latter. -+ */ -+#define BFQ_SOFTRT_WEIGHT_FACTOR 100 -+ -+struct bfq_entity; -+ -+/** -+ * struct bfq_service_tree - per ioprio_class service tree. -+ * -+ * Each service tree represents a B-WF2Q+ scheduler on its own. Each -+ * ioprio_class has its own independent scheduler, and so its own -+ * bfq_service_tree. All the fields are protected by the queue lock -+ * of the containing bfqd. 
-+ */
-+struct bfq_service_tree {
-+	/* tree for active entities (i.e., those backlogged) */
-+	struct rb_root active;
-+	/* tree for idle entities (i.e., not backlogged, with V <= F_i) */
-+	struct rb_root idle;
-+
-+	struct bfq_entity *first_idle;	/* idle entity with minimum F_i */
-+	struct bfq_entity *last_idle;	/* idle entity with maximum F_i */
-+
-+	u64 vtime; /* scheduler virtual time */
-+	/* scheduler weight sum; active and idle entities contribute to it */
-+	unsigned long wsum;
-+};
-+
-+/**
-+ * struct bfq_sched_data - multi-class scheduler.
-+ *
-+ * bfq_sched_data is the basic scheduler queue. It supports three
-+ * ioprio_classes, and can be used either as a toplevel queue or as an
-+ * intermediate queue in a hierarchical setup.
-+ *
-+ * The supported ioprio_classes are the same as in CFQ, in descending
-+ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
-+ * Requests from higher priority queues are served before all the
-+ * requests from lower priority queues; among requests of the same
-+ * queue, requests are served according to B-WF2Q+.
-+ *
-+ * The schedule is implemented by the service trees, plus the field
-+ * @next_in_service, which points to the entity on the active trees
-+ * that will be served next, if 1) no change in the schedule occurs
-+ * before the current in-service entity is expired, 2) the in-service
-+ * queue becomes idle when it expires, and 3) if the entity pointed by
-+ * in_service_entity is not a queue, then the in-service child entity
-+ * of the entity pointed by in_service_entity becomes idle on
-+ * expiration. This peculiar definition allows for the following
-+ * optimization, not yet exploited: while a given entity is still in
-+ * service, we already know which is the best candidate for next
-+ * service among the other active entities in the same parent
-+ * entity. We can then quickly compare the timestamps of the
-+ * in-service entity with those of such best candidate.
-+ *
-+ * All the fields are protected by the queue lock of the containing
-+ * bfqd.
-+ */
-+struct bfq_sched_data {
-+	struct bfq_entity *in_service_entity;  /* entity in service */
-+	/* head-of-the-line entity in the scheduler (see comments above) */
-+	struct bfq_entity *next_in_service;
-+	/* array of service trees, one per ioprio_class */
-+	struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
-+	/* last time CLASS_IDLE was served */
-+	unsigned long bfq_class_idle_last_service;
-+
-+};
-+
-+/**
-+ * struct bfq_weight_counter - counter of the number of all active entities
-+ *                             with a given weight.
-+ */
-+struct bfq_weight_counter {
-+	unsigned int weight; /* weight of the entities this counter refers to */
-+	unsigned int num_active; /* nr of active entities with this weight */
-+	/*
-+	 * Weights tree member (see bfq_data's @queue_weights_tree and
-+	 * @group_weights_tree)
-+	 */
-+	struct rb_node weights_node;
-+};
-+
-+/**
-+ * struct bfq_entity - schedulable entity.
-+ *
-+ * A bfq_entity is used to represent either a bfq_queue (leaf node in the
-+ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
-+ * entity belongs to the sched_data of the parent group in the cgroup
-+ * hierarchy. Non-leaf entities have also their own sched_data, stored
-+ * in @my_sched_data.
-+ *
-+ * Each entity stores independently its priority values; this would
-+ * allow different weights on different devices, but this
-+ * functionality is not exported to userspace by now. Priorities and
-+ * weights are updated lazily, first storing the new values into the
-+ * new_* fields, then setting the @prio_changed flag. As soon as
-+ * there is a transition in the entity state that allows the priority
-+ * update to take place the effective and the requested priority
-+ * values are synchronized.
-+ *
-+ * Unless cgroups are used, the weight value is calculated from the
-+ * ioprio to export the same interface as CFQ.
When dealing with -+ * ``well-behaved'' queues (i.e., queues that do not spend too much -+ * time to consume their budget and have true sequential behavior, and -+ * when there are no external factors breaking anticipation) the -+ * relative weights at each level of the cgroups hierarchy should be -+ * guaranteed. All the fields are protected by the queue lock of the -+ * containing bfqd. -+ */ -+struct bfq_entity { -+ struct rb_node rb_node; /* service_tree member */ -+ /* pointer to the weight counter associated with this entity */ -+ struct bfq_weight_counter *weight_counter; -+ -+ /* -+ * Flag, true if the entity is on a tree (either the active or -+ * the idle one of its service_tree) or is in service. -+ */ -+ bool on_st; -+ -+ u64 finish; /* B-WF2Q+ finish timestamp (aka F_i) */ -+ u64 start; /* B-WF2Q+ start timestamp (aka S_i) */ -+ -+ /* tree the entity is enqueued into; %NULL if not on a tree */ -+ struct rb_root *tree; -+ -+ /* -+ * minimum start time of the (active) subtree rooted at this -+ * entity; used for O(log N) lookups into active trees -+ */ -+ u64 min_start; -+ -+ /* amount of service received during the last service slot */ -+ int service; -+ -+ /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */ -+ int budget; -+ -+ unsigned int weight; /* weight of the queue */ -+ unsigned int new_weight; /* next weight if a change is in progress */ -+ -+ /* original weight, used to implement weight boosting */ -+ unsigned int orig_weight; -+ -+ /* parent entity, for hierarchical scheduling */ -+ struct bfq_entity *parent; -+ -+ /* -+ * For non-leaf nodes in the hierarchy, the associated -+ * scheduler queue, %NULL on leaf nodes. 
-+ */ -+ struct bfq_sched_data *my_sched_data; -+ /* the scheduler queue this entity belongs to */ -+ struct bfq_sched_data *sched_data; -+ -+ /* flag, set to request a weight, ioprio or ioprio_class change */ -+ int prio_changed; -+}; -+ -+struct bfq_group; -+ -+/** -+ * struct bfq_queue - leaf schedulable entity. -+ * -+ * A bfq_queue is a leaf request queue; it can be associated with an -+ * io_context or more, if it is async or shared between cooperating -+ * processes. @cgroup holds a reference to the cgroup, to be sure that it -+ * does not disappear while a bfqq still references it (mostly to avoid -+ * races between request issuing and task migration followed by cgroup -+ * destruction). -+ * All the fields are protected by the queue lock of the containing bfqd. -+ */ -+struct bfq_queue { -+ /* reference counter */ -+ int ref; -+ /* parent bfq_data */ -+ struct bfq_data *bfqd; -+ -+ /* current ioprio and ioprio class */ -+ unsigned short ioprio, ioprio_class; -+ /* next ioprio and ioprio class if a change is in progress */ -+ unsigned short new_ioprio, new_ioprio_class; -+ -+ /* -+ * Shared bfq_queue if queue is cooperating with one or more -+ * other queues. 
-+ */ -+ struct bfq_queue *new_bfqq; -+ /* request-position tree member (see bfq_group's @rq_pos_tree) */ -+ struct rb_node pos_node; -+ /* request-position tree root (see bfq_group's @rq_pos_tree) */ -+ struct rb_root *pos_root; -+ -+ /* sorted list of pending requests */ -+ struct rb_root sort_list; -+ /* if fifo isn't expired, next request to serve */ -+ struct request *next_rq; -+ /* number of sync and async requests queued */ -+ int queued[2]; -+ /* number of sync and async requests currently allocated */ -+ int allocated[2]; -+ /* number of pending metadata requests */ -+ int meta_pending; -+ /* fifo list of requests in sort_list */ -+ struct list_head fifo; -+ -+ /* entity representing this queue in the scheduler */ -+ struct bfq_entity entity; -+ -+ /* maximum budget allowed from the feedback mechanism */ -+ int max_budget; -+ /* budget expiration (in jiffies) */ -+ unsigned long budget_timeout; -+ -+ /* number of requests on the dispatch list or inside driver */ -+ int dispatched; -+ -+ unsigned int flags; /* status flags.*/ -+ -+ /* node for active/idle bfqq list inside parent bfqd */ -+ struct list_head bfqq_list; -+ -+ /* bit vector: a 1 for each seeky requests in history */ -+ u32 seek_history; -+ -+ /* node for the device's burst list */ -+ struct hlist_node burst_list_node; -+ -+ /* position of the last request enqueued */ -+ sector_t last_request_pos; -+ -+ /* Number of consecutive pairs of request completion and -+ * arrival, such that the queue becomes idle after the -+ * completion, but the next request arrives within an idle -+ * time slice; used only if the queue's IO_bound flag has been -+ * cleared. -+ */ -+ unsigned int requests_within_timer; -+ -+ /* pid of the process owning the queue, used for logging purposes */ -+ pid_t pid; -+ -+ /* -+ * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL -+ * if the queue is shared. 
-+ */ -+ struct bfq_io_cq *bic; -+ -+ /* current maximum weight-raising time for this queue */ -+ unsigned long wr_cur_max_time; -+ /* -+ * Minimum time instant such that, only if a new request is -+ * enqueued after this time instant in an idle @bfq_queue with -+ * no outstanding requests, then the task associated with the -+ * queue it is deemed as soft real-time (see the comments on -+ * the function bfq_bfqq_softrt_next_start()) -+ */ -+ unsigned long soft_rt_next_start; -+ /* -+ * Start time of the current weight-raising period if -+ * the @bfq-queue is being weight-raised, otherwise -+ * finish time of the last weight-raising period. -+ */ -+ unsigned long last_wr_start_finish; -+ /* factor by which the weight of this queue is multiplied */ -+ unsigned int wr_coeff; -+ /* -+ * Time of the last transition of the @bfq_queue from idle to -+ * backlogged. -+ */ -+ unsigned long last_idle_bklogged; -+ /* -+ * Cumulative service received from the @bfq_queue since the -+ * last transition from idle to backlogged. -+ */ -+ unsigned long service_from_backlogged; -+ /* -+ * Value of wr start time when switching to soft rt -+ */ -+ unsigned long wr_start_at_switch_to_srt; -+ -+ unsigned long split_time; /* time of last split */ -+}; -+ -+/** -+ * struct bfq_ttime - per process thinktime stats. -+ */ -+struct bfq_ttime { -+ u64 last_end_request; /* completion time of last request */ -+ -+ u64 ttime_total; /* total process thinktime */ -+ unsigned long ttime_samples; /* number of thinktime samples */ -+ u64 ttime_mean; /* average process thinktime */ -+ -+}; -+ -+/** -+ * struct bfq_io_cq - per (request_queue, io_context) structure. 
-+ */ -+struct bfq_io_cq { -+ /* associated io_cq structure */ -+ struct io_cq icq; /* must be the first member */ -+ /* array of two process queues, the sync and the async */ -+ struct bfq_queue *bfqq[2]; -+ /* associated @bfq_ttime struct */ -+ struct bfq_ttime ttime; -+ /* per (request_queue, blkcg) ioprio */ -+ int ioprio; -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ uint64_t blkcg_serial_nr; /* the current blkcg serial */ -+#endif -+ -+ /* -+ * Snapshot of the has_short_time flag before merging; taken -+ * to remember its value while the queue is merged, so as to -+ * be able to restore it in case of split. -+ */ -+ bool saved_has_short_ttime; -+ /* -+ * Same purpose as the previous two fields for the I/O bound -+ * classification of a queue. -+ */ -+ bool saved_IO_bound; -+ -+ /* -+ * Same purpose as the previous fields for the value of the -+ * field keeping the queue's belonging to a large burst -+ */ -+ bool saved_in_large_burst; -+ /* -+ * True if the queue belonged to a burst list before its merge -+ * with another cooperating queue. -+ */ -+ bool was_in_burst_list; -+ -+ /* -+ * Similar to previous fields: save wr information. -+ */ -+ unsigned long saved_wr_coeff; -+ unsigned long saved_last_wr_start_finish; -+ unsigned long saved_wr_start_at_switch_to_srt; -+ unsigned int saved_wr_cur_max_time; -+}; -+ -+enum bfq_device_speed { -+ BFQ_BFQD_FAST, -+ BFQ_BFQD_SLOW, -+}; -+ -+/** -+ * struct bfq_data - per-device data structure. -+ * -+ * All the fields are protected by the @queue lock. -+ */ -+struct bfq_data { -+ /* request queue for the device */ -+ struct request_queue *queue; -+ -+ /* root bfq_group for the device */ -+ struct bfq_group *root_group; -+ -+ /* -+ * rbtree of weight counters of @bfq_queues, sorted by -+ * weight. Used to keep track of whether all @bfq_queues have -+ * the same weight. 
The tree contains one counter for each -+ * distinct weight associated to some active and not -+ * weight-raised @bfq_queue (see the comments to the functions -+ * bfq_weights_tree_[add|remove] for further details). -+ */ -+ struct rb_root queue_weights_tree; -+ /* -+ * rbtree of non-queue @bfq_entity weight counters, sorted by -+ * weight. Used to keep track of whether all @bfq_groups have -+ * the same weight. The tree contains one counter for each -+ * distinct weight associated to some active @bfq_group (see -+ * the comments to the functions bfq_weights_tree_[add|remove] -+ * for further details). -+ */ -+ struct rb_root group_weights_tree; -+ -+ /* -+ * Number of bfq_queues containing requests (including the -+ * queue in service, even if it is idling). -+ */ -+ int busy_queues; -+ /* number of weight-raised busy @bfq_queues */ -+ int wr_busy_queues; -+ /* number of queued requests */ -+ int queued; -+ /* number of requests dispatched and waiting for completion */ -+ int rq_in_driver; -+ -+ /* -+ * Maximum number of requests in driver in the last -+ * @hw_tag_samples completed requests. -+ */ -+ int max_rq_in_driver; -+ /* number of samples used to calculate hw_tag */ -+ int hw_tag_samples; -+ /* flag set to one if the driver is showing a queueing behavior */ -+ int hw_tag; -+ -+ /* number of budgets assigned */ -+ int budgets_assigned; -+ -+ /* -+ * Timer set when idling (waiting) for the next request from -+ * the queue in service. 
-+ */ -+ struct hrtimer idle_slice_timer; -+ /* delayed work to restart dispatching on the request queue */ -+ struct work_struct unplug_work; -+ -+ /* bfq_queue in service */ -+ struct bfq_queue *in_service_queue; -+ /* bfq_io_cq (bic) associated with the @in_service_queue */ -+ struct bfq_io_cq *in_service_bic; -+ -+ /* on-disk position of the last served request */ -+ sector_t last_position; -+ -+ /* time of last request completion (ns) */ -+ u64 last_completion; -+ -+ /* time of first rq dispatch in current observation interval (ns) */ -+ u64 first_dispatch; -+ /* time of last rq dispatch in current observation interval (ns) */ -+ u64 last_dispatch; -+ -+ /* beginning of the last budget */ -+ ktime_t last_budget_start; -+ /* beginning of the last idle slice */ -+ ktime_t last_idling_start; -+ -+ /* number of samples in current observation interval */ -+ int peak_rate_samples; -+ /* num of samples of seq dispatches in current observation interval */ -+ u32 sequential_samples; -+ /* total num of sectors transferred in current observation interval */ -+ u64 tot_sectors_dispatched; -+ /* max rq size seen during current observation interval (sectors) */ -+ u32 last_rq_max_size; -+ /* time elapsed from first dispatch in current observ. interval (us) */ -+ u64 delta_from_first; -+ /* current estimate of device peak rate */ -+ u32 peak_rate; -+ -+ /* maximum budget allotted to a bfq_queue before rescheduling */ -+ int bfq_max_budget; -+ -+ /* list of all the bfq_queues active on the device */ -+ struct list_head active_list; -+ /* list of all the bfq_queues idle on the device */ -+ struct list_head idle_list; -+ -+ /* -+ * Timeout for async/sync requests; when it fires, requests -+ * are served in fifo order. 
-+ */ -+ u64 bfq_fifo_expire[2]; -+ /* weight of backward seeks wrt forward ones */ -+ unsigned int bfq_back_penalty; -+ /* maximum allowed backward seek */ -+ unsigned int bfq_back_max; -+ /* maximum idling time */ -+ u32 bfq_slice_idle; -+ -+ /* user-configured max budget value (0 for auto-tuning) */ -+ int bfq_user_max_budget; -+ /* -+ * Timeout for bfq_queues to consume their budget; used to -+ * prevent seeky queues from imposing long latencies to -+ * sequential or quasi-sequential ones (this also implies that -+ * seeky queues cannot receive guarantees in the service -+ * domain; after a timeout they are charged for the time they -+ * have been in service, to preserve fairness among them, but -+ * without service-domain guarantees). -+ */ -+ unsigned int bfq_timeout; -+ -+ /* -+ * Number of consecutive requests that must be issued within -+ * the idle time slice to set again idling to a queue which -+ * was marked as non-I/O-bound (see the definition of the -+ * IO_bound flag for further details). -+ */ -+ unsigned int bfq_requests_within_timer; -+ -+ /* -+ * Force device idling whenever needed to provide accurate -+ * service guarantees, without caring about throughput -+ * issues. CAVEAT: this may even increase latencies, in case -+ * of useless idling for processes that did stop doing I/O. -+ */ -+ bool strict_guarantees; -+ -+ /* -+ * Last time at which a queue entered the current burst of -+ * queues being activated shortly after each other; for more -+ * details about this and the following parameters related to -+ * a burst of activations, see the comments on the function -+ * bfq_handle_burst. -+ */ -+ unsigned long last_ins_in_burst; -+ /* -+ * Reference time interval used to decide whether a queue has -+ * been activated shortly after @last_ins_in_burst. 
-+ */ -+ unsigned long bfq_burst_interval; -+ /* number of queues in the current burst of queue activations */ -+ int burst_size; -+ -+ /* common parent entity for the queues in the burst */ -+ struct bfq_entity *burst_parent_entity; -+ /* Maximum burst size above which the current queue-activation -+ * burst is deemed as 'large'. -+ */ -+ unsigned long bfq_large_burst_thresh; -+ /* true if a large queue-activation burst is in progress */ -+ bool large_burst; -+ /* -+ * Head of the burst list (as for the above fields, more -+ * details in the comments on the function bfq_handle_burst). -+ */ -+ struct hlist_head burst_list; -+ -+ /* if set to true, low-latency heuristics are enabled */ -+ bool low_latency; -+ /* -+ * Maximum factor by which the weight of a weight-raised queue -+ * is multiplied. -+ */ -+ unsigned int bfq_wr_coeff; -+ /* maximum duration of a weight-raising period (jiffies) */ -+ unsigned int bfq_wr_max_time; -+ -+ /* Maximum weight-raising duration for soft real-time processes */ -+ unsigned int bfq_wr_rt_max_time; -+ /* -+ * Minimum idle period after which weight-raising may be -+ * reactivated for a queue (in jiffies). -+ */ -+ unsigned int bfq_wr_min_idle_time; -+ /* -+ * Minimum period between request arrivals after which -+ * weight-raising may be reactivated for an already busy async -+ * queue (in jiffies). -+ */ -+ unsigned long bfq_wr_min_inter_arr_async; -+ -+ /* Max service-rate for a soft real-time queue, in sectors/sec */ -+ unsigned int bfq_wr_max_softrt_rate; -+ /* -+ * Cached value of the product R*T, used for computing the -+ * maximum duration of weight raising automatically. 
-+ */ -+ u64 RT_prod; -+ /* device-speed class for the low-latency heuristic */ -+ enum bfq_device_speed device_speed; -+ -+ /* fallback dummy bfqq for extreme OOM conditions */ -+ struct bfq_queue oom_bfqq; -+}; -+ -+enum bfqq_state_flags { -+ BFQ_BFQQ_FLAG_just_created = 0, /* queue just allocated */ -+ BFQ_BFQQ_FLAG_busy, /* has requests or is in service */ -+ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ -+ BFQ_BFQQ_FLAG_non_blocking_wait_rq, /* -+ * waiting for a request -+ * without idling the device -+ */ -+ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ -+ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ -+ BFQ_BFQQ_FLAG_has_short_ttime, /* queue has a short think time */ -+ BFQ_BFQQ_FLAG_sync, /* synchronous queue */ -+ BFQ_BFQQ_FLAG_IO_bound, /* -+ * bfqq has timed-out at least once -+ * having consumed at most 2/10 of -+ * its budget -+ */ -+ BFQ_BFQQ_FLAG_in_large_burst, /* -+ * bfqq activated in a large burst, -+ * see comments to bfq_handle_burst. 
-+ */ -+ BFQ_BFQQ_FLAG_softrt_update, /* -+ * may need softrt-next-start -+ * update -+ */ -+ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ -+ BFQ_BFQQ_FLAG_split_coop /* shared bfqq will be split */ -+}; -+ -+#define BFQ_BFQQ_FNS(name) \ -+static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ -+{ \ -+ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \ -+} \ -+static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ -+{ \ -+ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \ -+} \ -+static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ -+{ \ -+ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ -+} -+ -+BFQ_BFQQ_FNS(just_created); -+BFQ_BFQQ_FNS(busy); -+BFQ_BFQQ_FNS(wait_request); -+BFQ_BFQQ_FNS(non_blocking_wait_rq); -+BFQ_BFQQ_FNS(must_alloc); -+BFQ_BFQQ_FNS(fifo_expire); -+BFQ_BFQQ_FNS(has_short_ttime); -+BFQ_BFQQ_FNS(sync); -+BFQ_BFQQ_FNS(IO_bound); -+BFQ_BFQQ_FNS(in_large_burst); -+BFQ_BFQQ_FNS(coop); -+BFQ_BFQQ_FNS(split_coop); -+BFQ_BFQQ_FNS(softrt_update); -+#undef BFQ_BFQQ_FNS -+ -+/* Logging facilities. */ -+#ifdef CONFIG_BFQ_REDIRECT_TO_CONSOLE -+ -+static const char *checked_dev_name(const struct device *dev) -+{ -+ static const char nodev[] = "nodev"; -+ -+ if (dev) -+ return dev_name(dev); -+ -+ return nodev; -+} -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); -+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ -+ char __pbuf[128]; \ -+ \ -+ assert_spin_locked((bfqd)->queue->queue_lock); \ -+ blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ -+ pr_crit("%s bfq%d%c %s " fmt "\n", \ -+ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -+ (bfqq)->pid, \ -+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ __pbuf, ##args); \ -+} while (0) -+ -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) 
do { \ -+ char __pbuf[128]; \ -+ \ -+ blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ -+ pr_crit("%s %s " fmt "\n", \ -+ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -+ __pbuf, ##args); \ -+} while (0) -+ -+#else /* BFQ_GROUP_IOSCHED_ENABLED */ -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ -+ pr_crit("%s bfq%d%c " fmt "\n", \ -+ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -+ (bfqq)->pid, bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ ##args) -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) -+ -+#endif /* BFQ_GROUP_IOSCHED_ENABLED */ -+ -+#define bfq_log(bfqd, fmt, args...) \ -+ pr_crit("%s bfq " fmt "\n", \ -+ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -+ ##args) -+ -+#else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); -+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ -+ char __pbuf[128]; \ -+ \ -+ assert_spin_locked((bfqd)->queue->queue_lock); \ -+ blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ -+ blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, \ -+ (bfqq)->pid, \ -+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ __pbuf, ##args); \ -+} while (0) -+ -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ -+ char __pbuf[128]; \ -+ \ -+ blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ -+ blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \ -+} while (0) -+ -+#else /* BFQ_GROUP_IOSCHED_ENABLED */ -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ -+ blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \ -+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ ##args) -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) -+ -+#endif /* BFQ_GROUP_IOSCHED_ENABLED */ -+ -+#define bfq_log(bfqd, fmt, args...) 
\ -+ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) -+#endif /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ -+ -+/* Expiration reasons. */ -+enum bfqq_expiration { -+ BFQ_BFQQ_TOO_IDLE = 0, /* -+ * queue has been idling for -+ * too long -+ */ -+ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ -+ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ -+ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ -+ BFQ_BFQQ_PREEMPTED /* preemption in progress */ -+}; -+ -+ -+struct bfqg_stats { -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ /* number of ios merged */ -+ struct blkg_rwstat merged; -+ /* total time spent on device in ns, may not be accurate w/ queueing */ -+ struct blkg_rwstat service_time; -+ /* total time spent waiting in scheduler queue in ns */ -+ struct blkg_rwstat wait_time; -+ /* number of IOs queued up */ -+ struct blkg_rwstat queued; -+ /* total disk time and nr sectors dispatched by this group */ -+ struct blkg_stat time; -+ /* sum of number of ios queued across all samples */ -+ struct blkg_stat avg_queue_size_sum; -+ /* count of samples taken for average */ -+ struct blkg_stat avg_queue_size_samples; -+ /* how many times this group has been removed from service tree */ -+ struct blkg_stat dequeue; -+ /* total time spent waiting for it to be assigned a timeslice. */ -+ struct blkg_stat group_wait_time; -+ /* time spent idling for this blkcg_gq */ -+ struct blkg_stat idle_time; -+ /* total time with empty current active q with other requests queued */ -+ struct blkg_stat empty_time; -+ /* fields after this shouldn't be cleared on stat reset */ -+ uint64_t start_group_wait_time; -+ uint64_t start_idle_time; -+ uint64_t start_empty_time; -+ uint16_t flags; -+#endif -+}; -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+/* -+ * struct bfq_group_data - per-blkcg storage for the blkio subsystem. 
-+ * -+ * @ps: @blkcg_policy_storage that this structure inherits -+ * @weight: weight of the bfq_group -+ */ -+struct bfq_group_data { -+ /* must be the first member */ -+ struct blkcg_policy_data pd; -+ -+ unsigned int weight; -+}; -+ -+/** -+ * struct bfq_group - per (device, cgroup) data structure. -+ * @entity: schedulable entity to insert into the parent group sched_data. -+ * @sched_data: own sched_data, to contain child entities (they may be -+ * both bfq_queues and bfq_groups). -+ * @bfqd: the bfq_data for the device this group acts upon. -+ * @async_bfqq: array of async queues for all the tasks belonging to -+ * the group, one queue per ioprio value per ioprio_class, -+ * except for the idle class that has only one queue. -+ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). -+ * @my_entity: pointer to @entity, %NULL for the toplevel group; used -+ * to avoid too many special cases during group creation/ -+ * migration. -+ * @active_entities: number of active entities belonging to the group; -+ * unused for the root group. Used to know whether there -+ * are groups with more than one active @bfq_entity -+ * (see the comments to the function -+ * bfq_bfqq_may_idle()). -+ * @rq_pos_tree: rbtree sorted by next_request position, used when -+ * determining if two or more queues have interleaving -+ * requests (see bfq_find_close_cooperator()). -+ * -+ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup -+ * there is a set of bfq_groups, each one collecting the lower-level -+ * entities belonging to the group that are acting on the same device. -+ * -+ * Locking works as follows: -+ * o @bfqd is protected by the queue lock, RCU is used to access it -+ * from the readers. -+ * o All the other fields are protected by the @bfqd queue lock. 
-+ */ -+struct bfq_group { -+ /* must be the first member */ -+ struct blkg_policy_data pd; -+ -+ struct bfq_entity entity; -+ struct bfq_sched_data sched_data; -+ -+ void *bfqd; -+ -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; -+ struct bfq_queue *async_idle_bfqq; -+ -+ struct bfq_entity *my_entity; -+ -+ int active_entities; -+ -+ struct rb_root rq_pos_tree; -+ -+ struct bfqg_stats stats; -+}; -+ -+#else -+struct bfq_group { -+ struct bfq_sched_data sched_data; -+ -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; -+ struct bfq_queue *async_idle_bfqq; -+ -+ struct rb_root rq_pos_tree; -+}; -+#endif -+ -+static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); -+ -+static unsigned int bfq_class_idx(struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ return bfqq ? bfqq->ioprio_class - 1 : -+ BFQ_DEFAULT_GRP_CLASS - 1; -+} -+ -+static struct bfq_service_tree * -+bfq_entity_service_tree(struct bfq_entity *entity) -+{ -+ struct bfq_sched_data *sched_data = entity->sched_data; -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ unsigned int idx = bfq_class_idx(entity); -+ -+ BUG_ON(idx >= BFQ_IOPRIO_CLASSES); -+ BUG_ON(sched_data == NULL); -+ -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "entity_service_tree %p %d", -+ sched_data->service_tree + idx, idx); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "entity_service_tree %p %d", -+ sched_data->service_tree + idx, idx); -+ } -+#endif -+ return sched_data->service_tree + idx; -+} -+ -+static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) -+{ -+ return bic->bfqq[is_sync]; -+} -+ -+static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, -+ bool is_sync) -+{ -+ bic->bfqq[is_sync] = bfqq; -+} -+ -+static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) -+{ -+ return 
bic->icq.q->elevator->elevator_data; -+} -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ -+static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *group_entity = bfqq->entity.parent; -+ -+ if (!group_entity) -+ group_entity = &bfqq->bfqd->root_group->entity; -+ -+ return container_of(group_entity, struct bfq_group, entity); -+} -+ -+#else -+ -+static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) -+{ -+ return bfqq->bfqd->root_group; -+} -+ -+#endif -+ -+static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); -+static void bfq_put_queue(struct bfq_queue *bfqq); -+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); -+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, -+ struct bio *bio, bool is_sync, -+ struct bfq_io_cq *bic); -+static void bfq_end_wr_async_queues(struct bfq_data *bfqd, -+ struct bfq_group *bfqg); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); -+#endif -+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); -+ -+#endif /* _BFQ_H */ - -From 0bd96428e086fd28800efdf5f0a5f62869af6e30 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Sat, 21 Jan 2017 12:41:14 +0100 -Subject: [PATCH 10/51] Move thinktime from bic to bfqq - -Prep change to make it possible to protect this field with a -scheduler lock. 
- -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 28 ++++++++++++++-------------- - block/bfq-mq.h | 30 ++++++++++++++++-------------- - 2 files changed, 30 insertions(+), 28 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index d1125aee658c..65f5dfb79417 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -698,6 +698,7 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, - if (unlikely(busy)) - old_wr_coeff = bfqq->wr_coeff; - -+ bfqq->ttime = bic->saved_ttime; - bfqq->wr_coeff = bic->saved_wr_coeff; - bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; - BUG_ON(time_is_after_jiffies(bfqq->wr_start_at_switch_to_srt)); -@@ -1287,7 +1288,7 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, - * details on the usage of the next variable. - */ - arrived_in_time = ktime_get_ns() <= -- RQ_BIC(rq)->ttime.last_end_request + -+ bfqq->ttime.last_end_request + - bfqd->bfq_slice_idle * 3; - - bfq_log_bfqq(bfqd, bfqq, -@@ -2048,6 +2049,7 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) - if (!bic) - return; - -+ bic->saved_ttime = bfqq->ttime; - bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq); - bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); - bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); -@@ -3948,11 +3950,6 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) - bfq_put_queue(bfqq); /* release process reference */ - } - --static void bfq_init_icq(struct io_cq *icq) --{ -- icq_to_bic(icq)->ttime.last_end_request = ktime_get_ns() - (1ULL<<32); --} -- - static void bfq_exit_icq(struct io_cq *icq) - { - struct bfq_io_cq *bic = icq_to_bic(icq); -@@ -4084,6 +4081,9 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bfq_mark_bfqq_just_created(bfqq); - } else - bfq_clear_bfqq_sync(bfqq); -+ -+ bfqq->ttime.last_end_request = ktime_get_ns() - (1ULL<<32); -+ 
- bfq_mark_bfqq_IO_bound(bfqq); - - /* Tentative initial value to trade off between thr and lat */ -@@ -4191,14 +4191,14 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, - } - - static void bfq_update_io_thinktime(struct bfq_data *bfqd, -- struct bfq_io_cq *bic) -+ struct bfq_queue *bfqq) - { -- struct bfq_ttime *ttime = &bic->ttime; -- u64 elapsed = ktime_get_ns() - bic->ttime.last_end_request; -+ struct bfq_ttime *ttime = &bfqq->ttime; -+ u64 elapsed = ktime_get_ns() - bfqq->ttime.last_end_request; - - elapsed = min_t(u64, elapsed, 2 * bfqd->bfq_slice_idle); - -- ttime->ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; -+ ttime->ttime_samples = (7*bfqq->ttime.ttime_samples + 256) / 8; - ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8); - ttime->ttime_mean = div64_ul(ttime->ttime_total + 128, - ttime->ttime_samples); -@@ -4240,8 +4240,8 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd, - * decide whether to mark as has_short_ttime - */ - if (atomic_read(&bic->icq.ioc->active_ref) == 0 || -- (bfq_sample_valid(bic->ttime.ttime_samples) && -- bic->ttime.ttime_mean > bfqd->bfq_slice_idle)) -+ (bfq_sample_valid(bfqq->ttime.ttime_samples) && -+ bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle)) - has_short_ttime = false; - - bfq_log_bfqq(bfqd, bfqq, "update_has_short_ttime: has_short_ttime %d", -@@ -4265,7 +4265,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, - if (rq->cmd_flags & REQ_META) - bfqq->meta_pending++; - -- bfq_update_io_thinktime(bfqd, bic); -+ bfq_update_io_thinktime(bfqd, bfqq); - bfq_update_has_short_ttime(bfqd, bfqq, bic); - bfq_update_io_seektime(bfqd, bfqq, rq); - -@@ -4436,7 +4436,7 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) - - now_ns = ktime_get_ns(); - -- RQ_BIC(rq)->ttime.last_end_request = now_ns; -+ bfqq->ttime.last_end_request = now_ns; - - /* - * Using us instead of ns, to get a reasonable precision in -diff --git 
a/block/bfq-mq.h b/block/bfq-mq.h -index 53954d1b87f8..0f51f270469c 100644 ---- a/block/bfq-mq.h -+++ b/block/bfq-mq.h -@@ -210,6 +210,18 @@ struct bfq_entity { - struct bfq_group; - - /** -+ * struct bfq_ttime - per process thinktime stats. -+ */ -+struct bfq_ttime { -+ u64 last_end_request; /* completion time of last request */ -+ -+ u64 ttime_total; /* total process thinktime */ -+ unsigned long ttime_samples; /* number of thinktime samples */ -+ u64 ttime_mean; /* average process thinktime */ -+ -+}; -+ -+/** - * struct bfq_queue - leaf schedulable entity. - * - * A bfq_queue is a leaf request queue; it can be associated with an -@@ -270,6 +282,9 @@ struct bfq_queue { - /* node for active/idle bfqq list inside parent bfqd */ - struct list_head bfqq_list; - -+ /* associated @bfq_ttime struct */ -+ struct bfq_ttime ttime; -+ - /* bit vector: a 1 for each seeky requests in history */ - u32 seek_history; - -@@ -333,18 +348,6 @@ struct bfq_queue { - }; - - /** -- * struct bfq_ttime - per process thinktime stats. -- */ --struct bfq_ttime { -- u64 last_end_request; /* completion time of last request */ -- -- u64 ttime_total; /* total process thinktime */ -- unsigned long ttime_samples; /* number of thinktime samples */ -- u64 ttime_mean; /* average process thinktime */ -- --}; -- --/** - * struct bfq_io_cq - per (request_queue, io_context) structure. 
- */ - struct bfq_io_cq { -@@ -352,8 +355,6 @@ struct bfq_io_cq { - struct io_cq icq; /* must be the first member */ - /* array of two process queues, the sync and the async */ - struct bfq_queue *bfqq[2]; -- /* associated @bfq_ttime struct */ -- struct bfq_ttime ttime; - /* per (request_queue, blkcg) ioprio */ - int ioprio; - #ifdef BFQ_GROUP_IOSCHED_ENABLED -@@ -390,6 +391,7 @@ struct bfq_io_cq { - unsigned long saved_last_wr_start_finish; - unsigned long saved_wr_start_at_switch_to_srt; - unsigned int saved_wr_cur_max_time; -+ struct bfq_ttime saved_ttime; - }; - - enum bfq_device_speed { - -From 351a9aea7c0c9c30edacdbf2a3c0d089470de1e8 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Wed, 18 Jan 2017 11:42:22 +0100 -Subject: [PATCH 11/51] Embed bfq-ioc.c and add locking on request queue - -The version of bfq-ioc.c for bfq-iosched.c is not correct any more for -bfq-mq, because, in bfq-mq, the request queue lock is not being held -when bfq_bic_lookup is invoked. That function must then take that lock -on its own. This commit removes the inclusion of bfq-ioc.c, copies the -content of bfq-ioc.c into bfq-mq-iosched.c, and adds the grabbing of -the lock. - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 39 ++++++++++++++++++++++++++++++++++++--- - 1 file changed, 36 insertions(+), 3 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 65f5dfb79417..756a618d5902 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -195,7 +195,39 @@ static int device_speed_thresh[2]; - - static void bfq_schedule_dispatch(struct bfq_data *bfqd); - --#include "bfq-ioc.c" -+/** -+ * icq_to_bic - convert iocontext queue structure to bfq_io_cq. -+ * @icq: the iocontext queue. 
-+ */ -+static struct bfq_io_cq *icq_to_bic(struct io_cq *icq) -+{ -+ /* bic->icq is the first member, %NULL will convert to %NULL */ -+ return container_of(icq, struct bfq_io_cq, icq); -+} -+ -+/** -+ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd. -+ * @bfqd: the lookup key. -+ * @ioc: the io_context of the process doing I/O. -+ * @q: the request queue. -+ */ -+static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, -+ struct io_context *ioc, -+ struct request_queue *q) -+{ -+ if (ioc) { -+ struct bfq_io_cq *icq; -+ -+ spin_lock_irq(q->queue_lock); -+ icq = icq_to_bic(ioc_lookup_icq(ioc, q)); -+ spin_unlock_irq(q->queue_lock); -+ -+ return icq; -+ } -+ -+ return NULL; -+} -+ - #include "bfq-sched.c" - #include "bfq-cgroup-included.c" - -@@ -1520,13 +1552,14 @@ static void bfq_add_request(struct request *rq) - } - - static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, -- struct bio *bio) -+ struct bio *bio, -+ struct request_queue *q) - { - struct task_struct *tsk = current; - struct bfq_io_cq *bic; - struct bfq_queue *bfqq; - -- bic = bfq_bic_lookup(bfqd, tsk->io_context); -+ bic = bfq_bic_lookup(bfqd, tsk->io_context, q); - if (!bic) - return NULL; - - -From ed0d64e27b2308813a2a846139e405e0479f0849 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Tue, 20 Dec 2016 09:07:19 +0100 -Subject: [PATCH 12/51] Modify interface and operation to comply with - blk-mq-sched - -As for modifications of the operation, the major changes are the introduction -of a scheduler lock, and the moving to deferred work of the body of the hook -exit_icq. 
The latter change has been made to avoid deadlocks caused by the -combination of the following facts: 1) such a body takes the scheduler lock, -and, if not deferred, 2) it does so from inside the exit_icq hook, which is -invoked with the queue lock held, and 3) there is at least one code path, -namely that starting from bfq_bio_merge, which takes these locks in the -opposite order. - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-cgroup-included.c | 4 - - block/bfq-mq-iosched.c | 695 ++++++++++++++++++++++++-------------------- - block/bfq-mq.h | 35 +-- - 3 files changed, 394 insertions(+), 340 deletions(-) - -diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c -index 9c483b658179..8a73de76f32b 100644 ---- a/block/bfq-cgroup-included.c -+++ b/block/bfq-cgroup-included.c -@@ -472,8 +472,6 @@ static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, - struct bfq_group *bfqg, *parent; - struct bfq_entity *entity; - -- assert_spin_locked(bfqd->queue->queue_lock); -- - bfqg = bfq_lookup_bfqg(bfqd, blkcg); - - if (unlikely(!bfqg)) -@@ -602,8 +600,6 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, - struct bfq_group *bfqg; - struct bfq_entity *entity; - -- lockdep_assert_held(bfqd->queue->queue_lock); -- - bfqg = bfq_find_set_group(bfqd, blkcg); - - if (unlikely(!bfqg)) -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 756a618d5902..c963d92a32c2 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -81,7 +81,13 @@ - #include <linux/jiffies.h> - #include <linux/rbtree.h> - #include <linux/ioprio.h> -+#include <linux/sbitmap.h> -+#include <linux/delay.h> -+ - #include "blk.h" -+#include "blk-mq.h" -+#include "blk-mq-tag.h" -+#include "blk-mq-sched.h" - #undef CONFIG_BFQ_GROUP_IOSCHED /* cgroups support not yet functional */ - #include "bfq-mq.h" - -@@ -193,8 +199,6 @@ static int device_speed_thresh[2]; - #define RQ_BIC(rq) ((struct bfq_io_cq *) 
(rq)->elv.priv[0]) - #define RQ_BFQQ(rq) ((rq)->elv.priv[1]) - --static void bfq_schedule_dispatch(struct bfq_data *bfqd); -- - /** - * icq_to_bic - convert iocontext queue structure to bfq_io_cq. - * @icq: the iocontext queue. -@@ -216,11 +220,12 @@ static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, - struct request_queue *q) - { - if (ioc) { -+ unsigned long flags; - struct bfq_io_cq *icq; - -- spin_lock_irq(q->queue_lock); -+ spin_lock_irqsave(q->queue_lock, flags); - icq = icq_to_bic(ioc_lookup_icq(ioc, q)); -- spin_unlock_irq(q->queue_lock); -+ spin_unlock_irqrestore(q->queue_lock, flags); - - return icq; - } -@@ -244,7 +249,7 @@ static void bfq_schedule_dispatch(struct bfq_data *bfqd) - { - if (bfqd->queued != 0) { - bfq_log(bfqd, "schedule dispatch"); -- kblockd_schedule_work(&bfqd->unplug_work); -+ blk_mq_run_hw_queues(bfqd->queue, true); - } - } - -@@ -768,9 +773,7 @@ static int bfqq_process_refs(struct bfq_queue *bfqq) - { - int process_refs, io_refs; - -- lockdep_assert_held(bfqq->bfqd->queue->queue_lock); -- -- io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; -+ io_refs = bfqq->allocated; - process_refs = bfqq->ref - io_refs - bfqq->entity.on_st; - BUG_ON(process_refs < 0); - return process_refs; -@@ -1584,6 +1587,7 @@ static sector_t get_sdist(sector_t last_pos, struct request *rq) - return sdist; - } - -+#if 0 /* Still not clear if we can do without next two functions */ - static void bfq_activate_request(struct request_queue *q, struct request *rq) - { - struct bfq_data *bfqd = q->elevator->elevator_data; -@@ -1597,8 +1601,10 @@ static void bfq_deactivate_request(struct request_queue *q, struct request *rq) - BUG_ON(bfqd->rq_in_driver == 0); - bfqd->rq_in_driver--; - } -+#endif - --static void bfq_remove_request(struct request *rq) -+static void bfq_remove_request(struct request_queue *q, -+ struct request *rq) - { - struct bfq_queue *bfqq = RQ_BFQQ(rq); - struct bfq_data *bfqd = bfqq->bfqd; -@@ -1619,6 +1625,10 @@ static void 
bfq_remove_request(struct request *rq) - bfqd->queued--; - elv_rb_del(&bfqq->sort_list, rq); - -+ elv_rqhash_del(q, rq); -+ if (q->last_merge == rq) -+ q->last_merge = NULL; -+ - if (RB_EMPTY_ROOT(&bfqq->sort_list)) { - bfqq->next_rq = NULL; - -@@ -1659,13 +1669,36 @@ static void bfq_remove_request(struct request *rq) - bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); - } - --static enum elv_merge bfq_merge(struct request_queue *q, struct request **req, -- struct bio *bio) -+static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) -+{ -+ struct request_queue *q = hctx->queue; -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct request *free = NULL; -+ bool ret; -+ -+ spin_lock_irq(&bfqd->lock); -+ ret = blk_mq_sched_try_merge(q, bio, &free); -+ -+ /* -+ * XXX Not yet freeing without lock held, to avoid an -+ * inconsistency with respect to the lock-protected invocation -+ * of blk_mq_sched_try_insert_merge in bfq_bio_merge. Waiting -+ * for clarifications from Jens. 
-+ */ -+ if (free) -+ blk_mq_free_request(free); -+ spin_unlock_irq(&bfqd->lock); -+ -+ return ret; -+} -+ -+static int bfq_request_merge(struct request_queue *q, struct request **req, -+ struct bio *bio) - { - struct bfq_data *bfqd = q->elevator->elevator_data; - struct request *__rq; - -- __rq = bfq_find_rq_fmerge(bfqd, bio); -+ __rq = bfq_find_rq_fmerge(bfqd, bio, q); - if (__rq && elv_bio_merge_ok(__rq, bio)) { - *req = __rq; - return ELEVATOR_FRONT_MERGE; -@@ -1674,7 +1707,7 @@ static enum elv_merge bfq_merge(struct request_queue *q, struct request **req, - return ELEVATOR_NO_MERGE; - } - --static void bfq_merged_request(struct request_queue *q, struct request *req, -+static void bfq_request_merged(struct request_queue *q, struct request *req, - enum elv_merge type) - { - if (type == ELEVATOR_FRONT_MERGE && -@@ -1689,6 +1722,8 @@ static void bfq_merged_request(struct request_queue *q, struct request *req, - /* Reposition request in its sort_list */ - elv_rb_del(&bfqq->sort_list, req); - elv_rb_add(&bfqq->sort_list, req); -+ -+ spin_lock_irq(&bfqd->lock); - /* Choose next request to be served for bfqq */ - prev = bfqq->next_rq; - next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, -@@ -1704,22 +1739,19 @@ static void bfq_merged_request(struct request_queue *q, struct request *req, - bfq_updated_next_req(bfqd, bfqq); - bfq_pos_tree_add_move(bfqd, bfqq); - } -+ spin_unlock_irq(&bfqd->lock); - } - } - --#ifdef BFQ_GROUP_IOSCHED_ENABLED --static void bfq_bio_merged(struct request_queue *q, struct request *req, -- struct bio *bio) --{ -- bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio->bi_opf); --} --#endif -- --static void bfq_merged_requests(struct request_queue *q, struct request *rq, -+static void bfq_requests_merged(struct request_queue *q, struct request *rq, - struct request *next) - { - struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next); - -+ if (!RB_EMPTY_NODE(&rq->rb_node)) -+ goto end; -+ spin_lock_irq(&bfqq->bfqd->lock); -+ - /* - 
* If next and rq belong to the same bfq_queue and next is older - * than rq, then reposition rq in the fifo (by substituting next -@@ -1740,7 +1772,10 @@ static void bfq_merged_requests(struct request_queue *q, struct request *rq, - if (bfqq->next_rq == next) - bfqq->next_rq = rq; - -- bfq_remove_request(next); -+ bfq_remove_request(q, next); -+ -+ spin_unlock_irq(&bfqq->bfqd->lock); -+end: - bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); - } - -@@ -1786,7 +1821,7 @@ static void bfq_end_wr(struct bfq_data *bfqd) - { - struct bfq_queue *bfqq; - -- spin_lock_irq(bfqd->queue->queue_lock); -+ spin_lock_irq(&bfqd->lock); - - list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) - bfq_bfqq_end_wr(bfqq); -@@ -1794,7 +1829,7 @@ static void bfq_end_wr(struct bfq_data *bfqd) - bfq_bfqq_end_wr(bfqq); - bfq_end_wr_async(bfqd); - -- spin_unlock_irq(bfqd->queue->queue_lock); -+ spin_unlock_irq(&bfqd->lock); - } - - static sector_t bfq_io_struct_pos(void *io_struct, bool request) -@@ -2184,8 +2219,8 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, - bfq_put_queue(bfqq); - } - --static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq, -- struct bio *bio) -+static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, -+ struct bio *bio) - { - struct bfq_data *bfqd = q->elevator->elevator_data; - bool is_sync = op_is_sync(bio->bi_opf); -@@ -2203,7 +2238,7 @@ static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq, - * merge only if rq is queued there. - * Queue lock is held here. 
- */ -- bic = bfq_bic_lookup(bfqd, current->io_context); -+ bic = bfq_bic_lookup(bfqd, current->io_context, q); - if (!bic) - return false; - -@@ -2228,12 +2263,6 @@ static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq, - return bfqq == RQ_BFQQ(rq); - } - --static int bfq_allow_rq_merge(struct request_queue *q, struct request *rq, -- struct request *next) --{ -- return RQ_BFQQ(rq) == RQ_BFQQ(next); --} -- - /* - * Set the maximum time for the in-service queue to consume its - * budget. This prevents seeky processes from lowering the throughput. -@@ -2264,7 +2293,6 @@ static void __bfq_set_in_service_queue(struct bfq_data *bfqd, - { - if (bfqq) { - bfqg_stats_update_avg_queue_size(bfqq_group(bfqq)); -- bfq_mark_bfqq_must_alloc(bfqq); - bfq_clear_bfqq_fifo_expire(bfqq); - - bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; -@@ -2703,27 +2731,28 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) - } - - /* -- * Move request from internal lists to the dispatch list of the request queue -+ * Remove request from internal lists. - */ --static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) -+static void bfq_dispatch_remove(struct request_queue *q, struct request *rq) - { - struct bfq_queue *bfqq = RQ_BFQQ(rq); - - /* -- * For consistency, the next instruction should have been executed -- * after removing the request from the queue and dispatching it. -- * We execute instead this instruction before bfq_remove_request() -- * (and hence introduce a temporary inconsistency), for efficiency. -- * In fact, in a forced_dispatch, this prevents two counters related -- * to bfqq->dispatched to risk to be uselessly decremented if bfqq -- * is not in service, and then to be incremented again after -- * incrementing bfqq->dispatched. -+ * For consistency, the next instruction should have been -+ * executed after removing the request from the queue and -+ * dispatching it. 
We execute instead this instruction before -+ * bfq_remove_request() (and hence introduce a temporary -+ * inconsistency), for efficiency. In fact, should this -+ * dispatch occur for a non in-service bfqq, this anticipated -+ * increment prevents two counters related to bfqq->dispatched -+ * from risking to be, first, uselessly decremented, and then -+ * incremented again when the (new) value of bfqq->dispatched -+ * happens to be taken into account. - */ - bfqq->dispatched++; - bfq_update_peak_rate(q->elevator->elevator_data, rq); - -- bfq_remove_request(rq); -- elv_dispatch_sort(q, rq); -+ bfq_remove_request(q, rq); - } - - static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) -@@ -3605,7 +3634,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); - - if (bfq_may_expire_for_budg_timeout(bfqq) && -- !hrtimer_active(&bfqd->idle_slice_timer) && -+ !bfq_bfqq_wait_request(bfqq) && - !bfq_bfqq_must_idle(bfqq)) - goto expire; - -@@ -3641,7 +3670,6 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - * arrives. - */ - if (bfq_bfqq_wait_request(bfqq)) { -- BUG_ON(!hrtimer_active(&bfqd->idle_slice_timer)); - /* - * If we get here: 1) at least a new request - * has arrived but we have not disabled the -@@ -3668,7 +3696,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - * for a new request, or has requests waiting for a completion and - * may idle after their completion, then keep it anyway. - */ -- if (hrtimer_active(&bfqd->idle_slice_timer) || -+ if (bfq_bfqq_wait_request(bfqq) || - (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) { - bfqq = NULL; - goto keep_queue; -@@ -3753,13 +3781,11 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) - } - - /* -- * Dispatch one request from bfqq, moving it to the request queue -- * dispatch list. -+ * Dispatch next request from bfqq. 
- */ --static int bfq_dispatch_request(struct bfq_data *bfqd, -- struct bfq_queue *bfqq) -+static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) - { -- int dispatched = 0; - struct request *rq = bfqq->next_rq; - unsigned long service_to_charge; - -@@ -3775,7 +3801,7 @@ static int bfq_dispatch_request(struct bfq_data *bfqd, - - BUG_ON(bfqq->entity.budget < bfqq->entity.service); - -- bfq_dispatch_insert(bfqd->queue, rq); -+ bfq_dispatch_remove(bfqd->queue, rq); - - /* - * If weight raising has to terminate for bfqq, then next -@@ -3791,86 +3817,61 @@ static int bfq_dispatch_request(struct bfq_data *bfqd, - bfq_update_wr_data(bfqd, bfqq); - - bfq_log_bfqq(bfqd, bfqq, -- "dispatched %u sec req (%llu), budg left %d", -+ "dispatched %u sec req (%llu), budg left %d, new disp_nr %d", - blk_rq_sectors(rq), - (unsigned long long) blk_rq_pos(rq), -- bfq_bfqq_budget_left(bfqq)); -- -- dispatched++; -+ bfq_bfqq_budget_left(bfqq), -+ bfqq->dispatched); - - if (!bfqd->in_service_bic) { - atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); - bfqd->in_service_bic = RQ_BIC(rq); - } - -+ /* -+ * Expire bfqq, pretending that its budget expired, if bfqq -+ * belongs to CLASS_IDLE and other queues are waiting for -+ * service. -+ */ - if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq)) - goto expire; - -- return dispatched; -+ return rq; - - expire: - bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_BUDGET_EXHAUSTED); -- return dispatched; --} -- --static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq) --{ -- int dispatched = 0; -- -- while (bfqq->next_rq) { -- bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq); -- dispatched++; -- } -- -- BUG_ON(!list_empty(&bfqq->fifo)); -- return dispatched; -+ return rq; - } - --/* -- * Drain our current requests. -- * Used for barriers and when switching io schedulers on-the-fly. 
-- */ --static int bfq_forced_dispatch(struct bfq_data *bfqd) -+static bool bfq_has_work(struct blk_mq_hw_ctx *hctx) - { -- struct bfq_queue *bfqq, *n; -- struct bfq_service_tree *st; -- int dispatched = 0; -- -- bfqq = bfqd->in_service_queue; -- if (bfqq) -- __bfq_bfqq_expire(bfqd, bfqq); -+ struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; - - /* -- * Loop through classes, and be careful to leave the scheduler -- * in a consistent state, as feedback mechanisms and vtime -- * updates cannot be disabled during the process. -+ * Avoiding lock: a race on bfqd->busy_queues should cause at -+ * most a call to dispatch for nothing - */ -- list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) { -- st = bfq_entity_service_tree(&bfqq->entity); -- -- dispatched += __bfq_forced_dispatch_bfqq(bfqq); -- -- bfqq->max_budget = bfq_max_budget(bfqd); -- bfq_forget_idle(st); -- } -- -- BUG_ON(bfqd->busy_queues != 0); -- -- return dispatched; -+ return !list_empty_careful(&bfqd->dispatch) || -+ bfqd->busy_queues > 0; - } - --static int bfq_dispatch_requests(struct request_queue *q, int force) -+static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - { -- struct bfq_data *bfqd = q->elevator->elevator_data; -- struct bfq_queue *bfqq; -+ struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; -+ struct request *rq = NULL; -+ struct bfq_queue *bfqq = NULL; -+ -+ if (!list_empty(&bfqd->dispatch)) { -+ rq = list_first_entry(&bfqd->dispatch, struct request, -+ queuelist); -+ list_del_init(&rq->queuelist); -+ goto exit; -+ } - - bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); - - if (bfqd->busy_queues == 0) -- return 0; -- -- if (unlikely(force)) -- return bfq_forced_dispatch(bfqd); -+ goto exit; - - /* - * Force device to serve one request at a time if -@@ -3885,25 +3886,39 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) - * throughput. 
- */ - if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0) -- return 0; -+ goto exit; - - bfqq = bfq_select_queue(bfqd); - if (!bfqq) -- return 0; -+ goto exit; - - BUG_ON(bfqq->entity.budget < bfqq->entity.service); - - BUG_ON(bfq_bfqq_wait_request(bfqq)); - -- if (!bfq_dispatch_request(bfqd, bfqq)) -- return 0; -- -- bfq_log_bfqq(bfqd, bfqq, "dispatched %s request", -- bfq_bfqq_sync(bfqq) ? "sync" : "async"); -+ rq = bfq_dispatch_rq_from_bfqq(bfqd, bfqq); - - BUG_ON(bfqq->next_rq == NULL && - bfqq->entity.budget < bfqq->entity.service); -- return 1; -+exit: -+ if (rq) { -+ rq->rq_flags |= RQF_STARTED; -+ bfqd->rq_in_driver++; -+ } -+ -+ return rq; -+} -+ -+static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) -+{ -+ struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; -+ struct request *rq; -+ -+ spin_lock_irq(&bfqd->lock); -+ rq = __bfq_dispatch_request(hctx); -+ spin_unlock_irq(&bfqd->lock); -+ -+ return rq; - } - - /* -@@ -3921,13 +3936,14 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - - BUG_ON(bfqq->ref <= 0); - -- bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref); -+ if (bfqq->bfqd) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref); -+ - bfqq->ref--; - if (bfqq->ref) - return; - - BUG_ON(rb_first(&bfqq->sort_list)); -- BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); - BUG_ON(bfqq->entity.tree); - BUG_ON(bfq_bfqq_busy(bfqq)); - -@@ -3942,7 +3958,8 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - */ - hlist_del_init(&bfqq->burst_list_node); - -- bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); -+ if (bfqq->bfqd) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); - - kmem_cache_free(bfq_pool, bfqq); - #ifdef BFQ_GROUP_IOSCHED_ENABLED -@@ -3983,29 +4000,52 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) - bfq_put_queue(bfqq); /* release process reference */ - } - --static void bfq_exit_icq(struct io_cq *icq) 
-+static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) - { -- struct bfq_io_cq *bic = icq_to_bic(icq); -- struct bfq_data *bfqd = bic_to_bfqd(bic); -+ struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync); -+ struct bfq_data *bfqd; - -- if (bic_to_bfqq(bic, false)) { -- bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, false)); -- bic_set_bfqq(bic, NULL, false); -- } -+ if (bfqq) -+ bfqd = bfqq->bfqd; /* NULL if scheduler already exited */ - -- if (bic_to_bfqq(bic, true)) { -+ if (bfqq && bfqd) { -+ spin_lock_irq(&bfqd->lock); - /* - * If the bic is using a shared queue, put the reference - * taken on the io_context when the bic started using a - * shared bfq_queue. - */ -- if (bfq_bfqq_coop(bic_to_bfqq(bic, true))) -- put_io_context(icq->ioc); -- bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, true)); -- bic_set_bfqq(bic, NULL, true); -+ if (is_sync && bfq_bfqq_coop(bfqq)) -+ put_io_context(bic->icq.ioc); -+ bfq_exit_bfqq(bfqd, bfqq); -+ bic_set_bfqq(bic, NULL, is_sync); -+ spin_unlock_irq(&bfqd->lock); - } - } - -+static void bfq_exit_icq_body(struct work_struct *work) -+{ -+ struct bfq_io_cq *bic = -+ container_of(work, struct bfq_io_cq, exit_icq_work); -+ -+ bfq_exit_icq_bfqq(bic, true); -+ bfq_exit_icq_bfqq(bic, false); -+} -+ -+static void bfq_init_icq(struct io_cq *icq) -+{ -+ struct bfq_io_cq *bic = icq_to_bic(icq); -+ -+ INIT_WORK(&bic->exit_icq_work, bfq_exit_icq_body); -+} -+ -+static void bfq_exit_icq(struct io_cq *icq) -+{ -+ struct bfq_io_cq *bic = icq_to_bic(icq); -+ -+ kblockd_schedule_work(&bic->exit_icq_work); -+} -+ - /* - * Update the entity prio values; note that the new values will not - * be used until the next (re)activation. 
-@@ -4015,6 +4055,10 @@ static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, - { - struct task_struct *tsk = current; - int ioprio_class; -+ struct bfq_data *bfqd = bfqq->bfqd; -+ -+ if (!bfqd) -+ return; - - ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); - switch (ioprio_class) { -@@ -4095,6 +4139,8 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, - INIT_HLIST_NODE(&bfqq->burst_list_node); - BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); - -+ spin_lock_init(&bfqq->lock); -+ - bfqq->ref = 0; - bfqq->bfqd = bfqd; - -@@ -4351,22 +4397,13 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, - if (budget_timeout) - bfq_bfqq_expire(bfqd, bfqq, false, - BFQ_BFQQ_BUDGET_TIMEOUT); -- -- /* -- * Let the request rip immediately, or let a new queue be -- * selected if bfqq has just been expired. -- */ -- __blk_run_queue(bfqd->queue); - } - } - --static void bfq_insert_request(struct request_queue *q, struct request *rq) -+static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) - { -- struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; - -- assert_spin_locked(bfqd->queue->queue_lock); -- - /* - * An unplug may trigger a requeue of a request from the device - * driver: make sure we are in process context while trying to -@@ -4381,8 +4418,8 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq) - * Release the request's reference to the old bfqq - * and make sure one is taken to the shared queue. 
- */ -- new_bfqq->allocated[rq_data_dir(rq)]++; -- bfqq->allocated[rq_data_dir(rq)]--; -+ new_bfqq->allocated++; -+ bfqq->allocated--; - new_bfqq->ref++; - bfq_clear_bfqq_just_created(bfqq); - if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) -@@ -4406,6 +4443,55 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq) - bfq_rq_enqueued(bfqd, bfqq, rq); - } - -+static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, -+ bool at_head) -+{ -+ struct request_queue *q = hctx->queue; -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ -+ spin_lock_irq(&bfqd->lock); -+ if (blk_mq_sched_try_insert_merge(q, rq)) -+ goto done; -+ spin_unlock_irq(&bfqd->lock); -+ -+ blk_mq_sched_request_inserted(rq); -+ -+ spin_lock_irq(&bfqd->lock); -+ if (at_head || blk_rq_is_passthrough(rq)) { -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ -+ if (at_head) -+ list_add(&rq->queuelist, &bfqd->dispatch); -+ else -+ list_add_tail(&rq->queuelist, &bfqd->dispatch); -+ -+ if (bfqq) -+ bfqq->dispatched++; -+ } else { -+ __bfq_insert_request(bfqd, rq); -+ -+ if (rq_mergeable(rq)) { -+ elv_rqhash_add(q, rq); -+ if (!q->last_merge) -+ q->last_merge = rq; -+ } -+ } -+done: -+ spin_unlock_irq(&bfqd->lock); -+} -+ -+static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, -+ struct list_head *list, bool at_head) -+{ -+ while (!list_empty(list)) { -+ struct request *rq; -+ -+ rq = list_first_entry(list, struct request, queuelist); -+ list_del_init(&rq->queuelist); -+ bfq_insert_request(hctx, rq, at_head); -+ } -+} -+ - static void bfq_update_hw_tag(struct bfq_data *bfqd) - { - bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver, -@@ -4431,27 +4517,17 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd) - bfqd->hw_tag_samples = 0; - } - --static void bfq_completed_request(struct request_queue *q, struct request *rq) -+static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) - { -- struct bfq_queue *bfqq = RQ_BFQQ(rq); -- struct 
bfq_data *bfqd = bfqq->bfqd; - u64 now_ns; - u32 delta_us; - -- bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left", -- blk_rq_sectors(rq)); -- -- assert_spin_locked(bfqd->queue->queue_lock); - bfq_update_hw_tag(bfqd); - - BUG_ON(!bfqd->rq_in_driver); - BUG_ON(!bfqq->dispatched); - bfqd->rq_in_driver--; - bfqq->dispatched--; -- bfqg_stats_update_completion(bfqq_group(bfqq), -- rq_start_time_ns(rq), -- rq_io_start_time_ns(rq), -- rq->cmd_flags); - - if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { - BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); -@@ -4477,7 +4553,8 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) - */ - delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC); - -- bfq_log(bfqd, "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu", -+ bfq_log_bfqq(bfqd, bfqq, -+ "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu", - delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size, - (USEC_PER_SEC* - (u64)((bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us)) -@@ -4527,7 +4604,7 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) - if (bfqd->in_service_queue == bfqq) { - if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) { - bfq_arm_slice_timer(bfqd); -- goto out; -+ return; - } else if (bfq_may_expire_for_budg_timeout(bfqq)) - bfq_bfqq_expire(bfqd, bfqq, false, - BFQ_BFQQ_BUDGET_TIMEOUT); -@@ -4537,68 +4614,55 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) - bfq_bfqq_expire(bfqd, bfqq, false, - BFQ_BFQQ_NO_MORE_REQUESTS); - } -- -- if (!bfqd->rq_in_driver) -- bfq_schedule_dispatch(bfqd); -- --out: -- return; - } - --static int __bfq_may_queue(struct bfq_queue *bfqq) -+static void bfq_put_rq_priv_body(struct bfq_queue *bfqq) - { -- if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) { -- bfq_clear_bfqq_must_alloc(bfqq); -- return ELV_MQUEUE_MUST; -- } -+ bfqq->allocated--; - -- return ELV_MQUEUE_MAY; -+ 
bfq_put_queue(bfqq); - } - --static int bfq_may_queue(struct request_queue *q, unsigned int op) -+static void bfq_put_rq_private(struct request_queue *q, struct request *rq) - { -- struct bfq_data *bfqd = q->elevator->elevator_data; -- struct task_struct *tsk = current; -- struct bfq_io_cq *bic; -- struct bfq_queue *bfqq; -- -- /* -- * Don't force setup of a queue from here, as a call to may_queue -- * does not necessarily imply that a request actually will be -- * queued. So just lookup a possibly existing queue, or return -- * 'may queue' if that fails. -- */ -- bic = bfq_bic_lookup(bfqd, tsk->io_context); -- if (!bic) -- return ELV_MQUEUE_MAY; -- -- bfqq = bic_to_bfqq(bic, op_is_sync(op)); -- if (bfqq) -- return __bfq_may_queue(bfqq); -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ struct bfq_data *bfqd = bfqq->bfqd; - -- return ELV_MQUEUE_MAY; --} -+ if (rq->rq_flags & RQF_STARTED) -+ bfqg_stats_update_completion(bfqq_group(bfqq), -+ rq_start_time_ns(rq), -+ rq_io_start_time_ns(rq), -+ rq->cmd_flags); - --/* -- * Queue lock held here. -- */ --static void bfq_put_request(struct request *rq) --{ -- struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ if (likely(rq->rq_flags & RQF_STARTED)) { -+ unsigned long flags; - -- if (bfqq) { -- const int rw = rq_data_dir(rq); -+ spin_lock_irqsave(&bfqd->lock, flags); - -- BUG_ON(!bfqq->allocated[rw]); -- bfqq->allocated[rw]--; -+ bfq_completed_request(bfqq, bfqd); -+ bfq_put_rq_priv_body(bfqq); - -- rq->elv.priv[0] = NULL; -- rq->elv.priv[1] = NULL; -+ spin_unlock_irqrestore(&bfqd->lock, flags); -+ } else { -+ /* -+ * Request rq may be still/already in the scheduler, -+ * in which case we need to remove it. And we cannot -+ * defer such a check and removal, to avoid -+ * inconsistencies in the time interval from the end -+ * of this function to the start of the deferred work. 
-+ * Fortunately, this situation occurs only in process -+ * context, so taking the scheduler lock does not -+ * cause any deadlock, even if other locks are already -+ * (correctly) held by this process. -+ */ - -- bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", -- bfqq, bfqq->ref); -- bfq_put_queue(bfqq); -+ if (!RB_EMPTY_NODE(&rq->rb_node)) -+ bfq_remove_request(q, rq); -+ bfq_put_rq_priv_body(bfqq); - } -+ -+ rq->elv.priv[0] = NULL; -+ rq->elv.priv[1] = NULL; - } - - /* -@@ -4630,18 +4694,16 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) - /* - * Allocate bfq data structures associated with this request. - */ --static int bfq_set_request(struct request_queue *q, struct request *rq, -- struct bio *bio, gfp_t gfp_mask) -+static int bfq_get_rq_private(struct request_queue *q, struct request *rq, -+ struct bio *bio) - { - struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq); -- const int rw = rq_data_dir(rq); - const int is_sync = rq_is_sync(rq); - struct bfq_queue *bfqq; -- unsigned long flags; - bool bfqq_already_existing = false, split = false; - -- spin_lock_irqsave(q->queue_lock, flags); -+ spin_lock_irq(&bfqd->lock); - - if (!bic) - goto queue_fail; -@@ -4661,7 +4723,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, - bic_set_bfqq(bic, bfqq, is_sync); - if (split && is_sync) { - bfq_log_bfqq(bfqd, bfqq, -- "set_request: was_in_list %d " -+ "get_request: was_in_list %d " - "was_in_large_burst %d " - "large burst in progress %d", - bic->was_in_burst_list, -@@ -4671,12 +4733,12 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, - if ((bic->was_in_burst_list && bfqd->large_burst) || - bic->saved_in_large_burst) { - bfq_log_bfqq(bfqd, bfqq, -- "set_request: marking in " -+ "get_request: marking in " - "large burst"); - bfq_mark_bfqq_in_large_burst(bfqq); - } else { - bfq_log_bfqq(bfqd, bfqq, -- "set_request: clearing in " -+ "get_request: 
clearing in " - "large burst"); - bfq_clear_bfqq_in_large_burst(bfqq); - if (bic->was_in_burst_list) -@@ -4703,9 +4765,12 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, - } - } - -- bfqq->allocated[rw]++; -+ bfqq->allocated++; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "get_request: new allocated %d", bfqq->allocated); -+ - bfqq->ref++; -- bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, bfqq->ref); -+ bfq_log_bfqq(bfqd, bfqq, "get_request: bfqq %p, %d", bfqq, bfqq->ref); - - rq->elv.priv[0] = bic; - rq->elv.priv[1] = bfqq; -@@ -4733,26 +4798,53 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, - if (unlikely(bfq_bfqq_just_created(bfqq))) - bfq_handle_burst(bfqd, bfqq); - -- spin_unlock_irqrestore(q->queue_lock, flags); -+ spin_unlock_irq(&bfqd->lock); - - return 0; - - queue_fail: -- bfq_schedule_dispatch(bfqd); -- spin_unlock_irqrestore(q->queue_lock, flags); -+ spin_unlock_irq(&bfqd->lock); - - return 1; - } - --static void bfq_kick_queue(struct work_struct *work) -+static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) - { -- struct bfq_data *bfqd = -- container_of(work, struct bfq_data, unplug_work); -- struct request_queue *q = bfqd->queue; -+ struct bfq_data *bfqd = bfqq->bfqd; -+ enum bfqq_expiration reason; -+ unsigned long flags; -+ -+ spin_lock_irqsave(&bfqd->lock, flags); -+ bfq_clear_bfqq_wait_request(bfqq); - -- spin_lock_irq(q->queue_lock); -- __blk_run_queue(q); -- spin_unlock_irq(q->queue_lock); -+ if (bfqq != bfqd->in_service_queue) { -+ spin_unlock_irqrestore(&bfqd->lock, flags); -+ return; -+ } -+ -+ if (bfq_bfqq_budget_timeout(bfqq)) -+ /* -+ * Also here the queue can be safely expired -+ * for budget timeout without wasting -+ * guarantees -+ */ -+ reason = BFQ_BFQQ_BUDGET_TIMEOUT; -+ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) -+ /* -+ * The queue may not be empty upon timer expiration, -+ * because we may not disable the timer when the -+ * first request of the 
in-service queue arrives -+ * during disk idling. -+ */ -+ reason = BFQ_BFQQ_TOO_IDLE; -+ else -+ goto schedule_dispatch; -+ -+ bfq_bfqq_expire(bfqd, bfqq, true, reason); -+ -+schedule_dispatch: -+ spin_unlock_irqrestore(&bfqd->lock, flags); -+ bfq_schedule_dispatch(bfqd); - } - - /* -@@ -4763,59 +4855,22 @@ static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) - { - struct bfq_data *bfqd = container_of(timer, struct bfq_data, - idle_slice_timer); -- struct bfq_queue *bfqq; -- unsigned long flags; -- enum bfqq_expiration reason; -- -- spin_lock_irqsave(bfqd->queue->queue_lock, flags); -+ struct bfq_queue *bfqq = bfqd->in_service_queue; - -- bfqq = bfqd->in_service_queue; - /* - * Theoretical race here: the in-service queue can be NULL or -- * different from the queue that was idling if the timer handler -- * spins on the queue_lock and a new request arrives for the -- * current queue and there is a full dispatch cycle that changes -- * the in-service queue. This can hardly happen, but in the worst -- * case we just expire a queue too early. -+ * different from the queue that was idling if a new request -+ * arrives for the current queue and there is a full dispatch -+ * cycle that changes the in-service queue. This can hardly -+ * happen, but in the worst case we just expire a queue too -+ * early. - */ -- if (bfqq) { -- bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); -- bfq_clear_bfqq_wait_request(bfqq); -- -- if (bfq_bfqq_budget_timeout(bfqq)) -- /* -- * Also here the queue can be safely expired -- * for budget timeout without wasting -- * guarantees -- */ -- reason = BFQ_BFQQ_BUDGET_TIMEOUT; -- else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) -- /* -- * The queue may not be empty upon timer expiration, -- * because we may not disable the timer when the -- * first request of the in-service queue arrives -- * during disk idling. 
-- */ -- reason = BFQ_BFQQ_TOO_IDLE; -- else -- goto schedule_dispatch; -- -- bfq_bfqq_expire(bfqd, bfqq, true, reason); -- } -- --schedule_dispatch: -- bfq_schedule_dispatch(bfqd); -+ if (bfqq) -+ bfq_idle_slice_timer_body(bfqq); - -- spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); - return HRTIMER_NORESTART; - } - --static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) --{ -- hrtimer_cancel(&bfqd->idle_slice_timer); -- cancel_work_sync(&bfqd->unplug_work); --} -- - static void __bfq_put_async_bfqq(struct bfq_data *bfqd, - struct bfq_queue **bfqq_ptr) - { -@@ -4852,28 +4907,40 @@ static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) - static void bfq_exit_queue(struct elevator_queue *e) - { - struct bfq_data *bfqd = e->elevator_data; -- struct request_queue *q = bfqd->queue; - struct bfq_queue *bfqq, *n; - -- bfq_shutdown_timer_wq(bfqd); -- -- spin_lock_irq(q->queue_lock); -+ hrtimer_cancel(&bfqd->idle_slice_timer); - - BUG_ON(bfqd->in_service_queue); -- list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) -- bfq_deactivate_bfqq(bfqd, bfqq, false, false); - -- spin_unlock_irq(q->queue_lock); -+ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) { -+ if (bfqq->bic) /* bfqqs without bic are handled below */ -+ cancel_work_sync(&bfqq->bic->exit_icq_work); -+ } -+ -+ spin_lock_irq(&bfqd->lock); -+ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) { -+ bfq_deactivate_bfqq(bfqd, bfqq, false, false); -+ /* -+ * Make sure that deferred exit_icq_work completes -+ * without errors for bfq_queues without bic -+ */ -+ if (!bfqq->bic) -+ bfqq->bfqd = NULL; -+ } -+ spin_unlock_irq(&bfqd->lock); - -- bfq_shutdown_timer_wq(bfqd); -+ hrtimer_cancel(&bfqd->idle_slice_timer); - - BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); - - #ifdef BFQ_GROUP_IOSCHED_ENABLED -- blkcg_deactivate_policy(q, &blkcg_policy_bfq); -+ blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq); - #else -+ spin_lock_irq(&bfqd->lock); - 
bfq_put_async_queues(bfqd, bfqd->root_group); - kfree(bfqd->root_group); -+ spin_unlock_irq(&bfqd->lock); - #endif - - kfree(bfqd); -@@ -4934,10 +5001,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - - bfqd->queue = q; - -- spin_lock_irq(q->queue_lock); -- q->elevator = eq; -- spin_unlock_irq(q->queue_lock); -- - bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node); - if (!bfqd->root_group) - goto out_free; -@@ -4951,8 +5014,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - bfqd->queue_weights_tree = RB_ROOT; - bfqd->group_weights_tree = RB_ROOT; - -- INIT_WORK(&bfqd->unplug_work, bfq_kick_queue); -- - INIT_LIST_HEAD(&bfqd->active_list); - INIT_LIST_HEAD(&bfqd->idle_list); - INIT_HLIST_HEAD(&bfqd->burst_list); -@@ -5001,6 +5062,11 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3; - bfqd->device_speed = BFQ_BFQD_FAST; - -+ spin_lock_init(&bfqd->lock); -+ INIT_LIST_HEAD(&bfqd->dispatch); -+ -+ q->elevator = eq; -+ - return 0; - - out_free: -@@ -5057,7 +5123,7 @@ static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) - num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n", - bfqd->queued); - -- spin_lock_irq(bfqd->queue->queue_lock); -+ spin_lock_irq(&bfqd->lock); - - num_char += sprintf(page + num_char, "Active:\n"); - list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) { -@@ -5086,7 +5152,7 @@ static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) - jiffies_to_msecs(bfqq->wr_cur_max_time)); - } - -- spin_unlock_irq(bfqd->queue->queue_lock); -+ spin_unlock_irq(&bfqd->lock); - - return num_char; - } -@@ -5294,35 +5360,31 @@ static struct elv_fs_entry bfq_attrs[] = { - __ATTR_NULL - }; - --static struct elevator_type iosched_bfq = { -- .ops.sq = { -- .elevator_merge_fn = bfq_merge, -- .elevator_merged_fn = bfq_merged_request, -- .elevator_merge_req_fn = 
bfq_merged_requests, --#ifdef BFQ_GROUP_IOSCHED_ENABLED -- .elevator_bio_merged_fn = bfq_bio_merged, --#endif -- .elevator_allow_bio_merge_fn = bfq_allow_bio_merge, -- .elevator_allow_rq_merge_fn = bfq_allow_rq_merge, -- .elevator_dispatch_fn = bfq_dispatch_requests, -- .elevator_add_req_fn = bfq_insert_request, -- .elevator_activate_req_fn = bfq_activate_request, -- .elevator_deactivate_req_fn = bfq_deactivate_request, -- .elevator_completed_req_fn = bfq_completed_request, -- .elevator_former_req_fn = elv_rb_former_request, -- .elevator_latter_req_fn = elv_rb_latter_request, -- .elevator_init_icq_fn = bfq_init_icq, -- .elevator_exit_icq_fn = bfq_exit_icq, -- .elevator_set_req_fn = bfq_set_request, -- .elevator_put_req_fn = bfq_put_request, -- .elevator_may_queue_fn = bfq_may_queue, -- .elevator_init_fn = bfq_init_queue, -- .elevator_exit_fn = bfq_exit_queue, -+static struct elevator_type iosched_bfq_mq = { -+ .ops.mq = { -+ .get_rq_priv = bfq_get_rq_private, -+ .put_rq_priv = bfq_put_rq_private, -+ .init_icq = bfq_init_icq, -+ .exit_icq = bfq_exit_icq, -+ .insert_requests = bfq_insert_requests, -+ .dispatch_request = bfq_dispatch_request, -+ .next_request = elv_rb_latter_request, -+ .former_request = elv_rb_former_request, -+ .allow_merge = bfq_allow_bio_merge, -+ .bio_merge = bfq_bio_merge, -+ .request_merge = bfq_request_merge, -+ .requests_merged = bfq_requests_merged, -+ .request_merged = bfq_request_merged, -+ .has_work = bfq_has_work, -+ .init_sched = bfq_init_queue, -+ .exit_sched = bfq_exit_queue, - }, -+ -+ .uses_mq = true, - .icq_size = sizeof(struct bfq_io_cq), - .icq_align = __alignof__(struct bfq_io_cq), - .elevator_attrs = bfq_attrs, -- .elevator_name = "bfq-sq", -+ .elevator_name = "bfq-mq", - .elevator_owner = THIS_MODULE, - }; - -@@ -5392,7 +5454,7 @@ static int __init bfq_init(void) - device_speed_thresh[0] = (4 * R_slow[0]) / 3; - device_speed_thresh[1] = (4 * R_slow[1]) / 3; - -- ret = elv_register(&iosched_bfq); -+ ret = 
elv_register(&iosched_bfq_mq); - if (ret) - goto err_pol_unreg; - -@@ -5412,8 +5474,8 @@ static int __init bfq_init(void) - - static void __exit bfq_exit(void) - { -- elv_unregister(&iosched_bfq); --#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ elv_unregister(&iosched_bfq_mq); -+#ifdef CONFIG_BFQ_GROUP_ENABLED - blkcg_policy_unregister(&blkcg_policy_bfq); - #endif - bfq_slab_kill(); -@@ -5422,5 +5484,6 @@ static void __exit bfq_exit(void) - module_init(bfq_init); - module_exit(bfq_exit); - --MODULE_AUTHOR("Arianna Avanzini, Fabio Checconi, Paolo Valente"); -+MODULE_AUTHOR("Paolo Valente"); - MODULE_LICENSE("GPL"); -+MODULE_DESCRIPTION("MQ Budget Fair Queueing I/O Scheduler"); -diff --git a/block/bfq-mq.h b/block/bfq-mq.h -index 0f51f270469c..c3fcd5ebd735 100644 ---- a/block/bfq-mq.h -+++ b/block/bfq-mq.h -@@ -19,15 +19,8 @@ - #include <linux/hrtimer.h> - #include <linux/blk-cgroup.h> - --/* -- * Define an alternative macro to compile cgroups support. This is one -- * of the steps needed to let bfq-mq share the files bfq-sched.c and -- * bfq-cgroup.c with bfq-sq. For bfq-mq, the macro -- * BFQ_GROUP_IOSCHED_ENABLED will be defined as a function of whether -- * the configuration option CONFIG_BFQ_MQ_GROUP_IOSCHED, and not -- * CONFIG_BFQ_GROUP_IOSCHED, is defined. 
-- */ --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+/* see comments on CONFIG_BFQ_GROUP_IOSCHED in bfq.h */ -+#ifdef CONFIG_BFQ_MQ_GROUP_IOSCHED - #define BFQ_GROUP_IOSCHED_ENABLED - #endif - -@@ -259,8 +252,8 @@ struct bfq_queue { - struct request *next_rq; - /* number of sync and async requests queued */ - int queued[2]; -- /* number of sync and async requests currently allocated */ -- int allocated[2]; -+ /* number of requests currently allocated */ -+ int allocated; - /* number of pending metadata requests */ - int meta_pending; - /* fifo list of requests in sort_list */ -@@ -345,6 +338,8 @@ struct bfq_queue { - unsigned long wr_start_at_switch_to_srt; - - unsigned long split_time; /* time of last split */ -+ -+ spinlock_t lock; - }; - - /** -@@ -361,6 +356,9 @@ struct bfq_io_cq { - uint64_t blkcg_serial_nr; /* the current blkcg serial */ - #endif - -+ /* delayed work to exec the body of the the exit_icq handler */ -+ struct work_struct exit_icq_work; -+ - /* - * Snapshot of the has_short_time flag before merging; taken - * to remember its value while the queue is merged, so as to -@@ -402,11 +400,13 @@ enum bfq_device_speed { - /** - * struct bfq_data - per-device data structure. - * -- * All the fields are protected by the @queue lock. -+ * All the fields are protected by @lock. - */ - struct bfq_data { -- /* request queue for the device */ -+ /* device request queue */ - struct request_queue *queue; -+ /* dispatch queue */ -+ struct list_head dispatch; - - /* root bfq_group for the device */ - struct bfq_group *root_group; -@@ -460,8 +460,6 @@ struct bfq_data { - * the queue in service. 
- */ - struct hrtimer idle_slice_timer; -- /* delayed work to restart dispatching on the request queue */ -- struct work_struct unplug_work; - - /* bfq_queue in service */ - struct bfq_queue *in_service_queue; -@@ -612,6 +610,8 @@ struct bfq_data { - - /* fallback dummy bfqq for extreme OOM conditions */ - struct bfq_queue oom_bfqq; -+ -+ spinlock_t lock; - }; - - enum bfqq_state_flags { -@@ -622,7 +622,6 @@ enum bfqq_state_flags { - * waiting for a request - * without idling the device - */ -- BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ - BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ - BFQ_BFQQ_FLAG_has_short_ttime, /* queue has a short think time */ - BFQ_BFQQ_FLAG_sync, /* synchronous queue */ -@@ -661,7 +660,6 @@ BFQ_BFQQ_FNS(just_created); - BFQ_BFQQ_FNS(busy); - BFQ_BFQQ_FNS(wait_request); - BFQ_BFQQ_FNS(non_blocking_wait_rq); --BFQ_BFQQ_FNS(must_alloc); - BFQ_BFQQ_FNS(fifo_expire); - BFQ_BFQQ_FNS(has_short_ttime); - BFQ_BFQQ_FNS(sync); -@@ -692,7 +690,6 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ - char __pbuf[128]; \ - \ -- assert_spin_locked((bfqd)->queue->queue_lock); \ - blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ - pr_crit("%s bfq%d%c %s " fmt "\n", \ - checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -@@ -734,7 +731,6 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) 
do { \ - char __pbuf[128]; \ - \ -- assert_spin_locked((bfqd)->queue->queue_lock); \ - blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ - blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, \ - (bfqq)->pid, \ -@@ -961,7 +957,6 @@ static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) - - static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); - static void bfq_put_queue(struct bfq_queue *bfqq); --static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); - static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, - struct bio *bio, bool is_sync, - struct bfq_io_cq *bic); - -From bde5235de2241502c1c00337bd51c96d9b60b6df Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Fri, 3 Mar 2017 08:52:40 +0100 -Subject: [PATCH 13/51] Add checks and extra log messages - Part I - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 112 +++++++++++++++++++++++++++++++++++++++++++++++-- - 1 file changed, 109 insertions(+), 3 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index c963d92a32c2..40eadb3f7073 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -773,6 +773,8 @@ static int bfqq_process_refs(struct bfq_queue *bfqq) - { - int process_refs, io_refs; - -+ lockdep_assert_held(&bfqq->bfqd->lock); -+ - io_refs = bfqq->allocated; - process_refs = bfqq->ref - io_refs - bfqq->entity.on_st; - BUG_ON(process_refs < 0); -@@ -1483,6 +1485,8 @@ static void bfq_add_request(struct request *rq) - bfqq->queued[rq_is_sync(rq)]++; - bfqd->queued++; - -+ BUG_ON(!RQ_BFQQ(rq)); -+ BUG_ON(RQ_BFQQ(rq) != bfqq); - elv_rb_add(&bfqq->sort_list, rq); - - /* -@@ -1491,6 +1495,8 @@ static void bfq_add_request(struct request *rq) - prev = bfqq->next_rq; - next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); - BUG_ON(!next_rq); -+ BUG_ON(!RQ_BFQQ(next_rq)); -+ BUG_ON(RQ_BFQQ(next_rq) != bfqq); - 
bfqq->next_rq = next_rq; - - /* -@@ -1615,6 +1621,19 @@ static void bfq_remove_request(struct request_queue *q, - - if (bfqq->next_rq == rq) { - bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); -+ if (bfqq->next_rq && !RQ_BFQQ(bfqq->next_rq)) { -+ pr_crit("no bfqq! for next rq %p bfqq %p\n", -+ bfqq->next_rq, bfqq); -+ } -+ -+ BUG_ON(bfqq->next_rq && !RQ_BFQQ(bfqq->next_rq)); -+ if (bfqq->next_rq && RQ_BFQQ(bfqq->next_rq) != bfqq) { -+ pr_crit( -+ "wrong bfqq! for next rq %p, rq_bfqq %p bfqq %p\n", -+ bfqq->next_rq, RQ_BFQQ(bfqq->next_rq), bfqq); -+ } -+ BUG_ON(bfqq->next_rq && RQ_BFQQ(bfqq->next_rq) != bfqq); -+ - bfq_updated_next_req(bfqd, bfqq); - } - -@@ -1701,6 +1720,8 @@ static int bfq_request_merge(struct request_queue *q, struct request **req, - __rq = bfq_find_rq_fmerge(bfqd, bio, q); - if (__rq && elv_bio_merge_ok(__rq, bio)) { - *req = __rq; -+ bfq_log(bfqd, "request_merge: req %p", __rq); -+ - return ELEVATOR_FRONT_MERGE; - } - -@@ -1721,6 +1742,8 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, - - /* Reposition request in its sort_list */ - elv_rb_del(&bfqq->sort_list, req); -+ BUG_ON(!RQ_BFQQ(req)); -+ BUG_ON(RQ_BFQQ(req) != bfqq); - elv_rb_add(&bfqq->sort_list, req); - - spin_lock_irq(&bfqd->lock); -@@ -1729,7 +1752,13 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, - next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, - bfqd->last_position); - BUG_ON(!next_rq); -+ - bfqq->next_rq = next_rq; -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "requests_merged: req %p prev %p next_rq %p bfqq %p", -+ req, prev, next_rq, bfqq); -+ - /* - * If next_rq changes, update both the queue's budget to - * fit the new request and the queue's position in its -@@ -1748,8 +1777,16 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, - { - struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next); - -+ BUG_ON(!RQ_BFQQ(rq)); -+ BUG_ON(!RQ_BFQQ(next)); -+ - if (!RB_EMPTY_NODE(&rq->rb_node)) 
- goto end; -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "requests_merged: rq %p next %p bfqq %p next_bfqq %p", -+ rq, next, bfqq, next_bfqq); -+ - spin_lock_irq(&bfqq->bfqd->lock); - - /* -@@ -3847,6 +3884,9 @@ static bool bfq_has_work(struct blk_mq_hw_ctx *hctx) - { - struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; - -+ bfq_log(bfqd, "has_work, dispatch_non_empty %d busy_queues %d", -+ !list_empty_careful(&bfqd->dispatch), bfqd->busy_queues > 0); -+ - /* - * Avoiding lock: a race on bfqd->busy_queues should cause at - * most a call to dispatch for nothing -@@ -3865,6 +3905,8 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - rq = list_first_entry(&bfqd->dispatch, struct request, - queuelist); - list_del_init(&rq->queuelist); -+ bfq_log(bfqd, -+ "dispatch requests: picked %p from dispatch list", rq); - goto exit; - } - -@@ -3904,7 +3946,20 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - if (rq) { - rq->rq_flags |= RQF_STARTED; - bfqd->rq_in_driver++; -- } -+ if (bfqq) -+ bfq_log_bfqq(bfqd, bfqq, -+ "dispatched %s request %p, rq_in_driver %d", -+ bfq_bfqq_sync(bfqq) ? 
"sync" : "async", -+ rq, -+ bfqd->rq_in_driver); -+ else -+ bfq_log(bfqd, -+ "dispatched request %p from dispatch list, rq_in_driver %d", -+ rq, bfqd->rq_in_driver); -+ } else -+ bfq_log(bfqd, -+ "returned NULL request, rq_in_driver %d", -+ bfqd->rq_in_driver); - - return rq; - } -@@ -3944,6 +3999,7 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - return; - - BUG_ON(rb_first(&bfqq->sort_list)); -+ BUG_ON(bfqq->allocated != 0); - BUG_ON(bfqq->entity.tree); - BUG_ON(bfq_bfqq_busy(bfqq)); - -@@ -4043,6 +4099,7 @@ static void bfq_exit_icq(struct io_cq *icq) - { - struct bfq_io_cq *bic = icq_to_bic(icq); - -+ BUG_ON(!bic); - kblockd_schedule_work(&bic->exit_icq_work); - } - -@@ -4057,6 +4114,7 @@ static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, - int ioprio_class; - struct bfq_data *bfqd = bfqq->bfqd; - -+ WARN_ON(!bfqd); - if (!bfqd) - return; - -@@ -4404,6 +4462,10 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) - { - struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; - -+ assert_spin_locked(&bfqd->lock); -+ -+ bfq_log_bfqq(bfqd, bfqq, "__insert_req: rq %p bfqq %p", rq, bfqq); -+ - /* - * An unplug may trigger a requeue of a request from the device - * driver: make sure we are in process context while trying to -@@ -4420,6 +4482,12 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) - */ - new_bfqq->allocated++; - bfqq->allocated--; -+ bfq_log_bfqq(bfqd, bfqq, -+ "insert_request: new allocated %d", bfqq->allocated); -+ bfq_log_bfqq(bfqd, new_bfqq, -+ "insert_request: new_bfqq new allocated %d", -+ bfqq->allocated); -+ - new_bfqq->ref++; - bfq_clear_bfqq_just_created(bfqq); - if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) -@@ -4529,6 +4597,10 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) - bfqd->rq_in_driver--; - bfqq->dispatched--; - -+ bfq_log_bfqq(bfqd, bfqq, -+ "completed_requests: new disp %d, new rq_in_driver %d", -+ bfqq->dispatched, bfqd->rq_in_driver); -+ - 
if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { - BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); - /* -@@ -4618,6 +4690,9 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) - - static void bfq_put_rq_priv_body(struct bfq_queue *bfqq) - { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "put_request_body: allocated %d", bfqq->allocated); -+ BUG_ON(!bfqq->allocated); - bfqq->allocated--; - - bfq_put_queue(bfqq); -@@ -4625,8 +4700,27 @@ static void bfq_put_rq_priv_body(struct bfq_queue *bfqq) - - static void bfq_put_rq_private(struct request_queue *q, struct request *rq) - { -- struct bfq_queue *bfqq = RQ_BFQQ(rq); -- struct bfq_data *bfqd = bfqq->bfqd; -+ struct bfq_queue *bfqq; -+ struct bfq_data *bfqd; -+ struct bfq_io_cq *bic; -+ -+ BUG_ON(!rq); -+ bfqq = RQ_BFQQ(rq); -+ BUG_ON(!bfqq); -+ -+ bic = RQ_BIC(rq); -+ BUG_ON(!bic); -+ -+ bfqd = bfqq->bfqd; -+ BUG_ON(!bfqd); -+ -+ BUG_ON(rq->rq_flags & RQF_QUEUED); -+ BUG_ON(!(rq->rq_flags & RQF_ELVPRIV)); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "putting rq %p with %u sects left, STARTED %d", -+ rq, blk_rq_sectors(rq), -+ rq->rq_flags & RQF_STARTED); - - if (rq->rq_flags & RQF_STARTED) - bfqg_stats_update_completion(bfqq_group(bfqq), -@@ -4634,6 +4728,8 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq) - rq_io_start_time_ns(rq), - rq->cmd_flags); - -+ BUG_ON(blk_rq_sectors(rq) == 0 && !(rq->rq_flags & RQF_STARTED)); -+ - if (likely(rq->rq_flags & RQF_STARTED)) { - unsigned long flags; - -@@ -4655,7 +4751,9 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq) - * cause any deadlock, even if other locks are already - * (correctly) held by this process. 
- */ -+ BUG_ON(in_interrupt()); - -+ assert_spin_locked(&bfqd->lock); - if (!RB_EMPTY_NODE(&rq->rb_node)) - bfq_remove_request(q, rq); - bfq_put_rq_priv_body(bfqq); -@@ -4814,7 +4912,9 @@ static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) - enum bfqq_expiration reason; - unsigned long flags; - -+ BUG_ON(!bfqd); - spin_lock_irqsave(&bfqd->lock, flags); -+ bfq_log_bfqq(bfqd, bfqq, "handling slice_timer expiration"); - bfq_clear_bfqq_wait_request(bfqq); - - if (bfqq != bfqd->in_service_queue) { -@@ -4857,6 +4957,8 @@ static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) - idle_slice_timer); - struct bfq_queue *bfqq = bfqd->in_service_queue; - -+ bfq_log(bfqd, "slice_timer expired"); -+ - /* - * Theoretical race here: the in-service queue can be NULL or - * different from the queue that was idling if a new request -@@ -4909,9 +5011,12 @@ static void bfq_exit_queue(struct elevator_queue *e) - struct bfq_data *bfqd = e->elevator_data; - struct bfq_queue *bfqq, *n; - -+ bfq_log(bfqd, "exit_queue: starting ..."); -+ - hrtimer_cancel(&bfqd->idle_slice_timer); - - BUG_ON(bfqd->in_service_queue); -+ BUG_ON(!list_empty(&bfqd->active_list)); - - list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) { - if (bfqq->bic) /* bfqqs without bic are handled below */ -@@ -4943,6 +5048,7 @@ static void bfq_exit_queue(struct elevator_queue *e) - spin_unlock_irq(&bfqd->lock); - #endif - -+ bfq_log(bfqd, "exit_queue: finished ..."); - kfree(bfqd); - } - - -From 7f59486861e368d25f59d4136cf8e51a75b7edf9 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Thu, 9 Feb 2017 10:36:27 +0100 -Subject: [PATCH 14/51] Add lock check in bfq_allow_bio_merge - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 40eadb3f7073..21b876aeba16 100644 ---- a/block/bfq-mq-iosched.c -+++ 
b/block/bfq-mq-iosched.c -@@ -2279,6 +2279,7 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, - if (!bic) - return false; - -+ assert_spin_locked(&bfqd->lock); - bfqq = bic_to_bfqq(bic, is_sync); - /* - * We take advantage of this function to perform an early merge - -From a2dd19a4d95cf401268c144c79ce549c7fc4bbca Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Tue, 7 Feb 2017 15:14:29 +0100 -Subject: [PATCH 15/51] bfq-mq: execute exit_icq operations immediately - -Exploting Omar's patch that removes the taking of the queue lock in -put_io_context_active, this patch moves back the operation of the bfq_exit_icq -hook from a deferred work to the body of the function. - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 34 +++------------------------------- - block/bfq-mq.h | 3 --- - 2 files changed, 3 insertions(+), 34 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 21b876aeba16..1deb79a47181 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -4080,28 +4080,13 @@ static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) - } - } - --static void bfq_exit_icq_body(struct work_struct *work) --{ -- struct bfq_io_cq *bic = -- container_of(work, struct bfq_io_cq, exit_icq_work); -- -- bfq_exit_icq_bfqq(bic, true); -- bfq_exit_icq_bfqq(bic, false); --} -- --static void bfq_init_icq(struct io_cq *icq) --{ -- struct bfq_io_cq *bic = icq_to_bic(icq); -- -- INIT_WORK(&bic->exit_icq_work, bfq_exit_icq_body); --} -- - static void bfq_exit_icq(struct io_cq *icq) - { - struct bfq_io_cq *bic = icq_to_bic(icq); - - BUG_ON(!bic); -- kblockd_schedule_work(&bic->exit_icq_work); -+ bfq_exit_icq_bfqq(bic, true); -+ bfq_exit_icq_bfqq(bic, false); - } - - /* -@@ -5019,21 +5004,9 @@ static void bfq_exit_queue(struct elevator_queue *e) - BUG_ON(bfqd->in_service_queue); - BUG_ON(!list_empty(&bfqd->active_list)); - -- 
list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) { -- if (bfqq->bic) /* bfqqs without bic are handled below */ -- cancel_work_sync(&bfqq->bic->exit_icq_work); -- } -- - spin_lock_irq(&bfqd->lock); -- list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) { -+ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) - bfq_deactivate_bfqq(bfqd, bfqq, false, false); -- /* -- * Make sure that deferred exit_icq_work completes -- * without errors for bfq_queues without bic -- */ -- if (!bfqq->bic) -- bfqq->bfqd = NULL; -- } - spin_unlock_irq(&bfqd->lock); - - hrtimer_cancel(&bfqd->idle_slice_timer); -@@ -5471,7 +5444,6 @@ static struct elevator_type iosched_bfq_mq = { - .ops.mq = { - .get_rq_priv = bfq_get_rq_private, - .put_rq_priv = bfq_put_rq_private, -- .init_icq = bfq_init_icq, - .exit_icq = bfq_exit_icq, - .insert_requests = bfq_insert_requests, - .dispatch_request = bfq_dispatch_request, -diff --git a/block/bfq-mq.h b/block/bfq-mq.h -index c3fcd5ebd735..23744b246db6 100644 ---- a/block/bfq-mq.h -+++ b/block/bfq-mq.h -@@ -356,9 +356,6 @@ struct bfq_io_cq { - uint64_t blkcg_serial_nr; /* the current blkcg serial */ - #endif - -- /* delayed work to exec the body of the the exit_icq handler */ -- struct work_struct exit_icq_work; -- - /* - * Snapshot of the has_short_time flag before merging; taken - * to remember its value while the queue is merged, so as to - -From ab7e78a0ff095101de74e700f8743295a500bb20 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Tue, 21 Feb 2017 10:26:22 +0100 -Subject: [PATCH 16/51] Unnest request-queue and ioc locks from scheduler locks - -In some bio-merging functions, the request-queue lock needs to be -taken, to lookup for the bic associated with the process that issued -the bio that may need to be merged. In addition, put_io_context must -be invoked in some other functions, and put_io_context may cause the -lock of the involved ioc to be taken. 
In both cases, these extra -request-queue or ioc locks are taken, or might be taken, while the -scheduler lock is being held. In this respect, there are other code -paths, in part external to bfq-mq, in which the same locks are taken -(nested) in the opposite order, i.e., it is the scheduler lock to be -taken while the request-queue or the ioc lock is being held. This -leads to circular deadlocks. - -This commit addresses this issue by modifying the logic of the above -functions, so as to let the lookup and put_io_context be performed, -and thus the extra locks be taken, outside the critical sections -protected by the scheduler lock. - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-cgroup-included.c | 9 ++ - block/bfq-mq-iosched.c | 264 ++++++++++++++++++++++++++++---------------- - block/bfq-mq.h | 25 ++++- - block/bfq-sched.c | 11 ++ - 4 files changed, 213 insertions(+), 96 deletions(-) - -diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c -index 8a73de76f32b..cf59eeb7f08e 100644 ---- a/block/bfq-cgroup-included.c -+++ b/block/bfq-cgroup-included.c -@@ -716,6 +716,9 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) - struct bfq_group *bfqg; - struct bfq_data *bfqd; - struct bfq_entity *entity; -+#ifdef BFQ_MQ -+ unsigned long flags; -+#endif - int i; - - BUG_ON(!pd); -@@ -729,6 +732,9 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) - if (!entity) /* root group */ - return; - -+#ifdef BFQ_MQ -+ spin_lock_irqsave(&bfqd->lock, flags); -+#endif - /* - * Empty all service_trees belonging to this group before - * deactivating the group itself. -@@ -766,6 +772,9 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) - __bfq_deactivate_entity(entity, false); - bfq_put_async_queues(bfqd, bfqg); - -+#ifdef BFQ_MQ -+ bfq_unlock_put_ioc_restore(bfqd, flags); -+#endif - /* - * @blkg is going offline and will be ignored by - * blkg_[rw]stat_recursive_sum(). 
Transfer stats to the parent so -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 1deb79a47181..69ef3761c95d 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -233,6 +233,7 @@ static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, - return NULL; - } - -+#define BFQ_MQ - #include "bfq-sched.c" - #include "bfq-cgroup-included.c" - -@@ -1564,15 +1565,9 @@ static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, - struct bio *bio, - struct request_queue *q) - { -- struct task_struct *tsk = current; -- struct bfq_io_cq *bic; -- struct bfq_queue *bfqq; -+ struct bfq_queue *bfqq = bfqd->bio_bfqq; - -- bic = bfq_bic_lookup(bfqd, tsk->io_context, q); -- if (!bic) -- return NULL; - -- bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); - if (bfqq) - return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); - -@@ -1693,9 +1688,26 @@ static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) - struct request_queue *q = hctx->queue; - struct bfq_data *bfqd = q->elevator->elevator_data; - struct request *free = NULL; -+ /* -+ * bfq_bic_lookup grabs the queue_lock: invoke it now and -+ * store its return value for later use, to avoid nesting -+ * queue_lock inside the bfqd->lock. We assume that the bic -+ * returned by bfq_bic_lookup does not go away before -+ * bfqd->lock is taken. 
-+ */ -+ struct bfq_io_cq *bic = bfq_bic_lookup(bfqd, current->io_context, q); - bool ret; - - spin_lock_irq(&bfqd->lock); -+ -+ if (bic) -+ bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); -+ else -+ bfqd->bio_bfqq = NULL; -+ bfqd->bio_bic = bic; -+ /* Set next flag just for testing purposes */ -+ bfqd->bio_bfqq_set = true; -+ - ret = blk_mq_sched_try_merge(q, bio, &free); - - /* -@@ -1706,6 +1718,7 @@ static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) - */ - if (free) - blk_mq_free_request(free); -+ bfqd->bio_bfqq_set = false; - spin_unlock_irq(&bfqd->lock); - - return ret; -@@ -2261,8 +2274,7 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, - { - struct bfq_data *bfqd = q->elevator->elevator_data; - bool is_sync = op_is_sync(bio->bi_opf); -- struct bfq_io_cq *bic; -- struct bfq_queue *bfqq, *new_bfqq; -+ struct bfq_queue *bfqq = bfqd->bio_bfqq, *new_bfqq; - - /* - * Disallow merge of a sync bio into an async request. -@@ -2273,31 +2285,40 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, - /* - * Lookup the bfqq that this bio will be queued with. Allow - * merge only if rq is queued there. -- * Queue lock is held here. - */ -- bic = bfq_bic_lookup(bfqd, current->io_context, q); -- if (!bic) -+ if (!bfqq) - return false; - -- assert_spin_locked(&bfqd->lock); -- bfqq = bic_to_bfqq(bic, is_sync); - /* - * We take advantage of this function to perform an early merge - * of the queues of possible cooperating processes. - */ -- if (bfqq) { -- new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); -- if (new_bfqq) { -- bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); -- /* -- * If we get here, the bio will be queued in the -- * shared queue, i.e., new_bfqq, so use new_bfqq -- * to decide whether bio and rq can be merged. 
-- */ -- bfqq = new_bfqq; -- } -- } -+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); -+ if (new_bfqq) { -+ /* -+ * bic still points to bfqq, then it has not yet been -+ * redirected to some other bfq_queue, and a queue -+ * merge beween bfqq and new_bfqq can be safely -+ * fulfillled, i.e., bic can be redirected to new_bfqq -+ * and bfqq can be put. -+ */ -+ bfq_merge_bfqqs(bfqd, bfqd->bio_bic, bfqq, -+ new_bfqq); -+ /* -+ * If we get here, bio will be queued into new_queue, -+ * so use new_bfqq to decide whether bio and rq can be -+ * merged. -+ */ -+ bfqq = new_bfqq; - -+ /* -+ * Change also bqfd->bio_bfqq, as -+ * bfqd->bio_bic now points to new_bfqq, and -+ * this function may be invoked again (and then may -+ * use again bqfd->bio_bfqq). -+ */ -+ bfqd->bio_bfqq = bfqq; -+ } - return bfqq == RQ_BFQQ(rq); - } - -@@ -3965,14 +3986,43 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - return rq; - } - -+/* -+ * Next two functions release bfqd->lock and put the io context -+ * pointed by bfqd->ioc_to_put. This delayed put is used to not risk -+ * to take an ioc->lock while the scheduler lock is being held. 
-+ */ -+static void bfq_unlock_put_ioc(struct bfq_data *bfqd) -+{ -+ struct io_context *ioc_to_put = bfqd->ioc_to_put; -+ -+ bfqd->ioc_to_put = NULL; -+ spin_unlock_irq(&bfqd->lock); -+ -+ if (ioc_to_put) -+ put_io_context(ioc_to_put); -+} -+ -+static void bfq_unlock_put_ioc_restore(struct bfq_data *bfqd, -+ unsigned long flags) -+{ -+ struct io_context *ioc_to_put = bfqd->ioc_to_put; -+ -+ bfqd->ioc_to_put = NULL; -+ spin_unlock_irqrestore(&bfqd->lock, flags); -+ -+ if (ioc_to_put) -+ put_io_context(ioc_to_put); -+} -+ - static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - { - struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; - struct request *rq; - - spin_lock_irq(&bfqd->lock); -+ - rq = __bfq_dispatch_request(hctx); -- spin_unlock_irq(&bfqd->lock); -+ bfq_unlock_put_ioc(bfqd); - - return rq; - } -@@ -3981,7 +4031,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - * Task holds one reference to the queue, dropped when task exits. Each rq - * in-flight on this queue also holds a reference, dropped when rq is freed. - * -- * Queue lock must be held here. Recall not to use bfqq after calling -+ * Scheduler lock must be held here. Recall not to use bfqq after calling - * this function on it. - */ - static void bfq_put_queue(struct bfq_queue *bfqq) -@@ -4066,17 +4116,23 @@ static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) - bfqd = bfqq->bfqd; /* NULL if scheduler already exited */ - - if (bfqq && bfqd) { -- spin_lock_irq(&bfqd->lock); -+ unsigned long flags; -+ -+ spin_lock_irqsave(&bfqd->lock, flags); - /* -- * If the bic is using a shared queue, put the reference -- * taken on the io_context when the bic started using a -- * shared bfq_queue. -+ * If the bic is using a shared queue, put the -+ * reference taken on the io_context when the bic -+ * started using a shared bfq_queue. 
This put cannot -+ * make ioc->ref_count reach 0, then no ioc->lock -+ * risks to be taken (leading to possible deadlock -+ * scenarios). - */ - if (is_sync && bfq_bfqq_coop(bfqq)) - put_io_context(bic->icq.ioc); -+ - bfq_exit_bfqq(bfqd, bfqq); - bic_set_bfqq(bic, NULL, is_sync); -- spin_unlock_irq(&bfqd->lock); -+ bfq_unlock_put_ioc_restore(bfqd, flags); - } - } - -@@ -4183,8 +4239,6 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, - INIT_HLIST_NODE(&bfqq->burst_list_node); - BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); - -- spin_lock_init(&bfqq->lock); -- - bfqq->ref = 0; - bfqq->bfqd = bfqd; - -@@ -4476,6 +4530,14 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) - - new_bfqq->ref++; - bfq_clear_bfqq_just_created(bfqq); -+ /* -+ * If the bic associated with the process -+ * issuing this request still points to bfqq -+ * (and thus has not been already redirected -+ * to new_bfqq or even some other bfq_queue), -+ * then complete the merge and redirect it to -+ * new_bfqq. 
-+ */ - if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) - bfq_merge_bfqqs(bfqd, RQ_BIC(rq), - bfqq, new_bfqq); -@@ -4498,14 +4560,17 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) - } - - static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, -- bool at_head) -+ bool at_head) - { - struct request_queue *q = hctx->queue; - struct bfq_data *bfqd = q->elevator->elevator_data; - - spin_lock_irq(&bfqd->lock); -- if (blk_mq_sched_try_insert_merge(q, rq)) -- goto done; -+ if (blk_mq_sched_try_insert_merge(q, rq)) { -+ spin_unlock_irq(&bfqd->lock); -+ return; -+ } -+ - spin_unlock_irq(&bfqd->lock); - - blk_mq_sched_request_inserted(rq); -@@ -4530,8 +4595,8 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - q->last_merge = rq; - } - } --done: -- spin_unlock_irq(&bfqd->lock); -+ -+ bfq_unlock_put_ioc(bfqd); - } - - static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, -@@ -4724,7 +4789,7 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq) - bfq_completed_request(bfqq, bfqd); - bfq_put_rq_priv_body(bfqq); - -- spin_unlock_irqrestore(&bfqd->lock, flags); -+ bfq_unlock_put_ioc_restore(bfqd, flags); - } else { - /* - * Request rq may be still/already in the scheduler, -@@ -4732,10 +4797,10 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq) - * defer such a check and removal, to avoid - * inconsistencies in the time interval from the end - * of this function to the start of the deferred work. -- * Fortunately, this situation occurs only in process -- * context, so taking the scheduler lock does not -- * cause any deadlock, even if other locks are already -- * (correctly) held by this process. -+ * This situation seems to occur only in process -+ * context, as a consequence of a merge. In the -+ * current version of the code, this implies that the -+ * lock is held. 
- */ - BUG_ON(in_interrupt()); - -@@ -4758,8 +4823,6 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) - { - bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); - -- put_io_context(bic->icq.ioc); -- - if (bfqq_process_refs(bfqq) == 1) { - bfqq->pid = current->pid; - bfq_clear_bfqq_coop(bfqq); -@@ -4775,6 +4838,41 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) - return NULL; - } - -+static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, -+ struct bfq_io_cq *bic, -+ struct bio *bio, -+ bool split, bool is_sync, -+ bool *new_queue) -+{ -+ struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync); -+ -+ if (likely(bfqq && bfqq != &bfqd->oom_bfqq)) -+ return bfqq; -+ -+ if (new_queue) -+ *new_queue = true; -+ -+ if (bfqq) -+ bfq_put_queue(bfqq); -+ bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); -+ -+ bic_set_bfqq(bic, bfqq, is_sync); -+ if (split && is_sync) { -+ if ((bic->was_in_burst_list && bfqd->large_burst) || -+ bic->saved_in_large_burst) -+ bfq_mark_bfqq_in_large_burst(bfqq); -+ else { -+ bfq_clear_bfqq_in_large_burst(bfqq); -+ if (bic->was_in_burst_list) -+ hlist_add_head(&bfqq->burst_list_node, -+ &bfqd->burst_list); -+ } -+ bfqq->split_time = jiffies; -+ } -+ -+ return bfqq; -+} -+ - /* - * Allocate bfq data structures associated with this request. 
- */ -@@ -4786,6 +4884,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - const int is_sync = rq_is_sync(rq); - struct bfq_queue *bfqq; - bool bfqq_already_existing = false, split = false; -+ bool new_queue = false; - - spin_lock_irq(&bfqd->lock); - -@@ -4796,42 +4895,10 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - - bfq_bic_update_cgroup(bic, bio); - --new_queue: -- bfqq = bic_to_bfqq(bic, is_sync); -- if (!bfqq || bfqq == &bfqd->oom_bfqq) { -- if (bfqq) -- bfq_put_queue(bfqq); -- bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); -- BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); -+ bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync, -+ &new_queue); - -- bic_set_bfqq(bic, bfqq, is_sync); -- if (split && is_sync) { -- bfq_log_bfqq(bfqd, bfqq, -- "get_request: was_in_list %d " -- "was_in_large_burst %d " -- "large burst in progress %d", -- bic->was_in_burst_list, -- bic->saved_in_large_burst, -- bfqd->large_burst); -- -- if ((bic->was_in_burst_list && bfqd->large_burst) || -- bic->saved_in_large_burst) { -- bfq_log_bfqq(bfqd, bfqq, -- "get_request: marking in " -- "large burst"); -- bfq_mark_bfqq_in_large_burst(bfqq); -- } else { -- bfq_log_bfqq(bfqd, bfqq, -- "get_request: clearing in " -- "large burst"); -- bfq_clear_bfqq_in_large_burst(bfqq); -- if (bic->was_in_burst_list) -- hlist_add_head(&bfqq->burst_list_node, -- &bfqd->burst_list); -- } -- bfqq->split_time = jiffies; -- } -- } else { -+ if (unlikely(!new_queue)) { - /* If the queue was seeky for too long, break it apart. */ - if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { - bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); -@@ -4841,9 +4908,19 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - bic->saved_in_large_burst = true; - - bfqq = bfq_split_bfqq(bic, bfqq); -- split = true; -+ /* -+ * A reference to bic->icq.ioc needs to be -+ * released after a queue split. 
Do not do it -+ * immediately, to not risk to possibly take -+ * an ioc->lock while holding the scheduler -+ * lock. -+ */ -+ bfqd->ioc_to_put = bic->icq.ioc; -+ - if (!bfqq) -- goto new_queue; -+ bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, -+ true, is_sync, -+ NULL); - else - bfqq_already_existing = true; - } -@@ -4861,18 +4938,17 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - - /* - * If a bfq_queue has only one process reference, it is owned -- * by only one bfq_io_cq: we can set the bic field of the -- * bfq_queue to the address of that structure. Also, if the -- * queue has just been split, mark a flag so that the -- * information is available to the other scheduler hooks. -+ * by only this bic: we can then set bfqq->bic = bic. in -+ * addition, if the queue has also just been split, we have to -+ * resume its state. - */ - if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { - bfqq->bic = bic; -- if (split) { -+ if (bfqd->ioc_to_put) { /* if true, then there has been a split */ - /* -- * If the queue has just been split from a shared -- * queue, restore the idle window and the possible -- * weight raising period. -+ * The queue has just been split from a shared -+ * queue: restore the idle window and the -+ * possible weight raising period. 
- */ - bfq_bfqq_resume_state(bfqq, bfqd, bic, - bfqq_already_existing); -@@ -4882,7 +4958,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - if (unlikely(bfq_bfqq_just_created(bfqq))) - bfq_handle_burst(bfqd, bfqq); - -- spin_unlock_irq(&bfqd->lock); -+ bfq_unlock_put_ioc(bfqd); - - return 0; - -@@ -4929,7 +5005,7 @@ static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) - bfq_bfqq_expire(bfqd, bfqq, true, reason); - - schedule_dispatch: -- spin_unlock_irqrestore(&bfqd->lock, flags); -+ bfq_unlock_put_ioc_restore(bfqd, flags); - bfq_schedule_dispatch(bfqd); - } - -diff --git a/block/bfq-mq.h b/block/bfq-mq.h -index 23744b246db6..bd83f1c02573 100644 ---- a/block/bfq-mq.h -+++ b/block/bfq-mq.h -@@ -338,8 +338,6 @@ struct bfq_queue { - unsigned long wr_start_at_switch_to_srt; - - unsigned long split_time; /* time of last split */ -- -- spinlock_t lock; - }; - - /** -@@ -609,6 +607,29 @@ struct bfq_data { - struct bfq_queue oom_bfqq; - - spinlock_t lock; -+ -+ /* -+ * bic associated with the task issuing current bio for -+ * merging. This and the next field are used as a support to -+ * be able to perform the bic lookup, needed by bio-merge -+ * functions, before the scheduler lock is taken, and thus -+ * avoid taking the request-queue lock while the scheduler -+ * lock is being held. -+ */ -+ struct bfq_io_cq *bio_bic; -+ /* bfqq associated with the task issuing current bio for merging */ -+ struct bfq_queue *bio_bfqq; -+ /* Extra flag used only for TESTING */ -+ bool bio_bfqq_set; -+ -+ /* -+ * io context to put right after bfqd->lock is released. This -+ * filed is used to perform put_io_context, when needed, to -+ * after the scheduler lock has been released, and thus -+ * prevent an ioc->lock from being possibly taken while the -+ * scheduler lock is being held. 
-+ */ -+ struct io_context *ioc_to_put; - }; - - enum bfqq_state_flags { -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index b54a638186e3..a5c8b4acd33c 100644 ---- a/block/bfq-sched.c -+++ b/block/bfq-sched.c -@@ -1905,7 +1905,18 @@ static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) - struct bfq_entity *entity = in_serv_entity; - - if (bfqd->in_service_bic) { -+#ifdef BFQ_MQ -+ /* -+ * Schedule the release of a reference to -+ * bfqd->in_service_bic->icq.ioc to right after the -+ * scheduler lock is released. This ioc is not -+ * released immediately, to not risk to possibly take -+ * an ioc->lock while holding the scheduler lock. -+ */ -+ bfqd->ioc_to_put = bfqd->in_service_bic->icq.ioc; -+#else - put_io_context(bfqd->in_service_bic->icq.ioc); -+#endif - bfqd->in_service_bic = NULL; - } - - -From 84cc7140cb4f0574710625f51abbb076a1dd2920 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Fri, 3 Mar 2017 09:31:14 +0100 -Subject: [PATCH 17/51] Add checks and extra log messages - Part II - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 42 ++++++++++++++++++++++++++++++++++++++++-- - block/bfq-sched.c | 1 + - 2 files changed, 41 insertions(+), 2 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 69ef3761c95d..5707d42b160d 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -1567,6 +1567,7 @@ static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, - { - struct bfq_queue *bfqq = bfqd->bio_bfqq; - -+ BUG_ON(!bfqd->bio_bfqq_set); - - if (bfqq) - return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); -@@ -1719,6 +1720,7 @@ static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) - if (free) - blk_mq_free_request(free); - bfqd->bio_bfqq_set = false; -+ BUG_ON(bfqd->ioc_to_put); - spin_unlock_irq(&bfqd->lock); - - return ret; -@@ -1781,6 +1783,7 @@ static void bfq_request_merged(struct request_queue *q, struct 
request *req, - bfq_updated_next_req(bfqd, bfqq); - bfq_pos_tree_add_move(bfqd, bfqq); - } -+ BUG_ON(bfqd->ioc_to_put); - spin_unlock_irq(&bfqd->lock); - } - } -@@ -1824,6 +1827,7 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, - - bfq_remove_request(q, next); - -+ BUG_ON(bfqq->bfqd->ioc_to_put); - spin_unlock_irq(&bfqq->bfqd->lock); - end: - bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); -@@ -2195,9 +2199,11 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, - { - bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", - (unsigned long) new_bfqq->pid); -+ BUG_ON(bfqq->bic && bfqq->bic == new_bfqq->bic); - /* Save weight raising and idle window of the merged queues */ - bfq_bfqq_save_state(bfqq); - bfq_bfqq_save_state(new_bfqq); -+ - if (bfq_bfqq_IO_bound(bfqq)) - bfq_mark_bfqq_IO_bound(new_bfqq); - bfq_clear_bfqq_IO_bound(bfqq); -@@ -2276,6 +2282,7 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, - bool is_sync = op_is_sync(bio->bi_opf); - struct bfq_queue *bfqq = bfqd->bio_bfqq, *new_bfqq; - -+ assert_spin_locked(&bfqd->lock); - /* - * Disallow merge of a sync bio into an async request. - */ -@@ -2286,6 +2293,7 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, - * Lookup the bfqq that this bio will be queued with. Allow - * merge only if rq is queued there. - */ -+ BUG_ON(!bfqd->bio_bfqq_set); - if (!bfqq) - return false; - -@@ -2294,6 +2302,7 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, - * of the queues of possible cooperating processes. 
- */ - new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); -+ BUG_ON(new_bfqq == bfqq); - if (new_bfqq) { - /* - * bic still points to bfqq, then it has not yet been -@@ -4040,6 +4049,8 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - struct bfq_group *bfqg = bfqq_group(bfqq); - #endif - -+ assert_spin_locked(&bfqq->bfqd->lock); -+ - BUG_ON(bfqq->ref <= 0); - - if (bfqq->bfqd) -@@ -4119,6 +4130,7 @@ static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) - unsigned long flags; - - spin_lock_irqsave(&bfqd->lock, flags); -+ BUG_ON(bfqd->ioc_to_put); - /* - * If the bic is using a shared queue, put the - * reference taken on the io_context when the bic -@@ -4567,10 +4579,12 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - - spin_lock_irq(&bfqd->lock); - if (blk_mq_sched_try_insert_merge(q, rq)) { -+ BUG_ON(bfqd->ioc_to_put); - spin_unlock_irq(&bfqd->lock); - return; - } - -+ BUG_ON(bfqd->ioc_to_put); - spin_unlock_irq(&bfqd->lock); - - blk_mq_sched_request_inserted(rq); -@@ -4785,6 +4799,7 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq) - unsigned long flags; - - spin_lock_irqsave(&bfqd->lock, flags); -+ BUG_ON(bfqd->ioc_to_put); - - bfq_completed_request(bfqq, bfqd); - bfq_put_rq_priv_body(bfqq); -@@ -4855,13 +4870,28 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, - if (bfqq) - bfq_put_queue(bfqq); - bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); -+ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); - - bic_set_bfqq(bic, bfqq, is_sync); - if (split && is_sync) { -+ bfq_log_bfqq(bfqd, bfqq, -+ "get_request: was_in_list %d " -+ "was_in_large_burst %d " -+ "large burst in progress %d", -+ bic->was_in_burst_list, -+ bic->saved_in_large_burst, -+ bfqd->large_burst); -+ - if ((bic->was_in_burst_list && bfqd->large_burst) || -- bic->saved_in_large_burst) -+ bic->saved_in_large_burst) { -+ bfq_log_bfqq(bfqd, bfqq, -+ "get_request: marking in " -+ "large 
burst"); - bfq_mark_bfqq_in_large_burst(bfqq); -- else { -+ } else { -+ bfq_log_bfqq(bfqd, bfqq, -+ "get_request: clearing in " -+ "large burst"); - bfq_clear_bfqq_in_large_burst(bfqq); - if (bic->was_in_burst_list) - hlist_add_head(&bfqq->burst_list_node, -@@ -4897,10 +4927,12 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - - bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync, - &new_queue); -+ BUG_ON(bfqd->ioc_to_put); - - if (unlikely(!new_queue)) { - /* If the queue was seeky for too long, break it apart. */ - if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { -+ BUG_ON(!is_sync); - bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); - - /* Update bic before losing reference to bfqq */ -@@ -4923,6 +4955,9 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - NULL); - else - bfqq_already_existing = true; -+ -+ BUG_ON(!bfqq); -+ BUG_ON(bfqq == &bfqd->oom_bfqq); - } - } - -@@ -4976,6 +5011,8 @@ static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) - - BUG_ON(!bfqd); - spin_lock_irqsave(&bfqd->lock, flags); -+ BUG_ON(bfqd->ioc_to_put); -+ - bfq_log_bfqq(bfqd, bfqq, "handling slice_timer expiration"); - bfq_clear_bfqq_wait_request(bfqq); - -@@ -5083,6 +5120,7 @@ static void bfq_exit_queue(struct elevator_queue *e) - spin_lock_irq(&bfqd->lock); - list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) - bfq_deactivate_bfqq(bfqd, bfqq, false, false); -+ BUG_ON(bfqd->ioc_to_put); - spin_unlock_irq(&bfqd->lock); - - hrtimer_cancel(&bfqd->idle_slice_timer); -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index a5c8b4acd33c..85e59eeb3569 100644 ---- a/block/bfq-sched.c -+++ b/block/bfq-sched.c -@@ -1906,6 +1906,7 @@ static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) - - if (bfqd->in_service_bic) { - #ifdef BFQ_MQ -+ BUG_ON(bfqd->ioc_to_put); - /* - * Schedule the release of a reference to - * bfqd->in_service_bic->icq.ioc to right after the - -From 
3d54cb804f1db2e08ce4a6cc335868538542f587 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Wed, 22 Feb 2017 11:30:01 +0100 -Subject: [PATCH 18/51] Fix unbalanced increment of rq_in_driver - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 52 +++++++++++++++++++++++++++++++++++++++++--------- - 1 file changed, 43 insertions(+), 9 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 5707d42b160d..9cbcb8d43d81 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -3936,9 +3936,45 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - rq = list_first_entry(&bfqd->dispatch, struct request, - queuelist); - list_del_init(&rq->queuelist); -+ - bfq_log(bfqd, - "dispatch requests: picked %p from dispatch list", rq); -- goto exit; -+ bfqq = RQ_BFQQ(rq); -+ -+ if (bfqq) { -+ /* -+ * Increment counters here, because this -+ * dispatch does not follow the standard -+ * dispatch flow (where counters are -+ * incremented) -+ */ -+ bfqq->dispatched++; -+ -+ goto inc_in_driver_start_rq; -+ } -+ -+ /* -+ * We exploit the put_rq_private hook to decrement -+ * rq_in_driver, but put_rq_private will not be -+ * invoked on this request. So, to avoid unbalance, -+ * just start this request, without incrementing -+ * rq_in_driver. As a negative consequence, -+ * rq_in_driver is deceptively lower than it should be -+ * while this request is in service. This may cause -+ * bfq_schedule_dispatch to be invoked uselessly. -+ * -+ * As for implementing an exact solution, the -+ * put_request hook, if defined, is probably invoked -+ * also on this request. So, by exploiting this hook, -+ * we could 1) increment rq_in_driver here, and 2) -+ * decrement it in put_request. Such a solution would -+ * let the value of the counter be always accurate, -+ * but it would entail using an extra interface -+ * function. 
This cost seems higher than the benefit, -+ * being the frequency of non-elevator-private -+ * requests very low. -+ */ -+ goto start_rq; - } - - bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); -@@ -3973,10 +4009,12 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - - BUG_ON(bfqq->next_rq == NULL && - bfqq->entity.budget < bfqq->entity.service); --exit: -+ - if (rq) { -- rq->rq_flags |= RQF_STARTED; -+ inc_in_driver_start_rq: - bfqd->rq_in_driver++; -+ start_rq: -+ rq->rq_flags |= RQF_STARTED; - if (bfqq) - bfq_log_bfqq(bfqd, bfqq, - "dispatched %s request %p, rq_in_driver %d", -@@ -3992,6 +4030,7 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - "returned NULL request, rq_in_driver %d", - bfqd->rq_in_driver); - -+exit: - return rq; - } - -@@ -4591,15 +4630,10 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - - spin_lock_irq(&bfqd->lock); - if (at_head || blk_rq_is_passthrough(rq)) { -- struct bfq_queue *bfqq = RQ_BFQQ(rq); -- - if (at_head) - list_add(&rq->queuelist, &bfqd->dispatch); - else - list_add_tail(&rq->queuelist, &bfqd->dispatch); -- -- if (bfqq) -- bfqq->dispatched++; - } else { - __bfq_insert_request(bfqd, rq); - -@@ -4966,7 +5000,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - "get_request: new allocated %d", bfqq->allocated); - - bfqq->ref++; -- bfq_log_bfqq(bfqd, bfqq, "get_request: bfqq %p, %d", bfqq, bfqq->ref); -+ bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d", rq, bfqq, bfqq->ref); - - rq->elv.priv[0] = bic; - rq->elv.priv[1] = bfqq; - -From 7ba977d696b239569b4cd233aebc99e136ecf487 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Fri, 3 Mar 2017 09:39:35 +0100 -Subject: [PATCH 19/51] Add checks and extra log messages - Part III - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 11 +++++++++++ - 1 file changed, 11 
insertions(+) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 9cbcb8d43d81..24b529a2edc7 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -4630,10 +4630,21 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - - spin_lock_irq(&bfqd->lock); - if (at_head || blk_rq_is_passthrough(rq)) { -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ - if (at_head) - list_add(&rq->queuelist, &bfqd->dispatch); - else - list_add_tail(&rq->queuelist, &bfqd->dispatch); -+ -+ if (bfqq) -+ bfq_log_bfqq(bfqd, bfqq, -+ "insert_request %p in disp: at_head %d", -+ rq, at_head); -+ else -+ bfq_log(bfqd, -+ "insert_request %p in disp: at_head %d", -+ rq, at_head); - } else { - __bfq_insert_request(bfqd, rq); - - -From c94e47b2908600b8ba89f84b0ac7febddd313141 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Fri, 17 Feb 2017 14:28:02 +0100 -Subject: [PATCH 20/51] TESTING: Check wrong invocation of merge and - put_rq_priv functions - -Check that merge functions are not invoked on requests queued in the -dispatch queue, and that neither put_rq_private is invoked on these -requests if, in addition, they have not passed through get_rq_private. 
- -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 22 ++++++++++++++++++++++ - include/linux/blkdev.h | 2 ++ - 2 files changed, 24 insertions(+) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 24b529a2edc7..b4d40bb712d2 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -1746,6 +1746,8 @@ static int bfq_request_merge(struct request_queue *q, struct request **req, - static void bfq_request_merged(struct request_queue *q, struct request *req, - enum elv_merge type) - { -+ BUG_ON(req->rq_flags & RQF_DISP_LIST); -+ - if (type == ELEVATOR_FRONT_MERGE && - rb_prev(&req->rb_node) && - blk_rq_pos(req) < -@@ -1795,6 +1797,8 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, - - BUG_ON(!RQ_BFQQ(rq)); - BUG_ON(!RQ_BFQQ(next)); -+ BUG_ON(rq->rq_flags & RQF_DISP_LIST); -+ BUG_ON(next->rq_flags & RQF_DISP_LIST); - - if (!RB_EMPTY_NODE(&rq->rb_node)) - goto end; -@@ -3936,6 +3940,7 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - rq = list_first_entry(&bfqd->dispatch, struct request, - queuelist); - list_del_init(&rq->queuelist); -+ rq->rq_flags &= ~RQF_DISP_LIST; - - bfq_log(bfqd, - "dispatch requests: picked %p from dispatch list", rq); -@@ -3950,6 +3955,17 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - */ - bfqq->dispatched++; - -+ /* -+ * TESTING: reset DISP_LIST flag, because: 1) -+ * this rq this request has passed through -+ * get_rq_private, 2) then it will have -+ * put_rq_private invoked on it, and 3) in -+ * put_rq_private we use this flag to check -+ * that put_rq_private is not invoked on -+ * requests for which get_rq_private has been -+ * invoked. 
-+ */ -+ rq->rq_flags &= ~RQF_DISP_LIST; - goto inc_in_driver_start_rq; - } - -@@ -4637,6 +4653,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - else - list_add_tail(&rq->queuelist, &bfqd->dispatch); - -+ rq->rq_flags |= RQF_DISP_LIST; - if (bfqq) - bfq_log_bfqq(bfqd, bfqq, - "insert_request %p in disp: at_head %d", -@@ -4824,6 +4841,10 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq) - bfqd = bfqq->bfqd; - BUG_ON(!bfqd); - -+ if (rq->rq_flags & RQF_DISP_LIST) { -+ pr_crit("putting disp rq %p for %d", rq, bfqq->pid); -+ BUG(); -+ } - BUG_ON(rq->rq_flags & RQF_QUEUED); - BUG_ON(!(rq->rq_flags & RQF_ELVPRIV)); - -@@ -5015,6 +5036,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - - rq->elv.priv[0] = bic; - rq->elv.priv[1] = bfqq; -+ rq->rq_flags &= ~RQF_DISP_LIST; - - /* - * If a bfq_queue has only one process reference, it is owned -diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h -index 10f892ca585d..0048e59e6d07 100644 ---- a/include/linux/blkdev.h -+++ b/include/linux/blkdev.h -@@ -121,6 +121,8 @@ typedef __u32 __bitwise req_flags_t; - /* Look at ->special_vec for the actual data payload instead of the - bio chain. */ - #define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18)) -+/* DEBUG: rq in bfq-mq dispatch list */ -+#define RQF_DISP_LIST ((__force req_flags_t)(1 << 19)) - - /* flags that prevent us from merging requests: */ - #define RQF_NOMERGE_FLAGS \ - -From 49206f9052d13c96d49dbc36c612bed41b2d6552 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Sat, 25 Feb 2017 17:38:05 +0100 -Subject: [PATCH 21/51] Complete support for cgroups - -This commit completes cgroups support for bfq-mq. 
In particular, it deals with -a sort of circular dependency introduced in blk-mq: the function -blkcg_activate_policy, invoked during scheduler initialization, triggers the -invocation of the has_work scheduler hook (before the init function is -finished). To address this issue, this commit moves the invocation of -blkcg_activate_policy after the initialization of all the fields that could be -initialized before invoking blkcg_activate_policy itself. This enables has_work -to correctly return false, and thus to prevent the blk-mq stack from invoking -further scheduler hooks before the init function is finished. - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/Kconfig.iosched | 9 +++++ - block/bfq-mq-iosched.c | 108 ++++++++++++++++++++++++++++--------------------- - block/bfq-mq.h | 2 +- - 3 files changed, 72 insertions(+), 47 deletions(-) - -diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched -index 2d94af3d8b0a..299a6861fb90 100644 ---- a/block/Kconfig.iosched -+++ b/block/Kconfig.iosched -@@ -106,6 +106,15 @@ config MQ_IOSCHED_BFQ - guarantees a low latency to interactive and soft real-time - applications. Details in Documentation/block/bfq-iosched.txt - -+config MQ_BFQ_GROUP_IOSCHED -+ bool "BFQ-MQ hierarchical scheduling support" -+ depends on MQ_IOSCHED_BFQ && BLK_CGROUP -+ default n -+ ---help--- -+ -+ Enable hierarchical scheduling in BFQ-MQ, using the blkio -+ (cgroups-v1) or io (cgroups-v2) controller. -+ - config MQ_IOSCHED_DEADLINE - tristate "MQ deadline I/O scheduler" - default y -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index b4d40bb712d2..02a1e7fd0ea4 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -88,7 +88,6 @@ - #include "blk-mq.h" - #include "blk-mq-tag.h" - #include "blk-mq-sched.h" --#undef CONFIG_BFQ_GROUP_IOSCHED /* cgroups support not yet functional */ - #include "bfq-mq.h" - - /* Expiration time of sync (0) and async (1) requests, in ns. 
*/ -@@ -233,15 +232,6 @@ static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, - return NULL; - } - --#define BFQ_MQ --#include "bfq-sched.c" --#include "bfq-cgroup-included.c" -- --#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE) --#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT) -- --#define bfq_sample_valid(samples) ((samples) > 80) -- - /* - * Scheduler run of queue, if there are requests pending and no one in the - * driver that will restart queueing. -@@ -255,6 +245,43 @@ static void bfq_schedule_dispatch(struct bfq_data *bfqd) - } - - /* -+ * Next two functions release bfqd->lock and put the io context -+ * pointed by bfqd->ioc_to_put. This delayed put is used to not risk -+ * to take an ioc->lock while the scheduler lock is being held. -+ */ -+static void bfq_unlock_put_ioc(struct bfq_data *bfqd) -+{ -+ struct io_context *ioc_to_put = bfqd->ioc_to_put; -+ -+ bfqd->ioc_to_put = NULL; -+ spin_unlock_irq(&bfqd->lock); -+ -+ if (ioc_to_put) -+ put_io_context(ioc_to_put); -+} -+ -+static void bfq_unlock_put_ioc_restore(struct bfq_data *bfqd, -+ unsigned long flags) -+{ -+ struct io_context *ioc_to_put = bfqd->ioc_to_put; -+ -+ bfqd->ioc_to_put = NULL; -+ spin_unlock_irqrestore(&bfqd->lock, flags); -+ -+ if (ioc_to_put) -+ put_io_context(ioc_to_put); -+} -+ -+#define BFQ_MQ -+#include "bfq-sched.c" -+#include "bfq-cgroup-included.c" -+ -+#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE) -+#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT) -+ -+#define bfq_sample_valid(samples) ((samples) > 80) -+ -+/* - * Lifted from AS - choose which of rq1 and rq2 that is best served now. - * We choose the request that is closesr to the head right now. Distance - * behind the head is penalized and only allowed to a certain extent. 
-@@ -4050,34 +4077,6 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - return rq; - } - --/* -- * Next two functions release bfqd->lock and put the io context -- * pointed by bfqd->ioc_to_put. This delayed put is used to not risk -- * to take an ioc->lock while the scheduler lock is being held. -- */ --static void bfq_unlock_put_ioc(struct bfq_data *bfqd) --{ -- struct io_context *ioc_to_put = bfqd->ioc_to_put; -- -- bfqd->ioc_to_put = NULL; -- spin_unlock_irq(&bfqd->lock); -- -- if (ioc_to_put) -- put_io_context(ioc_to_put); --} -- --static void bfq_unlock_put_ioc_restore(struct bfq_data *bfqd, -- unsigned long flags) --{ -- struct io_context *ioc_to_put = bfqd->ioc_to_put; -- -- bfqd->ioc_to_put = NULL; -- spin_unlock_irqrestore(&bfqd->lock, flags); -- -- if (ioc_to_put) -- put_io_context(ioc_to_put); --} -- - static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - { - struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; -@@ -5239,6 +5238,10 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - } - eq->elevator_data = bfqd; - -+ spin_lock_irq(q->queue_lock); -+ q->elevator = eq; -+ spin_unlock_irq(q->queue_lock); -+ - /* - * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. 
- * Grab a permanent reference to it, so that the normal code flow -@@ -5261,12 +5264,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - bfqd->oom_bfqq.entity.prio_changed = 1; - - bfqd->queue = q; -- -- bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node); -- if (!bfqd->root_group) -- goto out_free; -- bfq_init_root_group(bfqd->root_group, bfqd); -- bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); -+ INIT_LIST_HEAD(&bfqd->dispatch); - - hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); -@@ -5324,9 +5322,27 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - bfqd->device_speed = BFQ_BFQD_FAST; - - spin_lock_init(&bfqd->lock); -- INIT_LIST_HEAD(&bfqd->dispatch); - -- q->elevator = eq; -+ /* -+ * The invocation of the next bfq_create_group_hierarchy -+ * function is the head of a chain of function calls -+ * (bfq_create_group_hierarchy->blkcg_activate_policy-> -+ * blk_mq_freeze_queue) that may lead to the invocation of the -+ * has_work hook function. For this reason, -+ * bfq_create_group_hierarchy is invoked only after all -+ * scheduler data has been initialized, apart from the fields -+ * that can be initialized only after invoking -+ * bfq_create_group_hierarchy. This, in particular, enables -+ * has_work to correctly return false. Of course, to avoid -+ * other inconsistencies, the blk-mq stack must then refrain -+ * from invoking further scheduler hooks before this init -+ * function is finished. 
-+ */ -+ bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node); -+ if (!bfqd->root_group) -+ goto out_free; -+ bfq_init_root_group(bfqd->root_group, bfqd); -+ bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); - - return 0; - -diff --git a/block/bfq-mq.h b/block/bfq-mq.h -index bd83f1c02573..2c81c02bccc4 100644 ---- a/block/bfq-mq.h -+++ b/block/bfq-mq.h -@@ -20,7 +20,7 @@ - #include <linux/blk-cgroup.h> - - /* see comments on CONFIG_BFQ_GROUP_IOSCHED in bfq.h */ --#ifdef CONFIG_BFQ_MQ_GROUP_IOSCHED -+#ifdef CONFIG_MQ_BFQ_GROUP_IOSCHED - #define BFQ_GROUP_IOSCHED_ENABLED - #endif - - -From 62d12db23ce14d2716b5cff7d2635fbc817b96d0 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Fri, 17 Mar 2017 06:15:18 +0100 -Subject: [PATCH 22/51] Remove all get and put of I/O contexts - -When a bfq queue is set in service and when it is merged, a reference -to the I/O context associated with the queue is taken. This reference -is then released when the queue is deselected from service or -split. More precisely, the release of the reference is postponed to -when the scheduler lock is released, to avoid nesting between the -scheduler and the I/O-context lock. In fact, such nesting would lead -to deadlocks, because of other code paths that take the same locks in -the opposite order. This postponing of I/O-context releases does -complicate code. - -This commit addresses this issue by modifying involved operations in -such a way to not need to get the above I/O-context references any -more. Then it also removes any get and release of these references. 
- -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-cgroup-included.c | 2 +- - block/bfq-mq-iosched.c | 127 ++++++++------------------------------------ - block/bfq-mq.h | 11 ---- - block/bfq-sched.c | 17 ------ - 4 files changed, 22 insertions(+), 135 deletions(-) - -diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c -index cf59eeb7f08e..dfacca799b5e 100644 ---- a/block/bfq-cgroup-included.c -+++ b/block/bfq-cgroup-included.c -@@ -773,7 +773,7 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) - bfq_put_async_queues(bfqd, bfqg); - - #ifdef BFQ_MQ -- bfq_unlock_put_ioc_restore(bfqd, flags); -+ spin_unlock_irqrestore(&bfqd->lock, flags); - #endif - /* - * @blkg is going offline and will be ignored by -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 02a1e7fd0ea4..8e7589d3280f 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -244,34 +244,6 @@ static void bfq_schedule_dispatch(struct bfq_data *bfqd) - } - } - --/* -- * Next two functions release bfqd->lock and put the io context -- * pointed by bfqd->ioc_to_put. This delayed put is used to not risk -- * to take an ioc->lock while the scheduler lock is being held. 
-- */ --static void bfq_unlock_put_ioc(struct bfq_data *bfqd) --{ -- struct io_context *ioc_to_put = bfqd->ioc_to_put; -- -- bfqd->ioc_to_put = NULL; -- spin_unlock_irq(&bfqd->lock); -- -- if (ioc_to_put) -- put_io_context(ioc_to_put); --} -- --static void bfq_unlock_put_ioc_restore(struct bfq_data *bfqd, -- unsigned long flags) --{ -- struct io_context *ioc_to_put = bfqd->ioc_to_put; -- -- bfqd->ioc_to_put = NULL; -- spin_unlock_irqrestore(&bfqd->lock, flags); -- -- if (ioc_to_put) -- put_io_context(ioc_to_put); --} -- - #define BFQ_MQ - #include "bfq-sched.c" - #include "bfq-cgroup-included.c" -@@ -1747,7 +1719,6 @@ static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) - if (free) - blk_mq_free_request(free); - bfqd->bio_bfqq_set = false; -- BUG_ON(bfqd->ioc_to_put); - spin_unlock_irq(&bfqd->lock); - - return ret; -@@ -1812,7 +1783,6 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, - bfq_updated_next_req(bfqd, bfqq); - bfq_pos_tree_add_move(bfqd, bfqq); - } -- BUG_ON(bfqd->ioc_to_put); - spin_unlock_irq(&bfqd->lock); - } - } -@@ -1858,7 +1828,6 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, - - bfq_remove_request(q, next); - -- BUG_ON(bfqq->bfqd->ioc_to_put); - spin_unlock_irq(&bfqq->bfqd->lock); - end: - bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); -@@ -2035,20 +2004,18 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) - * first time that the requests of some process are redirected to - * it. - * -- * We redirect bfqq to new_bfqq and not the opposite, because we -- * are in the context of the process owning bfqq, hence we have -- * the io_cq of this process. So we can immediately configure this -- * io_cq to redirect the requests of the process to new_bfqq. -+ * We redirect bfqq to new_bfqq and not the opposite, because -+ * we are in the context of the process owning bfqq, thus we -+ * have the io_cq of this process. 
So we can immediately -+ * configure this io_cq to redirect the requests of the -+ * process to new_bfqq. In contrast, the io_cq of new_bfqq is -+ * not available any more (new_bfqq->bic == NULL). - * -- * NOTE, even if new_bfqq coincides with the in-service queue, the -- * io_cq of new_bfqq is not available, because, if the in-service -- * queue is shared, bfqd->in_service_bic may not point to the -- * io_cq of the in-service queue. -- * Redirecting the requests of the process owning bfqq to the -- * currently in-service queue is in any case the best option, as -- * we feed the in-service queue with new requests close to the -- * last request served and, by doing so, hopefully increase the -- * throughput. -+ * Anyway, even in case new_bfqq coincides with the in-service -+ * queue, redirecting requests the in-service queue is the -+ * best option, as we feed the in-service queue with new -+ * requests close to the last request served and, by doing so, -+ * are likely to increase the throughput. 
- */ - bfqq->new_bfqq = new_bfqq; - new_bfqq->ref += process_refs; -@@ -2147,13 +2114,13 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, - in_service_bfqq = bfqd->in_service_queue; - - if (in_service_bfqq && in_service_bfqq != bfqq && -- bfqd->in_service_bic && wr_from_too_long(in_service_bfqq) -+ wr_from_too_long(in_service_bfqq) - && likely(in_service_bfqq == &bfqd->oom_bfqq)) - bfq_log_bfqq(bfqd, bfqq, - "would have tried merge with in-service-queue, but wr"); - -- if (!in_service_bfqq || in_service_bfqq == bfqq || -- !bfqd->in_service_bic || wr_from_too_long(in_service_bfqq) || -+ if (!in_service_bfqq || in_service_bfqq == bfqq -+ || wr_from_too_long(in_service_bfqq) || - unlikely(in_service_bfqq == &bfqd->oom_bfqq)) - goto check_scheduled; - -@@ -2214,16 +2181,6 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) - BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); - } - --static void bfq_get_bic_reference(struct bfq_queue *bfqq) --{ -- /* -- * If bfqq->bic has a non-NULL value, the bic to which it belongs -- * is about to begin using a shared bfq_queue. -- */ -- if (bfqq->bic) -- atomic_long_inc(&bfqq->bic->icq.ioc->refcount); --} -- - static void - bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, - struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) -@@ -2280,12 +2237,6 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, - bfqd->wr_busy_queues); - - /* -- * Grab a reference to the bic, to prevent it from being destroyed -- * before being possibly touched by a bfq_split_bfqq(). 
-- */ -- bfq_get_bic_reference(bfqq); -- bfq_get_bic_reference(new_bfqq); -- /* - * Merge queues (that is, let bic redirect its requests to new_bfqq) - */ - bic_set_bfqq(bic, new_bfqq, 1); -@@ -2472,16 +2423,10 @@ static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) - static void bfq_arm_slice_timer(struct bfq_data *bfqd) - { - struct bfq_queue *bfqq = bfqd->in_service_queue; -- struct bfq_io_cq *bic; - u32 sl; - - BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); - -- /* Processes have exited, don't wait. */ -- bic = bfqd->in_service_bic; -- if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0) -- return; -- - bfq_mark_bfqq_wait_request(bfqq); - - /* -@@ -3922,11 +3867,6 @@ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd, - bfq_bfqq_budget_left(bfqq), - bfqq->dispatched); - -- if (!bfqd->in_service_bic) { -- atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); -- bfqd->in_service_bic = RQ_BIC(rq); -- } -- - /* - * Expire bfqq, pretending that its budget expired, if bfqq - * belongs to CLASS_IDLE and other queues are waiting for -@@ -4085,7 +4025,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - spin_lock_irq(&bfqd->lock); - - rq = __bfq_dispatch_request(hctx); -- bfq_unlock_put_ioc(bfqd); -+ spin_unlock_irq(&bfqd->lock); - - return rq; - } -@@ -4184,21 +4124,10 @@ static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) - unsigned long flags; - - spin_lock_irqsave(&bfqd->lock, flags); -- BUG_ON(bfqd->ioc_to_put); -- /* -- * If the bic is using a shared queue, put the -- * reference taken on the io_context when the bic -- * started using a shared bfq_queue. This put cannot -- * make ioc->ref_count reach 0, then no ioc->lock -- * risks to be taken (leading to possible deadlock -- * scenarios). 
-- */ -- if (is_sync && bfq_bfqq_coop(bfqq)) -- put_io_context(bic->icq.ioc); - - bfq_exit_bfqq(bfqd, bfqq); - bic_set_bfqq(bic, NULL, is_sync); -- bfq_unlock_put_ioc_restore(bfqd, flags); -+ spin_unlock_irqrestore(&bfqd->lock, flags); - } - } - -@@ -4633,12 +4562,10 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - - spin_lock_irq(&bfqd->lock); - if (blk_mq_sched_try_insert_merge(q, rq)) { -- BUG_ON(bfqd->ioc_to_put); - spin_unlock_irq(&bfqd->lock); - return; - } - -- BUG_ON(bfqd->ioc_to_put); - spin_unlock_irq(&bfqd->lock); - - blk_mq_sched_request_inserted(rq); -@@ -4671,7 +4598,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - } - } - -- bfq_unlock_put_ioc(bfqd); -+ spin_unlock_irq(&bfqd->lock); - } - - static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, -@@ -4864,12 +4791,11 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq) - unsigned long flags; - - spin_lock_irqsave(&bfqd->lock, flags); -- BUG_ON(bfqd->ioc_to_put); - - bfq_completed_request(bfqq, bfqd); - bfq_put_rq_priv_body(bfqq); - -- bfq_unlock_put_ioc_restore(bfqd, flags); -+ spin_unlock_irqrestore(&bfqd->lock, flags); - } else { - /* - * Request rq may be still/already in the scheduler, -@@ -4992,7 +4918,6 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - - bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync, - &new_queue); -- BUG_ON(bfqd->ioc_to_put); - - if (unlikely(!new_queue)) { - /* If the queue was seeky for too long, break it apart. */ -@@ -5005,14 +4930,6 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - bic->saved_in_large_burst = true; - - bfqq = bfq_split_bfqq(bic, bfqq); -- /* -- * A reference to bic->icq.ioc needs to be -- * released after a queue split. Do not do it -- * immediately, to not risk to possibly take -- * an ioc->lock while holding the scheduler -- * lock. 
-- */ -- bfqd->ioc_to_put = bic->icq.ioc; - - if (!bfqq) - bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, -@@ -5045,7 +4962,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - */ - if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { - bfqq->bic = bic; -- if (bfqd->ioc_to_put) { /* if true, then there has been a split */ -+ if (split) { - /* - * The queue has just been split from a shared - * queue: restore the idle window and the -@@ -5059,7 +4976,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - if (unlikely(bfq_bfqq_just_created(bfqq))) - bfq_handle_burst(bfqd, bfqq); - -- bfq_unlock_put_ioc(bfqd); -+ spin_unlock_irq(&bfqd->lock); - - return 0; - -@@ -5077,7 +4994,6 @@ static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) - - BUG_ON(!bfqd); - spin_lock_irqsave(&bfqd->lock, flags); -- BUG_ON(bfqd->ioc_to_put); - - bfq_log_bfqq(bfqd, bfqq, "handling slice_timer expiration"); - bfq_clear_bfqq_wait_request(bfqq); -@@ -5108,7 +5024,7 @@ static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) - bfq_bfqq_expire(bfqd, bfqq, true, reason); - - schedule_dispatch: -- bfq_unlock_put_ioc_restore(bfqd, flags); -+ spin_unlock_irqrestore(&bfqd->lock, flags); - bfq_schedule_dispatch(bfqd); - } - -@@ -5186,7 +5102,6 @@ static void bfq_exit_queue(struct elevator_queue *e) - spin_lock_irq(&bfqd->lock); - list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) - bfq_deactivate_bfqq(bfqd, bfqq, false, false); -- BUG_ON(bfqd->ioc_to_put); - spin_unlock_irq(&bfqd->lock); - - hrtimer_cancel(&bfqd->idle_slice_timer); -diff --git a/block/bfq-mq.h b/block/bfq-mq.h -index 2c81c02bccc4..36ee24a87dda 100644 ---- a/block/bfq-mq.h -+++ b/block/bfq-mq.h -@@ -458,8 +458,6 @@ struct bfq_data { - - /* bfq_queue in service */ - struct bfq_queue *in_service_queue; -- /* bfq_io_cq (bic) associated with the @in_service_queue */ -- struct bfq_io_cq *in_service_bic; - - /* on-disk position of the last 
served request */ - sector_t last_position; -@@ -621,15 +619,6 @@ struct bfq_data { - struct bfq_queue *bio_bfqq; - /* Extra flag used only for TESTING */ - bool bio_bfqq_set; -- -- /* -- * io context to put right after bfqd->lock is released. This -- * filed is used to perform put_io_context, when needed, to -- * after the scheduler lock has been released, and thus -- * prevent an ioc->lock from being possibly taken while the -- * scheduler lock is being held. -- */ -- struct io_context *ioc_to_put; - }; - - enum bfqq_state_flags { -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index 85e59eeb3569..9c4e6797d8c9 100644 ---- a/block/bfq-sched.c -+++ b/block/bfq-sched.c -@@ -1904,23 +1904,6 @@ static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) - struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity; - struct bfq_entity *entity = in_serv_entity; - -- if (bfqd->in_service_bic) { --#ifdef BFQ_MQ -- BUG_ON(bfqd->ioc_to_put); -- /* -- * Schedule the release of a reference to -- * bfqd->in_service_bic->icq.ioc to right after the -- * scheduler lock is released. This ioc is not -- * released immediately, to not risk to possibly take -- * an ioc->lock while holding the scheduler lock. 
-- */ -- bfqd->ioc_to_put = bfqd->in_service_bic->icq.ioc; --#else -- put_io_context(bfqd->in_service_bic->icq.ioc); --#endif -- bfqd->in_service_bic = NULL; -- } -- - bfq_clear_bfqq_wait_request(in_serv_bfqq); - hrtimer_try_to_cancel(&bfqd->idle_slice_timer); - bfqd->in_service_queue = NULL; - -From 1521ad11f8684cf0a1b7249249cd406fee50da6d Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Wed, 29 Mar 2017 18:41:46 +0200 -Subject: [PATCH 23/51] BUGFIX: Remove unneeded and deadlock-causing lock in - request_merged - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 2 -- - 1 file changed, 2 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 8e7589d3280f..bb046335ff4f 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -1761,7 +1761,6 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, - BUG_ON(RQ_BFQQ(req) != bfqq); - elv_rb_add(&bfqq->sort_list, req); - -- spin_lock_irq(&bfqd->lock); - /* Choose next request to be served for bfqq */ - prev = bfqq->next_rq; - next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, -@@ -1783,7 +1782,6 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, - bfq_updated_next_req(bfqd, bfqq); - bfq_pos_tree_add_move(bfqd, bfqq); - } -- spin_unlock_irq(&bfqd->lock); - } - } - - -From 9136b4c953918ea937254c57cfb787b55b5bc2c6 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Wed, 29 Mar 2017 18:55:30 +0200 -Subject: [PATCH 24/51] Fix wrong unlikely - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index bb046335ff4f..3ae9bd424b3f 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -4917,7 +4917,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request 
*rq, - bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync, - &new_queue); - -- if (unlikely(!new_queue)) { -+ if (likely(!new_queue)) { - /* If the queue was seeky for too long, break it apart. */ - if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { - BUG_ON(!is_sync); - -From 8e05f722f19645f2278f6962368ca3b7c2a81c9c Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Fri, 12 May 2017 09:51:18 +0200 -Subject: [PATCH 25/51] Change cgroup params prefix to bfq-mq for bfq-mq - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-cgroup-included.c | 54 ++++++++++++++++++++++++++------------------- - 1 file changed, 31 insertions(+), 23 deletions(-) - -diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c -index dfacca799b5e..9e9b0a09e26f 100644 ---- a/block/bfq-cgroup-included.c -+++ b/block/bfq-cgroup-included.c -@@ -995,9 +995,15 @@ bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) - return blkg_to_bfqg(bfqd->queue->root_blkg); - } - -+#ifdef BFQ_MQ -+#define BFQ_CGROUP_FNAME(param) "bfq-mq."#param -+#else -+#define BFQ_CGROUP_FNAME(param) "bfq."#param -+#endif -+ - static struct cftype bfq_blkcg_legacy_files[] = { - { -- .name = "bfq.weight", -+ .name = BFQ_CGROUP_FNAME(weight), - .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = bfq_io_show_weight, - .write_u64 = bfq_io_set_weight_legacy, -@@ -1005,106 +1011,106 @@ static struct cftype bfq_blkcg_legacy_files[] = { - - /* statistics, covers only the tasks in the bfqg */ - { -- .name = "bfq.time", -+ .name = BFQ_CGROUP_FNAME(time), - .private = offsetof(struct bfq_group, stats.time), - .seq_show = bfqg_print_stat, - }, - { -- .name = "bfq.sectors", -+ .name = BFQ_CGROUP_FNAME(sectors), - .seq_show = bfqg_print_stat_sectors, - }, - { -- .name = "bfq.io_service_bytes", -+ .name = BFQ_CGROUP_FNAME(io_service_bytes), - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_bytes, - }, - { -- .name = 
"bfq.io_serviced", -+ .name = BFQ_CGROUP_FNAME(io_serviced), - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_ios, - }, - { -- .name = "bfq.io_service_time", -+ .name = BFQ_CGROUP_FNAME(io_service_time), - .private = offsetof(struct bfq_group, stats.service_time), - .seq_show = bfqg_print_rwstat, - }, - { -- .name = "bfq.io_wait_time", -+ .name = BFQ_CGROUP_FNAME(io_wait_time), - .private = offsetof(struct bfq_group, stats.wait_time), - .seq_show = bfqg_print_rwstat, - }, - { -- .name = "bfq.io_merged", -+ .name = BFQ_CGROUP_FNAME(io_merged), - .private = offsetof(struct bfq_group, stats.merged), - .seq_show = bfqg_print_rwstat, - }, - { -- .name = "bfq.io_queued", -+ .name = BFQ_CGROUP_FNAME(io_queued), - .private = offsetof(struct bfq_group, stats.queued), - .seq_show = bfqg_print_rwstat, - }, - - /* the same statictics which cover the bfqg and its descendants */ - { -- .name = "bfq.time_recursive", -+ .name = BFQ_CGROUP_FNAME(time_recursive), - .private = offsetof(struct bfq_group, stats.time), - .seq_show = bfqg_print_stat_recursive, - }, - { -- .name = "bfq.sectors_recursive", -+ .name = BFQ_CGROUP_FNAME(sectors_recursive), - .seq_show = bfqg_print_stat_sectors_recursive, - }, - { -- .name = "bfq.io_service_bytes_recursive", -+ .name = BFQ_CGROUP_FNAME(io_service_bytes_recursive), - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_bytes_recursive, - }, - { -- .name = "bfq.io_serviced_recursive", -+ .name = BFQ_CGROUP_FNAME(io_serviced_recursive), - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_ios_recursive, - }, - { -- .name = "bfq.io_service_time_recursive", -+ .name = BFQ_CGROUP_FNAME(io_service_time_recursive), - .private = offsetof(struct bfq_group, stats.service_time), - .seq_show = bfqg_print_rwstat_recursive, - }, - { -- .name = "bfq.io_wait_time_recursive", -+ .name = BFQ_CGROUP_FNAME(io_wait_time_recursive), - .private = offsetof(struct bfq_group, stats.wait_time), - 
.seq_show = bfqg_print_rwstat_recursive, - }, - { -- .name = "bfq.io_merged_recursive", -+ .name = BFQ_CGROUP_FNAME(io_merged_recursive), - .private = offsetof(struct bfq_group, stats.merged), - .seq_show = bfqg_print_rwstat_recursive, - }, - { -- .name = "bfq.io_queued_recursive", -+ .name = BFQ_CGROUP_FNAME(io_queued_recursive), - .private = offsetof(struct bfq_group, stats.queued), - .seq_show = bfqg_print_rwstat_recursive, - }, - { -- .name = "bfq.avg_queue_size", -+ .name = BFQ_CGROUP_FNAME(avg_queue_size), - .seq_show = bfqg_print_avg_queue_size, - }, - { -- .name = "bfq.group_wait_time", -+ .name = BFQ_CGROUP_FNAME(group_wait_time), - .private = offsetof(struct bfq_group, stats.group_wait_time), - .seq_show = bfqg_print_stat, - }, - { -- .name = "bfq.idle_time", -+ .name = BFQ_CGROUP_FNAME(idle_time), - .private = offsetof(struct bfq_group, stats.idle_time), - .seq_show = bfqg_print_stat, - }, - { -- .name = "bfq.empty_time", -+ .name = BFQ_CGROUP_FNAME(empty_time), - .private = offsetof(struct bfq_group, stats.empty_time), - .seq_show = bfqg_print_stat, - }, - { -- .name = "bfq.dequeue", -+ .name = BFQ_CGROUP_FNAME(dequeue), - .private = offsetof(struct bfq_group, stats.dequeue), - .seq_show = bfqg_print_stat, - }, -@@ -1113,7 +1119,7 @@ static struct cftype bfq_blkcg_legacy_files[] = { - - static struct cftype bfq_blkg_files[] = { - { -- .name = "bfq.weight", -+ .name = BFQ_CGROUP_FNAME(weight), - .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = bfq_io_show_weight, - .write = bfq_io_set_weight, -@@ -1121,6 +1127,8 @@ static struct cftype bfq_blkg_files[] = { - {} /* terminate */ - }; - -+#undef BFQ_CGROUP_FNAME -+ - #else /* BFQ_GROUP_IOSCHED_ENABLED */ - - static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg, - -From abdf7565dadbb00e78be5f4fb2cc9b157649840e Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Fri, 12 May 2017 11:56:13 +0200 -Subject: [PATCH 26/51] Add tentative extra tests on groups, reqs and queues - 
-Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-cgroup-included.c | 1 + - block/bfq-mq-iosched.c | 5 +++++ - include/linux/blkdev.h | 2 ++ - 3 files changed, 8 insertions(+) - -diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c -index 9e9b0a09e26f..72107ad12220 100644 ---- a/block/bfq-cgroup-included.c -+++ b/block/bfq-cgroup-included.c -@@ -412,6 +412,7 @@ static void bfq_pd_init(struct blkg_policy_data *pd) - BUG_ON(!blkg); - bfqg = blkg_to_bfqg(blkg); - bfqd = blkg->q->elevator->elevator_data; -+ BUG_ON(bfqg == bfqd->root_group); - entity = &bfqg->entity; - d = blkcg_to_bfqgd(blkg->blkcg); - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 3ae9bd424b3f..a9e3406fef06 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -4494,6 +4494,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, - static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) - { - struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; -+ BUG_ON(!bfqq); - - assert_spin_locked(&bfqd->lock); - -@@ -4587,6 +4588,9 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - "insert_request %p in disp: at_head %d", - rq, at_head); - } else { -+ BUG_ON(!(rq->rq_flags & RQF_GOT)); -+ rq->rq_flags &= ~RQF_GOT; -+ - __bfq_insert_request(bfqd, rq); - - if (rq_mergeable(rq)) { -@@ -4974,6 +4978,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - if (unlikely(bfq_bfqq_just_created(bfqq))) - bfq_handle_burst(bfqd, bfqq); - -+ rq->rq_flags |= RQF_GOT; - spin_unlock_irq(&bfqd->lock); - - return 0; -diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h -index 0048e59e6d07..9ae814743095 100644 ---- a/include/linux/blkdev.h -+++ b/include/linux/blkdev.h -@@ -123,6 +123,8 @@ typedef __u32 __bitwise req_flags_t; - #define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18)) - /* DEBUG: rq in bfq-mq dispatch list */ - #define 
RQF_DISP_LIST ((__force req_flags_t)(1 << 19)) -+/* DEBUG: rq had get_rq_private executed on it */ -+#define RQF_GOT ((__force req_flags_t)(1 << 20)) - - /* flags that prevent us from merging requests: */ - #define RQF_NOMERGE_FLAGS \ - -From 9e1c1514bc947c4e04502331372b1cc58459d8d1 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Mon, 15 May 2017 22:25:03 +0200 -Subject: [PATCH 27/51] block, bfq-mq: access and cache blkg data only when - safe - -In blk-cgroup, operations on blkg objects are protected with the -request_queue lock. This is no longer the lock that protects -I/O-scheduler operations in blk-mq. In fact, the latter are now -protected with a finer-grained per-scheduler-instance lock. As a -consequence, although blkg lookups are also rcu-protected, blk-mq I/O -schedulers may see inconsistent data when they access blkg and -blkg-related objects. BFQ does access these objects, and does incur -this problem, in the following case. - -The blkg_lookup performed in bfq_get_queue, being protected (only) -through rcu, may happen to return the address of a copy of the -original blkg. If this is the case, then the blkg_get performed in -bfq_get_queue, to pin down the blkg, is useless: it does not prevent -blk-cgroup code from destroying both the original blkg and all objects -directly or indirectly referred to by the copy of the blkg. BFQ accesses -these objects, which typically causes a crash for NULL-pointer -dereference or memory-protection violation. - -Some additional protection mechanism should be added to blk-cgroup to -address this issue. In the meantime, this commit provides a quick -temporary fix for BFQ: cache (when safe) blkg data that might -disappear right after a blkg_lookup. - -In particular, this commit exploits the following facts to achieve its -goal without introducing further locks. Destroy operations on a blkg -invoke, as a first step, hooks of the scheduler associated with the -blkg. 
And these hooks are executed with bfqd->lock held for BFQ. As a -consequence, for any blkg associated with the request queue an -instance of BFQ is attached to, we are guaranteed that such a blkg is -not destroyed, and that all the pointers it contains are consistent, -while that instance is holding its bfqd->lock. A blkg_lookup performed -with bfqd->lock held then returns a fully consistent blkg, which -remains consistent for as long as this lock is held. In more detail, this holds -even if the returned blkg is a copy of the original one. - -Finally, also the object describing a group inside BFQ needs to be -protected from destruction on the blkg_free of the original blkg -(which invokes bfq_pd_free). This commit adds private refcounting for -this object, to let it disappear only after no bfq_queue refers to it -any longer. - -This commit also removes or updates some stale comments on locking -issues related to blk-cgroup operations. - -Reported-by: Tomas Konir <tomas.konir@gmail.com> -Reported-by: Lee Tibbert <lee.tibbert@gmail.com> -Reported-by: Marco Piazza <mpiazza@gmail.com> -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> -Tested-by: Tomas Konir <tomas.konir@gmail.com> -Tested-by: Lee Tibbert <lee.tibbert@gmail.com> -Tested-by: Marco Piazza <mpiazza@gmail.com> ---- - block/bfq-cgroup-included.c | 149 ++++++++++++++++++++++++++++++++++++++++---- - block/bfq-mq-iosched.c | 2 +- - block/bfq-mq.h | 26 +++----- - 3 files changed, 148 insertions(+), 29 deletions(-) - -diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c -index 72107ad12220..d903393ee78a 100644 ---- a/block/bfq-cgroup-included.c -+++ b/block/bfq-cgroup-included.c -@@ -43,7 +43,11 @@ BFQG_FLAG_FNS(idling) - BFQG_FLAG_FNS(empty) - #undef BFQG_FLAG_FNS - -+#ifdef BFQ_MQ -+/* This should be called with the scheduler lock held. */ -+#else - /* This should be called with the queue_lock held. 
*/ -+#endif - static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) - { - unsigned long long now; -@@ -58,7 +62,11 @@ static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) - bfqg_stats_clear_waiting(stats); - } - -+#ifdef BFQ_MQ -+/* This should be called with the scheduler lock held. */ -+#else - /* This should be called with the queue_lock held. */ -+#endif - static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, - struct bfq_group *curr_bfqg) - { -@@ -72,7 +80,11 @@ static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, - bfqg_stats_mark_waiting(stats); - } - -+#ifdef BFQ_MQ -+/* This should be called with the scheduler lock held. */ -+#else - /* This should be called with the queue_lock held. */ -+#endif - static void bfqg_stats_end_empty_time(struct bfqg_stats *stats) - { - unsigned long long now; -@@ -198,14 +210,43 @@ static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) - - static void bfqg_get(struct bfq_group *bfqg) - { -- return blkg_get(bfqg_to_blkg(bfqg)); -+#ifdef BFQ_MQ -+ bfqg->ref++; -+#else -+ blkg_get(bfqg_to_blkg(bfqg)); -+#endif - } - - static void bfqg_put(struct bfq_group *bfqg) - { -- return blkg_put(bfqg_to_blkg(bfqg)); -+#ifdef BFQ_MQ -+ bfqg->ref--; -+ -+ BUG_ON(bfqg->ref < 0); -+ if (bfqg->ref == 0) -+ kfree(bfqg); -+#else -+ blkg_put(bfqg_to_blkg(bfqg)); -+#endif -+} -+ -+#ifdef BFQ_MQ -+static void bfqg_and_blkg_get(struct bfq_group *bfqg) -+{ -+ /* see comments in bfq_bic_update_cgroup for why refcounting bfqg */ -+ bfqg_get(bfqg); -+ -+ blkg_get(bfqg_to_blkg(bfqg)); - } - -+static void bfqg_and_blkg_put(struct bfq_group *bfqg) -+{ -+ bfqg_put(bfqg); -+ -+ blkg_put(bfqg_to_blkg(bfqg)); -+} -+#endif -+ - static void bfqg_stats_update_io_add(struct bfq_group *bfqg, - struct bfq_queue *bfqq, - unsigned int op) -@@ -310,7 +351,15 @@ static void bfq_init_entity(struct bfq_entity *entity, - if (bfqq) { - bfqq->ioprio = bfqq->new_ioprio; - bfqq->ioprio_class = 
bfqq->new_ioprio_class; -+#ifdef BFQ_MQ -+ /* -+ * Make sure that bfqg and its associated blkg do not -+ * disappear before entity. -+ */ -+ bfqg_and_blkg_get(bfqg); -+#else - bfqg_get(bfqg); -+#endif - } - entity->parent = bfqg->my_entity; /* NULL for root group */ - entity->sched_data = &bfqg->sched_data; -@@ -397,6 +446,10 @@ static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) - return NULL; - } - -+#ifdef BFQ_MQ -+ /* see comments in bfq_bic_update_cgroup for why refcounting */ -+ bfqg_get(bfqg); -+#endif - return &bfqg->pd; - } - -@@ -432,7 +485,11 @@ static void bfq_pd_free(struct blkg_policy_data *pd) - struct bfq_group *bfqg = pd_to_bfqg(pd); - - bfqg_stats_exit(&bfqg->stats); -- return kfree(bfqg); -+#ifdef BFQ_MQ -+ bfqg_put(bfqg); -+#else -+ kfree(bfqg); -+#endif - } - - static void bfq_pd_reset_stats(struct blkg_policy_data *pd) -@@ -516,9 +573,16 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, - * Move @bfqq to @bfqg, deactivating it from its old group and reactivating - * it on the new one. Avoid putting the entity on the old group idle tree. - * -+#ifdef BFQ_MQ -+ * Must be called under the scheduler lock, to make sure that the blkg -+ * owning @bfqg does not disappear (see comments in -+ * bfq_bic_update_cgroup on guaranteeing the consistency of blkg -+ * objects). -+#else - * Must be called under the queue lock; the cgroup owning @bfqg must - * not disappear (by now this just means that we are called under - * rcu_read_lock()). -+#endif - */ - static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, - struct bfq_group *bfqg) -@@ -555,16 +619,20 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, - entity->tree); - bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); - } -+#ifdef BFQ_MQ -+ bfqg_and_blkg_put(bfqq_group(bfqq)); -+#else - bfqg_put(bfqq_group(bfqq)); -+#endif - -- /* -- * Here we use a reference to bfqg. 
We don't need a refcounter -- * as the cgroup reference will not be dropped, so that its -- * destroy() callback will not be invoked. -- */ - entity->parent = bfqg->my_entity; - entity->sched_data = &bfqg->sched_data; -+#ifdef BFQ_MQ -+ /* pin down bfqg and its associated blkg */ -+ bfqg_and_blkg_get(bfqg); -+#else - bfqg_get(bfqg); -+#endif - - BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq)); - if (bfq_bfqq_busy(bfqq)) { -@@ -585,8 +653,14 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, - * @bic: the bic to move. - * @blkcg: the blk-cgroup to move to. - * -+#ifdef BFQ_MQ -+ * Move bic to blkcg, assuming that bfqd->lock is held; which makes -+ * sure that the reference to cgroup is valid across the call (see -+ * comments in bfq_bic_update_cgroup on this issue) -+#else - * Move bic to blkcg, assuming that bfqd->queue is locked; the caller - * has to make sure that the reference to cgroup is valid across the call. -+#endif - * - * NOTE: an alternative approach might have been to store the current - * cgroup in bfqq and getting a reference to it, reducing the lookup -@@ -645,6 +719,59 @@ static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) - goto out; - - bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio)); -+#ifdef BFQ_MQ -+ /* -+ * Update blkg_path for bfq_log_* functions. We cache this -+ * path, and update it here, for the following -+ * reasons. Operations on blkg objects in blk-cgroup are -+ * protected with the request_queue lock, and not with the -+ * lock that protects the instances of this scheduler -+ * (bfqd->lock). This exposes BFQ to the following sort of -+ * race. -+ * -+ * The blkg_lookup performed in bfq_get_queue, protected -+ * through rcu, may happen to return the address of a copy of -+ * the original blkg. 
If this is the case, then the -+ * bfqg_and_blkg_get performed in bfq_get_queue, to pin down -+ * the blkg, is useless: it does not prevent blk-cgroup code -+ * from destroying both the original blkg and all objects -+ * directly or indirectly referred by the copy of the -+ * blkg. -+ * -+ * On the bright side, destroy operations on a blkg invoke, as -+ * a first step, hooks of the scheduler associated with the -+ * blkg. And these hooks are executed with bfqd->lock held for -+ * BFQ. As a consequence, for any blkg associated with the -+ * request queue this instance of the scheduler is attached -+ * to, we are guaranteed that such a blkg is not destroyed, and -+ * that all the pointers it contains are consistent, while we -+ * are holding bfqd->lock. A blkg_lookup performed with -+ * bfqd->lock held then returns a fully consistent blkg, which -+ * remains consistent until this lock is held. -+ * -+ * Thanks to the last fact, and to the fact that: (1) bfqg has -+ * been obtained through a blkg_lookup in the above -+ * assignment, and (2) bfqd->lock is being held, here we can -+ * safely use the policy data for the involved blkg (i.e., the -+ * field bfqg->pd) to get to the blkg associated with bfqg, -+ * and then we can safely use any field of blkg. After we -+ * release bfqd->lock, even just getting blkg through this -+ * bfqg may cause dangling references to be traversed, as -+ * bfqg->pd may not exist any more. -+ * -+ * In view of the above facts, here we cache, in the bfqg, any -+ * blkg data we may need for this bic, and for its associated -+ * bfq_queue. As of now, we need to cache only the path of the -+ * blkg, which is used in the bfq_log_* functions. -+ * -+ * Finally, note that bfqg itself needs to be protected from -+ * destruction on the blkg_free of the original blkg (which -+ * invokes bfq_pd_free). We use an additional private -+ * refcounter for bfqg, to let it disappear only after no -+ * bfq_queue refers to it any longer. 
-+ */ -+ blkg_path(bfqg_to_blkg(bfqg), bfqg->blkg_path, sizeof(bfqg->blkg_path)); -+#endif - bic->blkcg_serial_nr = serial_nr; - out: - rcu_read_unlock(); -@@ -682,8 +809,6 @@ static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, - * @bfqd: the device data structure with the root group. - * @bfqg: the group to move from. - * @st: the service tree with the entities. -- * -- * Needs queue_lock to be taken and reference to be valid over the call. - */ - static void bfq_reparent_active_entities(struct bfq_data *bfqd, - struct bfq_group *bfqg, -@@ -736,6 +861,7 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) - #ifdef BFQ_MQ - spin_lock_irqsave(&bfqd->lock, flags); - #endif -+ - /* - * Empty all service_trees belonging to this group before - * deactivating the group itself. -@@ -746,8 +872,7 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) - /* - * The idle tree may still contain bfq_queues belonging - * to exited task because they never migrated to a different -- * cgroup from the one being destroyed now. No one else -- * can access them so it's safe to act without any lock. -+ * cgroup from the one being destroyed now. - */ - bfq_flush_idle_tree(st); - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index a9e3406fef06..4eb668eeacdc 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -4073,7 +4073,7 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - - kmem_cache_free(bfq_pool, bfqq); - #ifdef BFQ_GROUP_IOSCHED_ENABLED -- bfqg_put(bfqg); -+ bfqg_and_blkg_put(bfqg); - #endif - } - -diff --git a/block/bfq-mq.h b/block/bfq-mq.h -index 36ee24a87dda..77ab0f22ed22 100644 ---- a/block/bfq-mq.h -+++ b/block/bfq-mq.h -@@ -695,23 +695,17 @@ static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); - static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - - #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) 
do { \ -- char __pbuf[128]; \ -- \ -- blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ - pr_crit("%s bfq%d%c %s " fmt "\n", \ - checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ - (bfqq)->pid, \ - bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -- __pbuf, ##args); \ -+ bfqq_group(bfqq)->blkg_path, ##args); \ - } while (0) - - #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ -- char __pbuf[128]; \ -- \ -- blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ - pr_crit("%s %s " fmt "\n", \ - checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -- __pbuf, ##args); \ -+ bfqg->blkg_path, ##args); \ - } while (0) - - #else /* BFQ_GROUP_IOSCHED_ENABLED */ -@@ -736,20 +730,14 @@ static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); - static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - - #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ -- char __pbuf[128]; \ -- \ -- blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ - blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, \ - (bfqq)->pid, \ - bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -- __pbuf, ##args); \ -+ bfqq_group(bfqq)->blkg_path, ##args); \ - } while (0) - - #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) 
do { \ -- char __pbuf[128]; \ -- \ -- blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ -- blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \ -+ blk_add_trace_msg((bfqd)->queue, "%s " fmt, bfqg->blkg_path, ##args);\ - } while (0) - - #else /* BFQ_GROUP_IOSCHED_ENABLED */ -@@ -860,6 +848,12 @@ struct bfq_group { - /* must be the first member */ - struct blkg_policy_data pd; - -+ /* cached path for this blkg (see comments in bfq_bic_update_cgroup) */ -+ char blkg_path[128]; -+ -+ /* reference counter (see comments in bfq_bic_update_cgroup) */ -+ int ref; -+ - struct bfq_entity entity; - struct bfq_sched_data sched_data; - - -From c9137b749aceef6c2dde88e99b2fc978d5952e76 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Sat, 17 Jun 2017 11:18:11 +0200 -Subject: [PATCH 28/51] bfq-mq: fix macro name in conditional invocation of - policy_unregister - -This commit fixes the name of the macro in the conditional group that -invokes blkcg_policy_unregister in bfq_exit for bfq-mq. Because of -this error, blkcg_policy_unregister was never invoked. - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 4eb668eeacdc..bc1de3f70ea8 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -5669,7 +5669,7 @@ static int __init bfq_init(void) - static void __exit bfq_exit(void) - { - elv_unregister(&iosched_bfq_mq); --#ifdef CONFIG_BFQ_GROUP_ENABLED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - blkcg_policy_unregister(&blkcg_policy_bfq); - #endif - bfq_slab_kill(); - -From c7ceb37496f63b2dba4d06946ab85ec97b87bfb5 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Wed, 5 Jul 2017 11:48:17 +0200 -Subject: [PATCH 29/51] Port of "blk-mq-sched: unify request finished methods" - -No need to have two different callouts of bfq vs kyber. 
- -Signed-off-by: Christoph Hellwig <hch@lst.de> -Signed-off-by: Jens Axboe <axboe@kernel.dk> ---- - block/bfq-mq-iosched.c | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index bc1de3f70ea8..2598602a0b10 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -4753,7 +4753,7 @@ static void bfq_put_rq_priv_body(struct bfq_queue *bfqq) - bfq_put_queue(bfqq); - } - --static void bfq_put_rq_private(struct request_queue *q, struct request *rq) -+static void bfq_finish_request(struct request *rq) - { - struct bfq_queue *bfqq; - struct bfq_data *bfqd; -@@ -4814,7 +4814,7 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq) - - assert_spin_locked(&bfqd->lock); - if (!RB_EMPTY_NODE(&rq->rb_node)) -- bfq_remove_request(q, rq); -+ bfq_remove_request(rq->q, rq); - bfq_put_rq_priv_body(bfqq); - } - -@@ -5558,7 +5558,7 @@ static struct elv_fs_entry bfq_attrs[] = { - static struct elevator_type iosched_bfq_mq = { - .ops.mq = { - .get_rq_priv = bfq_get_rq_private, -- .put_rq_priv = bfq_put_rq_private, -+ .finish_request = bfq_finish_request, - .exit_icq = bfq_exit_icq, - .insert_requests = bfq_insert_requests, - .dispatch_request = bfq_dispatch_request, - -From 12bef026fe114ab5e2e284772ddc52a8be83fdbc Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Wed, 5 Jul 2017 11:54:57 +0200 -Subject: [PATCH 30/51] Port of "bfq-iosched: fix NULL ioc check in - bfq_get_rq_private" - -icq_to_bic is a container_of operation, so we need to check for NULL -before it. Also move the check outside the spinlock while we're at -it. 
- -Signed-off-by: Christoph Hellwig <hch@lst.de> -Signed-off-by: Jens Axboe <axboe@kernel.dk> ---- - block/bfq-mq-iosched.c | 15 +++++---------- - 1 file changed, 5 insertions(+), 10 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 2598602a0b10..c57774a60911 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -4903,16 +4903,17 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - struct bio *bio) - { - struct bfq_data *bfqd = q->elevator->elevator_data; -- struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq); -+ struct bfq_io_cq *bic; - const int is_sync = rq_is_sync(rq); - struct bfq_queue *bfqq; - bool bfqq_already_existing = false, split = false; - bool new_queue = false; - -- spin_lock_irq(&bfqd->lock); -+ if (!rq->elv.icq) -+ return 1; -+ bic = icq_to_bic(rq->elv.icq); - -- if (!bic) -- goto queue_fail; -+ spin_lock_irq(&bfqd->lock); - - bfq_check_ioprio_change(bic, bio); - -@@ -4980,13 +4981,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - - rq->rq_flags |= RQF_GOT; - spin_unlock_irq(&bfqd->lock); -- - return 0; -- --queue_fail: -- spin_unlock_irq(&bfqd->lock); -- -- return 1; - } - - static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) - -From 633e5711347df1bf4ca935fd0aa9118a0054f75d Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Wed, 5 Jul 2017 12:02:16 +0200 -Subject: [PATCH 31/51] Port of "blk-mq-sched: unify request prepare methods" - -This patch makes sure we always allocate requests in the core blk-mq -code and use a common prepare_request method to initialize them for -both mq I/O schedulers. For Kyber an additional limit_depth method -is added that is called before allocating the request. - -Also because none of the initializations can really fail the new method -does not return an error - instead the bfq finish method is hardened -to deal with the no-IOC case.
- -Last but not least this removes the abuse of RQF_QUEUE by the blk-mq -scheduling code as RQF_ELFPRIV is all that is needed now. - -Signed-off-by: Christoph Hellwig <hch@lst.de> -Signed-off-by: Jens Axboe <axboe@kernel.dk> ---- - block/bfq-mq-iosched.c | 13 ++++++++----- - 1 file changed, 8 insertions(+), 5 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index c57774a60911..49ffca1ad6e7 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -4760,6 +4760,10 @@ static void bfq_finish_request(struct request *rq) - struct bfq_io_cq *bic; - - BUG_ON(!rq); -+ -+ if (!rq->elv.icq) -+ return; -+ - bfqq = RQ_BFQQ(rq); - BUG_ON(!bfqq); - -@@ -4899,9 +4903,9 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, - /* - * Allocate bfq data structures associated with this request. - */ --static int bfq_get_rq_private(struct request_queue *q, struct request *rq, -- struct bio *bio) -+static void bfq_prepare_request(struct request *rq, struct bio *bio) - { -+ struct request_queue *q = rq->q; - struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_io_cq *bic; - const int is_sync = rq_is_sync(rq); -@@ -4910,7 +4914,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - bool new_queue = false; - - if (!rq->elv.icq) -- return 1; -+ return; - bic = icq_to_bic(rq->elv.icq); - - spin_lock_irq(&bfqd->lock); -@@ -4981,7 +4985,6 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - - rq->rq_flags |= RQF_GOT; - spin_unlock_irq(&bfqd->lock); -- return 0; - } - - static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) -@@ -5552,7 +5555,7 @@ static struct elv_fs_entry bfq_attrs[] = { - - static struct elevator_type iosched_bfq_mq = { - .ops.mq = { -- .get_rq_priv = bfq_get_rq_private, -+ .prepare_request = bfq_prepare_request, - .finish_request = bfq_finish_request, - .exit_icq = bfq_exit_icq, - .insert_requests = bfq_insert_requests, - -From 
5a321acfce282c3e58ac63582faf6f928ad17f27 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Wed, 5 Jul 2017 12:43:22 +0200 -Subject: [PATCH 32/51] Add list of bfq instances to documentation - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - Documentation/block/bfq-iosched.txt | 11 ++++++++++- - 1 file changed, 10 insertions(+), 1 deletion(-) - -diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt -index 3d6951d63489..8ce6b9a9bacd 100644 ---- a/Documentation/block/bfq-iosched.txt -+++ b/Documentation/block/bfq-iosched.txt -@@ -11,6 +11,15 @@ controllers), BFQ's main features are: - groups (switching back to time distribution when needed to keep - throughput high). - -+If bfq-mq patches have been applied, then the following three -+instances of BFQ are available (otherwise only the first instance): -+- bfq: mainline version of BFQ, for blk-mq -+- bfq-mq: development version of BFQ for blk-mq; this version contains -+ also all latest features not yet landed in mainline, plus many -+ safety checks -+- bfq: BFQ for legacy blk; also this version contains both latest -+ features and safety checks -+ - In its default configuration, BFQ privileges latency over - throughput. So, when needed for achieving a lower latency, BFQ builds - schedules that may lead to a lower throughput. If your main or only -@@ -27,7 +36,7 @@ sequential I/O (e.g., 8-12 GB/s if I/O requests are 256 KB large), and - to 120-200 MB/s with 4KB random I/O. BFQ is currently being tested on - multi-queue devices too. - --The table of contents follow. Impatients can just jump to Section 3. -+The table of contents follows. Impatients can just jump to Section 3. 
- - CONTENTS - - -From 9f2e5b27227fd9254cc258572dc2d4531838c30b Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Wed, 5 Jul 2017 16:28:00 +0200 -Subject: [PATCH 33/51] bfq-sq: fix prefix of names of cgroups parameters - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - Documentation/block/bfq-iosched.txt | 12 +++++++----- - block/bfq-cgroup-included.c | 2 +- - 2 files changed, 8 insertions(+), 6 deletions(-) - -diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt -index 8ce6b9a9bacd..965d82f94db9 100644 ---- a/Documentation/block/bfq-iosched.txt -+++ b/Documentation/block/bfq-iosched.txt -@@ -503,10 +503,12 @@ To get proportional sharing of bandwidth with BFQ for a given device, - BFQ must of course be the active scheduler for that device. - - Within each group directory, the names of the files associated with --BFQ-specific cgroup parameters and stats begin with the "bfq." --prefix. So, with cgroups-v1 or cgroups-v2, the full prefix for --BFQ-specific files is "blkio.bfq." or "io.bfq." For example, the group --parameter to set the weight of a group with BFQ is blkio.bfq.weight -+BFQ-specific cgroup parameters and stats begin with the "bfq.", -+"bfq-sq." or "bfq-mq." prefix, depending on which instance of bfq you -+want to use. So, with cgroups-v1 or cgroups-v2, the full prefix for -+BFQ-specific files is "blkio.bfqX." or "io.bfqX.", where X can be "" -+(i.e., null string), "-sq" or "-mq". For example, the group parameter -+to set the weight of a group with the mainline BFQ is blkio.bfq.weight - or io.bfq.weight. - - Parameters to set -@@ -514,7 +516,7 @@ Parameters to set - - For each group, there is only the following parameter to set. - --weight (namely blkio.bfq.weight or io.bfq-weight): the weight of the -+weight (namely blkio.bfqX.weight or io.bfqX.weight): the weight of the - group inside its parent. Available values: 1..10000 (default 100). 
The - linear mapping between ioprio and weights, described at the beginning - of the tunable section, is still valid, but all weights higher than -diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c -index d903393ee78a..631e53d9150d 100644 ---- a/block/bfq-cgroup-included.c -+++ b/block/bfq-cgroup-included.c -@@ -1124,7 +1124,7 @@ bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) - #ifdef BFQ_MQ - #define BFQ_CGROUP_FNAME(param) "bfq-mq."#param - #else --#define BFQ_CGROUP_FNAME(param) "bfq."#param -+#define BFQ_CGROUP_FNAME(param) "bfq-sq."#param - #endif - - static struct cftype bfq_blkcg_legacy_files[] = { - -From 92b42df8166939ccf26aa450125b5b575cf6d505 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Wed, 5 Jul 2017 21:08:32 +0200 -Subject: [PATCH 34/51] Add to documentation that bfq-mq and bfq-sq contain - last fixes too - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - Documentation/block/bfq-iosched.txt | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt -index 965d82f94db9..0e59f1c9d30e 100644 ---- a/Documentation/block/bfq-iosched.txt -+++ b/Documentation/block/bfq-iosched.txt -@@ -15,10 +15,10 @@ If bfq-mq patches have been applied, then the following three - instances of BFQ are available (otherwise only the first instance): - - bfq: mainline version of BFQ, for blk-mq - - bfq-mq: development version of BFQ for blk-mq; this version contains -- also all latest features not yet landed in mainline, plus many -+ also all latest features and fixes not yet landed in mainline, plus many - safety checks --- bfq: BFQ for legacy blk; also this version contains both latest -- features and safety checks -+- bfq: BFQ for legacy blk; also this version contains latest features -+ and fixes, as well as safety checks - - In its default configuration, BFQ privileges latency over - throughput. 
So, when needed for achieving a lower latency, BFQ builds - -From 7f9bdd433b848d4f53c167258bf4d0b3f1ae1923 Mon Sep 17 00:00:00 2001 -From: Lee Tibbert <lee.tibbert@gmail.com> -Date: Wed, 19 Jul 2017 10:28:32 -0400 -Subject: [PATCH 35/51] Improve most frequently used no-logging path - -This patch originated as a fix for compiler unused-variable warnings -issued when compiling bfq-mq with logging disabled (both -CONFIG_BLK_DEV_IO_TRACE and CONFIG_BFQ_REDIRECT_TO_CONSOLE -undefined). - -It turns out to also have benefits for the bfq-sq path as well. - -In most performance sensitive production builds blktrace_api logging -will probably be turned off, so it is worth making the no-logging path -compile without warnings. Any performance benefit is a bonus. - -Thank you to T. B. on the bfq-iosched@googlegroups.com list -for ((void) (bfqq)) simplification/suggestion/improvement. All bugs -and unclear descriptions are my own doing. - -The discussion below is based on the gcc compiler with optimization -level of at least 02. Lower optimization levels are unlikely to -remove no-op instruction equivalents. - -Provide three improvements in this likely case. - - 1) Fix multiple occurrences of an unused-variable warning - issued when compiling bfq-mq with no logging. The warning - occurred each time the bfq_log_bfqg macro was expanded inside - a code block such as the following snippet from - block/bfq-sched.c, line 139 and few following, lightly edited for - indentation in order to pass checkpatch.pl maximum line lengths. - -else { - struct bfq_group *bfqg = - container_of(next_in_service, - struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, - "update_next_in_service: chosen this entity"); - } - - Previously bfq-mq.h expanded bfq_log_bfqg to blk_add_trace_msg. - When both bfq console logging and blktrace_api logging are - disabled, include/linux/blktrace_api expands to - do { } while (0), leaving the code block local variable unused. 
- - bfq_log_bfqq() had similar behavior but is never called with - a potentially unused variable. This patch fixes that macro for - consistency. - - bfq-sq.h (single queue) with blktrace_api enabled, and the bfq - console logging macros have code paths which do not trigger this - warning. - - kernel.org (4.12 & 4.13) bfq (bfq-iosched.h) could trigger - the warning but no code does so now. This patch fixes - bfq-iosched.h for consistency. - - The style above enables a software engineering approach where - complex expressions are moved to a local variable before the - bfq_log* call. This makes it easier to read the expression and - use breakpoints to verify it. bfq-mq uses this approach in - several places. - - New bfq_log* macros are provided for the no-logging case. - I touch only the second argument, because current code never - uses the local variable approach with the first or other - arguments. I tried to balance consistency with simplicity. - - 2) For bfq-sq, reduce to zero, the number of instructions executed - when no logging is configured. No sense marshaling arguments - which are never going to be used. - - On a trial V8R11 build, this reduced the size of bfq-iosched.o - by 14.3 KiB. The size went from 70304 to 55664 bytes. - - bfq-mq and kernel.org bfq code size does not change because - existing macros already optimize to zero bytes when not logging. - The current changes maintain consistency with the bfq-sq path - and make the bfq-mq & bfq no-logging paths resistant to future - logging path macro changes which might cause generated code. - - 3) Slightly reduce compile time of all bfq variants by including - blktrace_api.h only when it will be used.
- -Signed-off-by: Lee Tibbert <lee.tibbert@gmail.com> ---- - block/bfq-mq.h | 18 +++++++++++++++++- - block/bfq.h | 18 +++++++++++++++++- - 2 files changed, 34 insertions(+), 2 deletions(-) - -diff --git a/block/bfq-mq.h b/block/bfq-mq.h -index 77ab0f22ed22..7ed2cc29be57 100644 ---- a/block/bfq-mq.h -+++ b/block/bfq-mq.h -@@ -15,7 +15,6 @@ - #ifndef _BFQ_H - #define _BFQ_H - --#include <linux/blktrace_api.h> - #include <linux/hrtimer.h> - #include <linux/blk-cgroup.h> - -@@ -725,6 +724,21 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - ##args) - - #else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ -+ -+#if !defined(CONFIG_BLK_DEV_IO_TRACE) -+ -+/* Avoid possible "unused-variable" warning. See commit message. */ -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) ((void) (bfqq)) -+ -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) ((void) (bfqg)) -+ -+#define bfq_log(bfqd, fmt, args...) do {} while (0) -+ -+#else /* CONFIG_BLK_DEV_IO_TRACE */ -+ -+#include <linux/blktrace_api.h> -+ - #ifdef BFQ_GROUP_IOSCHED_ENABLED - static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); - static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); -@@ -752,6 +766,8 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - - #define bfq_log(bfqd, fmt, args...) \ - blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) -+ -+#endif /* CONFIG_BLK_DEV_IO_TRACE */ - #endif /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ - - /* Expiration reasons. */ -diff --git a/block/bfq.h b/block/bfq.h -index 53954d1b87f8..15d326f466b7 100644 ---- a/block/bfq.h -+++ b/block/bfq.h -@@ -15,7 +15,6 @@ - #ifndef _BFQ_H - #define _BFQ_H - --#include <linux/blktrace_api.h> - #include <linux/hrtimer.h> - #include <linux/blk-cgroup.h> - -@@ -725,6 +724,21 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - ##args) - - #else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ -+ -+#if !defined(CONFIG_BLK_DEV_IO_TRACE) -+ -+/* Avoid possible "unused-variable" warning. See commit message. 
*/ -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) ((void) (bfqq)) -+ -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) ((void) (bfqg)) -+ -+#define bfq_log(bfqd, fmt, args...) do {} while (0) -+ -+#else /* CONFIG_BLK_DEV_IO_TRACE */ -+ -+#include <linux/blktrace_api.h> -+ - #ifdef BFQ_GROUP_IOSCHED_ENABLED - static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); - static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); -@@ -759,6 +773,8 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - - #define bfq_log(bfqd, fmt, args...) \ - blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) -+ -+#endif /* CONFIG_BLK_DEV_IO_TRACE */ - #endif /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ - - /* Expiration reasons. */ - -From f11a0e751e741bf94c6a48234824d50b3c0100ad Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Wed, 9 Aug 2017 16:40:39 +0200 -Subject: [PATCH 36/51] bfq-sq: fix commit "Remove all get and put of I/O - contexts" in branch bfq-mq - -The commit "Remove all get and put of I/O contexts" erroneously removed -the reset of the field in_service_bic for bfq-sq. This commit re-adds -that missing reset. 
- -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-sched.c | 7 +++++++ - block/bfq-sq-iosched.c | 1 + - 2 files changed, 8 insertions(+) - -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index 9c4e6797d8c9..7425824c26b8 100644 ---- a/block/bfq-sched.c -+++ b/block/bfq-sched.c -@@ -1904,6 +1904,13 @@ static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) - struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity; - struct bfq_entity *entity = in_serv_entity; - -+#ifndef BFQ_MQ -+ if (bfqd->in_service_bic) { -+ put_io_context(bfqd->in_service_bic->icq.ioc); -+ bfqd->in_service_bic = NULL; -+ } -+#endif -+ - bfq_clear_bfqq_wait_request(in_serv_bfqq); - hrtimer_try_to_cancel(&bfqd->idle_slice_timer); - bfqd->in_service_queue = NULL; -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index 25da0d1c0622..e1960bf149d8 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -3765,6 +3765,7 @@ static int bfq_dispatch_request(struct bfq_data *bfqd, - if (!bfqd->in_service_bic) { - atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); - bfqd->in_service_bic = RQ_BIC(rq); -+ BUG_ON(!bfqd->in_service_bic); - } - - if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq)) - -From eceae5457530df8598557767d7be258ca9384de4 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Wed, 9 Aug 2017 22:29:01 +0200 -Subject: [PATCH 37/51] bfq-sq-mq: make lookup_next_entity push up vtime on - expirations - -To provide a very smooth service, bfq starts to serve a bfq_queue -only if the queue is 'eligible', i.e., if the same queue would -have started to be served in the ideal, perfectly fair system that -bfq simulates internally. This is obtained by associating each -queue with a virtual start time, and by computing a special system -virtual time quantity: a queue is eligible only if the system -virtual time has reached the virtual start time of the -queue. 
Finally, bfq guarantees that, when a new queue must be set -in service, there is always at least one eligible entity for each -active parent entity in the scheduler. To provide this guarantee, -the function __bfq_lookup_next_entity pushes up, for each parent -entity on which it is invoked, the system virtual time to the -minimum among the virtual start times of the entities in the -active tree for the parent entity (more precisely, the push up -occurs if the system virtual time happens to be lower than all -such virtual start times). - -There is however a circumstance in which __bfq_lookup_next_entity -cannot push up the system virtual time for a parent entity, even -if the system virtual time is lower than the virtual start times -of all the child entities in the active tree. It happens if one of -the child entities is in service. In fact, in such a case, there -is already an eligible entity, the in-service one, even if it may -not be present in the active tree (because in-service entities -may be removed from the active tree). - -Unfortunately, in the last re-design of the -hierarchical-scheduling engine, the reset of the pointer to the -in-service entity for a given parent entity--reset to be done as a -consequence of the expiration of the in-service entity--always -happens after the function __bfq_lookup_next_entity has been -invoked. This causes the function to think that there is still an -entity in service for the parent entity, and then that the system -virtual time cannot be pushed up, even if actually such a -no-more-in-service entity has already been properly reinserted -into the active tree (or in some other tree if no more -active). Yet, the system virtual time *had* to be pushed up, to be -ready to correctly choose the next queue to serve. 
Because of the -lack of this push up, bfq may wrongly set in service a queue that -had been speculatively pre-computed as the possible -next-in-service queue, but that would no more be the one to serve -after the expiration and the reinsertion into the active trees of -the previously in-service entities. - -This commit addresses this issue by making -__bfq_lookup_next_entity properly push up the system virtual time -if an expiration is occurring. - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 4 +-- - block/bfq-sched.c | 77 ++++++++++++++++++++++++++++++++------------------ - block/bfq-sq-iosched.c | 4 +-- - 3 files changed, 53 insertions(+), 32 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 49ffca1ad6e7..b5c848650375 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -682,7 +682,7 @@ static void bfq_updated_next_req(struct bfq_data *bfqd, - entity->budget = new_budget; - bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", - new_budget); -- bfq_requeue_bfqq(bfqd, bfqq); -+ bfq_requeue_bfqq(bfqd, bfqq, false); - } - } - -@@ -2822,7 +2822,7 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) - - bfq_del_bfqq_busy(bfqd, bfqq, true); - } else { -- bfq_requeue_bfqq(bfqd, bfqq); -+ bfq_requeue_bfqq(bfqd, bfqq, true); - /* - * Resort priority tree of potential close cooperators. 
- */ -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index 7425824c26b8..f3001af37256 100644 ---- a/block/bfq-sched.c -+++ b/block/bfq-sched.c -@@ -33,7 +33,8 @@ static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree) - return rb_entry(node, struct bfq_entity, rb_node); - } - --static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd); -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, -+ bool expiration); - - static bool bfq_update_parent_budget(struct bfq_entity *next_in_service); - -@@ -43,6 +44,8 @@ static bool bfq_update_parent_budget(struct bfq_entity *next_in_service); - * @new_entity: if not NULL, pointer to the entity whose activation, - * requeueing or repositionig triggered the invocation of - * this function. -+ * @expiration: id true, this function is being invoked after the -+ * expiration of the in-service entity - * - * This function is called to update sd->next_in_service, which, in - * its turn, may change as a consequence of the insertion or -@@ -61,7 +64,8 @@ static bool bfq_update_parent_budget(struct bfq_entity *next_in_service); - * entity. 
- */ - static bool bfq_update_next_in_service(struct bfq_sched_data *sd, -- struct bfq_entity *new_entity) -+ struct bfq_entity *new_entity, -+ bool expiration) - { - struct bfq_entity *next_in_service = sd->next_in_service; - struct bfq_queue *bfqq; -@@ -120,7 +124,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, - if (replace_next) - next_in_service = new_entity; - } else /* invoked because of a deactivation: lookup needed */ -- next_in_service = bfq_lookup_next_entity(sd); -+ next_in_service = bfq_lookup_next_entity(sd, expiration); - - if (next_in_service) { - parent_sched_may_change = !sd->next_in_service || -@@ -1291,10 +1295,12 @@ static void __bfq_activate_requeue_entity(struct bfq_entity *entity, - * @requeue: true if this is a requeue, which implies that bfqq is - * being expired; thus ALL its ancestors stop being served and must - * therefore be requeued -+ * @expiration: true if this function is being invoked in the expiration path -+ * of the in-service queue - */ - static void bfq_activate_requeue_entity(struct bfq_entity *entity, - bool non_blocking_wait_rq, -- bool requeue) -+ bool requeue, bool expiration) - { - struct bfq_sched_data *sd; - -@@ -1307,7 +1313,8 @@ static void bfq_activate_requeue_entity(struct bfq_entity *entity, - RB_EMPTY_ROOT(&(sd->service_tree+1)->active) && - RB_EMPTY_ROOT(&(sd->service_tree+2)->active)); - -- if (!bfq_update_next_in_service(sd, entity) && !requeue) { -+ if (!bfq_update_next_in_service(sd, entity, expiration) && -+ !requeue) { - BUG_ON(!sd->next_in_service); - break; - } -@@ -1373,6 +1380,8 @@ static bool __bfq_deactivate_entity(struct bfq_entity *entity, - * bfq_deactivate_entity - deactivate an entity representing a bfq_queue. - * @entity: the entity to deactivate. 
- * @ins_into_idle_tree: true if the entity can be put into the idle tree -+ * @expiration: true if this function is being invoked in the expiration path -+ * of the in-service queue - */ - static void bfq_deactivate_entity(struct bfq_entity *entity, - bool ins_into_idle_tree, -@@ -1417,7 +1426,7 @@ static void bfq_deactivate_entity(struct bfq_entity *entity, - * then, since entity has just been - * deactivated, a new one must be found. - */ -- bfq_update_next_in_service(sd, NULL); -+ bfq_update_next_in_service(sd, NULL, expiration); - - if (sd->next_in_service || sd->in_service_entity) { - /* -@@ -1495,7 +1504,7 @@ static void bfq_deactivate_entity(struct bfq_entity *entity, - "invoking udpdate_next for this entity"); - } - #endif -- if (!bfq_update_next_in_service(sd, entity) && -+ if (!bfq_update_next_in_service(sd, entity, expiration) && - !expiration) - /* - * next_in_service unchanged or not causing -@@ -1524,7 +1533,7 @@ static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st) - if (bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, - "calc_vtime_jump: new value %llu", -- root_entity->min_start); -+ ((root_entity->min_start>>10)*1000)>>12); - #ifdef BFQ_GROUP_IOSCHED_ENABLED - else { - struct bfq_group *bfqg = -@@ -1533,7 +1542,7 @@ static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st) - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, - "calc_vtime_jump: new value %llu", -- root_entity->min_start); -+ ((root_entity->min_start>>10)*1000)>>12); - } - #endif - return root_entity->min_start; -@@ -1615,17 +1624,9 @@ static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st, - * 3) is idle. 
- */ - static struct bfq_entity * --__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service --#if 0 -- , bool force --#endif -- ) -+__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service) - { -- struct bfq_entity *entity --#if 0 -- , *new_next_in_service = NULL --#endif -- ; -+ struct bfq_entity *entity; - u64 new_vtime; - struct bfq_queue *bfqq; - -@@ -1667,8 +1668,9 @@ __bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -- "__lookup_next: start %llu vtime %llu st %p", -+ "__lookup_next: start %llu vtime %llu (%llu) st %p", - ((entity->start>>10)*1000)>>12, -+ ((st->vtime>>10)*1000)>>12, - ((new_vtime>>10)*1000)>>12, st); - } - #endif -@@ -1681,12 +1683,14 @@ __bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service - /** - * bfq_lookup_next_entity - return the first eligible entity in @sd. - * @sd: the sched_data. -+ * @expiration: true if we are on the expiration path of the in-service queue - * - * This function is invoked when there has been a change in the trees -- * for sd, and we need know what is the new next entity after this -- * change. -+ * for sd, and we need to know what is the new next entity to serve -+ * after this change. - */ --static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd) -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, -+ bool expiration) - { - struct bfq_service_tree *st = sd->service_tree; - struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1); -@@ -1716,8 +1720,24 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd) - * class, unless the idle class needs to be served. - */ - for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) { -+ /* -+ * If expiration is true, then bfq_lookup_next_entity -+ * is being invoked as a part of the expiration path -+ * of the in-service queue. 
In this case, even if -+ * sd->in_service_entity is not NULL, -+ * sd->in_service_entiy at this point is actually not -+ * in service any more, and, if needed, has already -+ * been properly queued or requeued into the right -+ * tree. The reason why sd->in_service_entity is still -+ * not NULL here, even if expiration is true, is that -+ * sd->in_service_entiy is reset as a last step in the -+ * expiration path. So, if expiration is true, tell -+ * __bfq_lookup_next_entity that there is no -+ * sd->in_service_entity. -+ */ - entity = __bfq_lookup_next_entity(st + class_idx, -- sd->in_service_entity); -+ sd->in_service_entity && -+ !expiration); - - if (entity) - break; -@@ -1891,7 +1911,7 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) - for_each_entity(entity) { - struct bfq_sched_data *sd = entity->sched_data; - -- if(!bfq_update_next_in_service(sd, NULL)) -+ if (!bfq_update_next_in_service(sd, NULL, false)) - break; - } - -@@ -1951,16 +1971,17 @@ static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) - entity->on_st); - - bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq), -- false); -+ false, false); - bfq_clear_bfqq_non_blocking_wait_rq(bfqq); - } - --static void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+static void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ bool expiration) - { - struct bfq_entity *entity = &bfqq->entity; - - bfq_activate_requeue_entity(entity, false, -- bfqq == bfqd->in_service_queue); -+ bfqq == bfqd->in_service_queue, expiration); - } - - static void bfqg_stats_update_dequeue(struct bfq_group *bfqg); -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index e1960bf149d8..42393ab889a9 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -644,7 +644,7 @@ static void bfq_updated_next_req(struct bfq_data *bfqd, - entity->budget = new_budget; - bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", 
- new_budget); -- bfq_requeue_bfqq(bfqd, bfqq); -+ bfq_requeue_bfqq(bfqd, bfqq, false); - } - } - -@@ -2715,7 +2715,7 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) - - bfq_del_bfqq_busy(bfqd, bfqq, true); - } else { -- bfq_requeue_bfqq(bfqd, bfqq); -+ bfq_requeue_bfqq(bfqd, bfqq, true); - /* - * Resort priority tree of potential close cooperators. - */ - -From ee9f95b24e1d88ffba4845981c2a4684aefd0245 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Wed, 9 Aug 2017 22:53:00 +0200 -Subject: [PATCH 38/51] bfq-sq-mq: remove direct switch to an entity in higher - class - -If the function bfq_update_next_in_service is invoked as a consequence -of the activation or requeueing of an entity, say E, and finds out -that E belongs to a higher-priority class than that of the current -next-in-service entity, then it sets next_in_service directly to -E. But this may lead to anomalous schedules, because E may happen not -be eligible for service, because its virtual start time is higher than -the system virtual time for its service tree. - -This commit addresses this issue by simply removing this direct -switch. - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-sched.c | 19 +++++-------------- - 1 file changed, 5 insertions(+), 14 deletions(-) - -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index f3001af37256..b1a59088db88 100644 ---- a/block/bfq-sched.c -+++ b/block/bfq-sched.c -@@ -76,9 +76,8 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, - * or repositiong of an entity that does not coincide with - * sd->next_in_service, then a full lookup in the active tree - * can be avoided. 
In fact, it is enough to check whether the -- * just-modified entity has a higher priority than -- * sd->next_in_service, or, even if it has the same priority -- * as sd->next_in_service, is eligible and has a lower virtual -+ * just-modified entity has the same priority as -+ * sd->next_in_service, is eligible and has a lower virtual - * finish time than sd->next_in_service. If this compound - * condition holds, then the new entity becomes the new - * next_in_service. Otherwise no change is needed. -@@ -94,9 +93,8 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, - - /* - * If there is already a next_in_service candidate -- * entity, then compare class priorities or timestamps -- * to decide whether to replace sd->service_tree with -- * new_entity. -+ * entity, then compare timestamps to decide whether -+ * to replace sd->service_tree with new_entity. - */ - if (next_in_service) { - unsigned int new_entity_class_idx = -@@ -104,10 +102,6 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, - struct bfq_service_tree *st = - sd->service_tree + new_entity_class_idx; - -- /* -- * For efficiency, evaluate the most likely -- * sub-condition first. 
-- */ - replace_next = - (new_entity_class_idx == - bfq_class_idx(next_in_service) -@@ -115,10 +109,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, - !bfq_gt(new_entity->start, st->vtime) - && - bfq_gt(next_in_service->finish, -- new_entity->finish)) -- || -- new_entity_class_idx < -- bfq_class_idx(next_in_service); -+ new_entity->finish)); - } - - if (replace_next) - -From a3fdc5af40537355b68c1f0d3997c5a5fb54b9ce Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Thu, 10 Aug 2017 08:15:50 +0200 -Subject: [PATCH 39/51] bfq-sq-mq: guarantee update_next_in_service always - returns an eligible entity - -If the function bfq_update_next_in_service is invoked as a consequence -of the activation or requeueing of an entity, say E, then it doesn't -invoke bfq_lookup_next_entity to get the next-in-service entity. In -contrast, it follows a shorter path: if E happens to be eligible (see -commit "bfq-sq-mq: make lookup_next_entity push up vtime on -expirations" for details on eligibility) and to have a lower virtual -finish time than the current candidate as next-in-service entity, then -E directly becomes the next-in-service entity. Unfortunately, there is -a corner case for which this shorter path makes -bfq_update_next_in_service choose a non eligible entity: it occurs if -both E and the current next-in-service entity happen to be non -eligible when bfq_update_next_in_service is invoked. In this case, E -is not set as next-in-service, and, since bfq_lookup_next_entity is -not invoked, the state of the parent entity is not updated so as to -end up with an eligible entity as the proper next-in-service entity. 
- -In this respect, next-in-service is actually allowed to be non -eligible while some queue is in service: since no system-virtual-time -push-up can be performed in that case (see again commit "bfq-sq-mq: -make lookup_next_entity push up vtime on expirations" for details), -next-in-service is chosen, speculatively, as a function of the -possible value that the system virtual time may get after a push -up. But the correctness of the schedule breaks if next-in-service is -still a non eligible entity when it is time to set in service the next -entity. Unfortunately, this may happen in the above corner case. - -This commit fixes this problem by making bfq_update_next_in_service -invoke bfq_lookup_next_entity not only if the above shorter path -cannot be taken, but also if the shorter path is taken but fails to -yield an eligible next-in-service entity. - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-sched.c | 38 ++++++++++++++++++++++++++++---------- - 1 file changed, 28 insertions(+), 10 deletions(-) - -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index b1a59088db88..e4a2553a2d2c 100644 ---- a/block/bfq-sched.c -+++ b/block/bfq-sched.c -@@ -70,6 +70,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, - struct bfq_entity *next_in_service = sd->next_in_service; - struct bfq_queue *bfqq; - bool parent_sched_may_change = false; -+ bool change_without_lookup = false; - - /* - * If this update is triggered by the activation, requeueing -@@ -89,7 +90,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, - * set to true, and left as true if - * sd->next_in_service is NULL. 
- */ -- bool replace_next = true; -+ change_without_lookup = true; - - /* - * If there is already a next_in_service candidate -@@ -102,7 +103,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, - struct bfq_service_tree *st = - sd->service_tree + new_entity_class_idx; - -- replace_next = -+ change_without_lookup = - (new_entity_class_idx == - bfq_class_idx(next_in_service) - && -@@ -112,15 +113,32 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, - new_entity->finish)); - } - -- if (replace_next) -+ if (change_without_lookup) { - next_in_service = new_entity; -- } else /* invoked because of a deactivation: lookup needed */ -+ bfqq = bfq_entity_to_bfqq(next_in_service); -+ -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "update_next_in_service: chose without lookup"); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ else { -+ struct bfq_group *bfqg = -+ container_of(next_in_service, -+ struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data*)bfqg->bfqd, bfqg, -+ "update_next_in_service: chose without lookup"); -+ } -+#endif -+ } -+ } -+ -+ if (!change_without_lookup) /* lookup needed */ - next_in_service = bfq_lookup_next_entity(sd, expiration); - -- if (next_in_service) { -+ if (next_in_service) - parent_sched_may_change = !sd->next_in_service || - bfq_update_parent_budget(next_in_service); -- } - - sd->next_in_service = next_in_service; - -@@ -1053,7 +1071,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, - - if (bfqq) { - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "__activate_entity: new queue finish %llu", -+ "update_fin_time_enqueue: new queue finish %llu", - ((entity->finish>>10)*1000)>>12); - #ifdef BFQ_GROUP_IOSCHED_ENABLED - } else { -@@ -1061,7 +1079,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -- "__activate_entity: new group finish %llu", -+ "update_fin_time_enqueue: new group 
finish %llu", - ((entity->finish>>10)*1000)>>12); - #endif - } -@@ -1071,7 +1089,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, - - if (bfqq) { - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "__activate_entity: queue %seligible in st %p", -+ "update_fin_time_enqueue: queue %seligible in st %p", - entity->start <= st->vtime ? "" : "non ", st); - #ifdef BFQ_GROUP_IOSCHED_ENABLED - } else { -@@ -1079,7 +1097,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -- "__activate_entity: group %seligible in st %p", -+ "update_fin_time_enqueue: group %seligible in st %p", - entity->start <= st->vtime ? "" : "non ", st); - #endif - } - -From 6565e4d1aac029b6f0a5d86a4c6ef38608838eac Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Thu, 31 Aug 2017 19:24:26 +0200 -Subject: [PATCH 40/51] doc, block, bfq: fix some typos and stale sentences - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> -Reviewed-by: Jeremy Hickman <jeremywh7@gmail.com> -Reviewed-by: Laurentiu Nicola <lnicola@dend.ro> ---- - Documentation/block/bfq-iosched.txt | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt -index 0e59f1c9d30e..dcfe15523da3 100644 ---- a/Documentation/block/bfq-iosched.txt -+++ b/Documentation/block/bfq-iosched.txt -@@ -17,7 +17,7 @@ instances of BFQ are available (otherwise only the first instance): - - bfq-mq: development version of BFQ for blk-mq; this version contains - also all latest features and fixes not yet landed in mainline, plus many - safety checks --- bfq: BFQ for legacy blk; also this version contains latest features -+- bfq-sq: BFQ for legacy blk; also this version contains latest features - and fixes, as well as safety checks - - In its default configuration, BFQ privileges latency over - -From 
261ee8cc9f43e03d790a07184f0bcaa504ee6737 Mon Sep 17 00:00:00 2001 -From: Luca Miccio <lucmiccio@gmail.com> -Date: Wed, 13 Sep 2017 12:03:56 +0200 -Subject: [PATCH 41/51] bfq-mq, bfq-sq: Disable writeback throttling - -Similarly to CFQ, BFQ has its write-throttling heuristics, and it -is better not to combine them with further write-throttling -heuristics of a different nature. -So this commit disables write-back throttling for a device if BFQ -is used as I/O scheduler for that device. - -Signed-off-by: Luca Miccio <lucmiccio@gmail.com> -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> -Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name> ---- - block/bfq-mq-iosched.c | 2 ++ - block/bfq-sq-iosched.c | 7 +++++++ - 2 files changed, 9 insertions(+) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index b5c848650375..7d27d5b3befb 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -89,6 +89,7 @@ - #include "blk-mq-tag.h" - #include "blk-mq-sched.h" - #include "bfq-mq.h" -+#include "blk-wbt.h" - - /* Expiration time of sync (0) and async (1) requests, in ns. */ - static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; -@@ -5260,6 +5261,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - bfq_init_root_group(bfqd->root_group, bfqd); - bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); - -+ wbt_disable_default(q); - return 0; - - out_free: -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index 42393ab889a9..6fdc3b1d5bb8 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -83,6 +83,7 @@ - #include <linux/ioprio.h> - #include "blk.h" - #include "bfq.h" -+#include "blk-wbt.h" - - /* Expiration time of sync (0) and async (1) requests, in ns. 
*/ - static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; -@@ -4976,6 +4977,11 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - return -ENOMEM; - } - -+static void bfq_registered_queue(struct request_queue *q) -+{ -+ wbt_disable_default(q); -+} -+ - static void bfq_slab_kill(void) - { - kmem_cache_destroy(bfq_pool); -@@ -5285,6 +5291,7 @@ static struct elevator_type iosched_bfq = { - .elevator_may_queue_fn = bfq_may_queue, - .elevator_init_fn = bfq_init_queue, - .elevator_exit_fn = bfq_exit_queue, -+ .elevator_registered_fn = bfq_registered_queue, - }, - .icq_size = sizeof(struct bfq_io_cq), - .icq_align = __alignof__(struct bfq_io_cq), - -From 40ea0aed088791da27fcfa51f3b64d1f96b0d06e Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Tue, 12 Sep 2017 16:45:53 +0200 -Subject: [PATCH 42/51] bfq-mq, bfq-sq: fix wrong init of saved start time for - weight raising - -This commit fixes a bug that causes bfq to fail to guarantee a high -responsiveness on some drives, if there is heavy random read+write I/O -in the background. More precisely, such a failure allowed this bug to -be found [1], but the bug may well cause other yet unreported -anomalies. - -BFQ raises the weight of the bfq_queues associated with soft real-time -applications, to privilege the I/O, and thus reduce latency, for these -applications. This mechanism is named soft-real-time weight raising in -BFQ. A soft real-time period may happen to be nested into an -interactive weight raising period, i.e., it may happen that, when a -bfq_queue switches to a soft real-time weight-raised state, the -bfq_queue is already being weight-raised because deemed interactive -too. In this case, BFQ saves in a special variable -wr_start_at_switch_to_srt, the time instant when the interactive -weight-raising period started for the bfq_queue, i.e., the time -instant when BFQ started to deem the bfq_queue interactive. 
This value -is then used to check whether the interactive weight-raising period -would still be in progress when the soft real-time weight-raising -period ends. If so, interactive weight raising is restored for the -bfq_queue. This restore is useful, in particular, because it prevents -bfq_queues from losing their interactive weight raising prematurely, -as a consequence of spurious, short-lived soft real-time -weight-raising periods caused by wrong detections as soft real-time. - -If, instead, a bfq_queue switches to soft-real-time weight raising -while it *is not* already in an interactive weight-raising period, -then the variable wr_start_at_switch_to_srt has no meaning during the -following soft real-time weight-raising period. Unfortunately the -handling of this case is wrong in BFQ: not only is the variable not -flagged somehow as meaningless, but it is also set to the time when -the switch to soft real-time weight-raising occurs. This may cause an -interactive weight-raising period to be considered mistakenly as still -in progress, and thus a spurious interactive weight-raising period to -start for the bfq_queue, at the end of the soft-real-time -weight-raising period. In particular the spurious interactive -weight-raising period will be considered as still in progress, if the -soft-real-time weight-raising period does not last very long. The -bfq_queue will then be wrongly privileged and, if I/O bound, will -unjustly steal bandwidth from truly interactive or soft real-time -bfq_queues, harming responsiveness and low latency. - -This commit fixes this issue by just setting wr_start_at_switch_to_srt -to minus infinity (farthest past time instant according to jiffies -macros): when the soft-real-time weight-raising period ends, certainly -no interactive weight-raising period will be considered as still in -progress. 
- -[1] Background I/O Type: Random - Background I/O mix: Reads and writes -- Application to start: LibreOffice Writer in -http://www.phoronix.com/scan.php?page=news_item&px=Linux-4.13-IO-Laptop - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> -Signed-off-by: Angelo Ruocco <angeloruocco90@gmail.com> -Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name> -Tested-by: Lee Tibbert <lee.tibbert@gmail.com> -Tested-by: Mirko Montanari <mirkomontanari91@gmail.com> ---- - block/bfq-mq-iosched.c | 50 +++++++++++++++++++++++++++++++------------------- - block/bfq-sq-iosched.c | 50 +++++++++++++++++++++++++++++++------------------- - 2 files changed, 62 insertions(+), 38 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 7d27d5b3befb..f378519b6d33 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -1204,6 +1204,24 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, - return wr_or_deserves_wr; - } - -+/* -+ * Return the farthest future time instant according to jiffies -+ * macros. -+ */ -+static unsigned long bfq_greatest_from_now(void) -+{ -+ return jiffies + MAX_JIFFY_OFFSET; -+} -+ -+/* -+ * Return the farthest past time instant according to jiffies -+ * macros. 
-+ */ -+static unsigned long bfq_smallest_from_now(void) -+{ -+ return jiffies - MAX_JIFFY_OFFSET; -+} -+ - static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - unsigned int old_wr_coeff, -@@ -1218,7 +1236,19 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, - bfqq->wr_coeff = bfqd->bfq_wr_coeff; - bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); - } else { -- bfqq->wr_start_at_switch_to_srt = jiffies; -+ /* -+ * No interactive weight raising in progress -+ * here: assign minus infinity to -+ * wr_start_at_switch_to_srt, to make sure -+ * that, at the end of the soft-real-time -+ * weight raising periods that is starting -+ * now, no interactive weight-raising period -+ * may be wrongly considered as still in -+ * progress (and thus actually started by -+ * mistake). -+ */ -+ bfqq->wr_start_at_switch_to_srt = -+ bfq_smallest_from_now(); - bfqq->wr_coeff = bfqd->bfq_wr_coeff * - BFQ_SOFTRT_WEIGHT_FACTOR; - bfqq->wr_cur_max_time = -@@ -3174,24 +3204,6 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, - jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); - } - --/* -- * Return the farthest future time instant according to jiffies -- * macros. -- */ --static unsigned long bfq_greatest_from_now(void) --{ -- return jiffies + MAX_JIFFY_OFFSET; --} -- --/* -- * Return the farthest past time instant according to jiffies -- * macros. -- */ --static unsigned long bfq_smallest_from_now(void) --{ -- return jiffies - MAX_JIFFY_OFFSET; --} -- - /** - * bfq_bfqq_expire - expire a queue. - * @bfqd: device owning the queue. 
-diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index 6fdc3b1d5bb8..f4654436cd55 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -1165,6 +1165,24 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, - return wr_or_deserves_wr; - } - -+/* -+ * Return the farthest future time instant according to jiffies -+ * macros. -+ */ -+static unsigned long bfq_greatest_from_now(void) -+{ -+ return jiffies + MAX_JIFFY_OFFSET; -+} -+ -+/* -+ * Return the farthest past time instant according to jiffies -+ * macros. -+ */ -+static unsigned long bfq_smallest_from_now(void) -+{ -+ return jiffies - MAX_JIFFY_OFFSET; -+} -+ - static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - unsigned int old_wr_coeff, -@@ -1179,7 +1197,19 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, - bfqq->wr_coeff = bfqd->bfq_wr_coeff; - bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); - } else { -- bfqq->wr_start_at_switch_to_srt = jiffies; -+ /* -+ * No interactive weight raising in progress -+ * here: assign minus infinity to -+ * wr_start_at_switch_to_srt, to make sure -+ * that, at the end of the soft-real-time -+ * weight raising periods that is starting -+ * now, no interactive weight-raising period -+ * may be wrongly considered as still in -+ * progress (and thus actually started by -+ * mistake). -+ */ -+ bfqq->wr_start_at_switch_to_srt = -+ bfq_smallest_from_now(); - bfqq->wr_coeff = bfqd->bfq_wr_coeff * - BFQ_SOFTRT_WEIGHT_FACTOR; - bfqq->wr_cur_max_time = -@@ -3067,24 +3097,6 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, - jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); - } - --/* -- * Return the farthest future time instant according to jiffies -- * macros. 
-- */ --static unsigned long bfq_greatest_from_now(void) --{ -- return jiffies + MAX_JIFFY_OFFSET; --} -- --/* -- * Return the farthest past time instant according to jiffies -- * macros. -- */ --static unsigned long bfq_smallest_from_now(void) --{ -- return jiffies - MAX_JIFFY_OFFSET; --} -- - /** - * bfq_bfqq_expire - expire a queue. - * @bfqd: device owning the queue. - -From 9dbea44b6f721baeff35b9fdf628ec55fe00e09d Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Thu, 14 Sep 2017 05:12:58 -0400 -Subject: [PATCH 43/51] Fix commit "Unnest request-queue and ioc locks from - scheduler locks" - -The commit "Unnest request-queue and ioc locks from scheduler locks" -mistakenly removed the setting of the split flag in function -bfq_prepare_request. This commit puts this missing instruction back in -its place. - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 12 ++++++++++++ - 1 file changed, 12 insertions(+) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index f378519b6d33..288078e68a2a 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -744,6 +744,12 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, - bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time; - BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); - -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "[%s] bic %p wr_coeff %d start_finish %lu max_time %lu", -+ __func__, -+ bic, bfqq->wr_coeff, bfqq->last_wr_start_finish, -+ bfqq->wr_cur_max_time); -+ - if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || - time_is_before_jiffies(bfqq->last_wr_start_finish + - bfqq->wr_cur_max_time))) { -@@ -2208,6 +2214,11 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) - bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; - bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; - BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "[%s] 
bic %p wr_coeff %d start_finish %lu max_time %lu", -+ __func__, -+ bic, bfqq->wr_coeff, bfqq->last_wr_start_finish, -+ bfqq->wr_cur_max_time); - } - - static void -@@ -4950,6 +4961,7 @@ static void bfq_prepare_request(struct request *rq, struct bio *bio) - bic->saved_in_large_burst = true; - - bfqq = bfq_split_bfqq(bic, bfqq); -+ split = true; - - if (!bfqq) - bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, - -From d4ebb2a66a23dc183792088c521f2be2193b56db Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Fri, 15 Sep 2017 01:53:51 -0400 -Subject: [PATCH 44/51] bfq-sq, bfq-mq: check and switch back to interactive wr - also on queue split - -As already explained in the message of commit "bfq-mq, bfq-sq: fix -wrong init of saved start time for weight raising", if a soft -real-time weight-raising period happens to be nested in a larger -interactive weight-raising period, then BFQ restores the interactive -weight raising at the end of the soft real-time weight raising. In -particular, BFQ checks whether the latter has ended only on request -dispatches. - -Unfortunately, the above scheme fails to restore interactive weight -raising in the following corner case: if a bfq_queue, say Q, -1) Is merged with another bfq_queue while it is in a nested soft -real-time weight-raising period. The weight-raising state of Q is -then saved, and not considered any longer until a split occurs. -2) Is split from the other bfq_queue(s) at a time instant when its -soft real-time weight raising is already finished. -On the split, while resuming the previous, soft real-time -weight-raised state of the bfq_queue Q, BFQ checks whether the -current soft real-time weight-raising period is actually over. If so, -BFQ switches weight raising off for Q, *without* checking whether the -soft real-time period was actually nested in a non-yet-finished -interactive weight-raising period. 
- -This commit addresses this issue by adding the above missing check in -bfq_queue splits, and restoring interactive weight raising if needed. - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> -Tested-by: Angelo Ruocco <angeloruocco90@gmail.com> -Tested-by: Mirko Montanari <mirkomontanari91@gmail.com> ---- - block/bfq-mq-iosched.c | 29 +++++++++++++++++++++-------- - block/bfq-sq-iosched.c | 35 +++++++++++++++++++++++++++-------- - 2 files changed, 48 insertions(+), 16 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 288078e68a2a..6130a95c6497 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -716,6 +716,15 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd) - return dur; - } - -+/* switch back from soft real-time to interactive weight raising */ -+static void switch_back_to_interactive_wr(struct bfq_queue *bfqq, -+ struct bfq_data *bfqd) -+{ -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ bfqq->last_wr_start_finish = bfqq->wr_start_at_switch_to_srt; -+} -+ - static void - bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, - struct bfq_io_cq *bic, bool bfq_already_existing) -@@ -753,12 +762,20 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, - if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || - time_is_before_jiffies(bfqq->last_wr_start_finish + - bfqq->wr_cur_max_time))) { -- bfq_log_bfqq(bfqq->bfqd, bfqq, -+ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && -+ !bfq_bfqq_in_large_burst(bfqq) && -+ time_is_after_eq_jiffies(bfqq->wr_start_at_switch_to_srt + -+ bfq_wr_duration(bfqd))) { -+ switch_back_to_interactive_wr(bfqq, bfqd); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "resume state: switching back to interactive"); -+ } else { -+ bfqq->wr_coeff = 1; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, - "resume state: switching off wr (%lu + %lu < %lu)", - bfqq->last_wr_start_finish, bfqq->wr_cur_max_time, - 
jiffies); -- -- bfqq->wr_coeff = 1; -+ } - } - - /* make sure weight will be updated, however we got here */ -@@ -3820,11 +3837,7 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) - bfq_wr_duration(bfqd))) - bfq_bfqq_end_wr(bfqq); - else { -- /* switch back to interactive wr */ -- bfqq->wr_coeff = bfqd->bfq_wr_coeff; -- bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -- bfqq->last_wr_start_finish = -- bfqq->wr_start_at_switch_to_srt; -+ switch_back_to_interactive_wr(bfqq, bfqd); - BUG_ON(time_is_after_jiffies( - bfqq->last_wr_start_finish)); - bfqq->entity.prio_changed = 1; -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index f4654436cd55..e07d5d1c0d40 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -678,6 +678,15 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd) - return dur; - } - -+/* switch back from soft real-time to interactive weight raising */ -+static void switch_back_to_interactive_wr(struct bfq_queue *bfqq, -+ struct bfq_data *bfqd) -+{ -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ bfqq->last_wr_start_finish = bfqq->wr_start_at_switch_to_srt; -+} -+ - static void - bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, - struct bfq_io_cq *bic, bool bfq_already_existing) -@@ -705,15 +714,29 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, - bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time; - BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); - -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "[%s] bic %p wr_coeff %d start_finish %lu max_time %lu", -+ __func__, -+ bic, bfqq->wr_coeff, bfqq->last_wr_start_finish, -+ bfqq->wr_cur_max_time); -+ - if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || - time_is_before_jiffies(bfqq->last_wr_start_finish + - bfqq->wr_cur_max_time))) { -- bfq_log_bfqq(bfqq->bfqd, bfqq, -+ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && -+ 
!bfq_bfqq_in_large_burst(bfqq) && -+ time_is_after_eq_jiffies(bfqq->wr_start_at_switch_to_srt + -+ bfq_wr_duration(bfqd))) { -+ switch_back_to_interactive_wr(bfqq, bfqd); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "resume state: switching back to interactive"); -+ } else { -+ bfqq->wr_coeff = 1; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, - "resume state: switching off wr (%lu + %lu < %lu)", - bfqq->last_wr_start_finish, bfqq->wr_cur_max_time, - jiffies); -- -- bfqq->wr_coeff = 1; -+ } - } - - /* make sure weight will be updated, however we got here */ -@@ -3703,11 +3726,7 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) - bfq_wr_duration(bfqd))) - bfq_bfqq_end_wr(bfqq); - else { -- /* switch back to interactive wr */ -- bfqq->wr_coeff = bfqd->bfq_wr_coeff; -- bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -- bfqq->last_wr_start_finish = -- bfqq->wr_start_at_switch_to_srt; -+ switch_back_to_interactive_wr(bfqq, bfqd); - BUG_ON(time_is_after_jiffies( - bfqq->last_wr_start_finish)); - bfqq->entity.prio_changed = 1; - -From 9eaec0c3a2d675763b09da81c9117a9c43bce942 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Fri, 15 Sep 2017 04:58:33 -0400 -Subject: [PATCH 45/51] bfq-sq, bfq-mq: let early-merged queues be - weight-raised on split too - -A just-created bfq_queue, say Q, may happen to be merged with another -bfq_queue on the very first invocation of the function -__bfq_insert_request. In such a case, even if Q would clearly deserve -interactive weight raising (as it has just been created), the function -bfq_add_request does not make it to be invoked for Q, and thus to -activate weight raising for Q. As a consequence, when the state of Q -is saved for a possible future restore, after a split of Q from the -other bfq_queue(s), such a state happens to be (unjustly) -non-weight-raised. 
Then the bfq_queue will not enjoy any weight -raising on the split, even if should still be in an interactive -weight-raising period when the split occurs. - -This commit solves this problem as follows, for a just-created -bfq_queue that is being early-merged: it stores directly, in the saved -state of the bfq_queue, the weight-raising state that would have been -assigned to the bfq_queue if not early-merged. - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> -Tested-by: Angelo Ruocco <angeloruocco90@gmail.com> -Tested-by: Mirko Montanari <mirkomontanari91@gmail.com> ---- - block/bfq-mq-iosched.c | 28 +++++++++++++++++++++++----- - block/bfq-sq-iosched.c | 28 +++++++++++++++++++++++----- - 2 files changed, 46 insertions(+), 10 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 6130a95c6497..af84e506e897 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -2226,10 +2226,27 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) - bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); - bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); - bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); -- bic->saved_wr_coeff = bfqq->wr_coeff; -- bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; -- bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; -- bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; -+ if (unlikely(bfq_bfqq_just_created(bfqq) && -+ !bfq_bfqq_in_large_burst(bfqq))) { -+ /* -+ * bfqq being merged ritgh after being created: bfqq -+ * would have deserved interactive weight raising, but -+ * did not make it to be set in a weight-raised state, -+ * because of this early merge. Store directly the -+ * weight-raising state that would have been assigned -+ * to bfqq, so that to avoid that bfqq unjustly fails -+ * to enjoy weight raising if split soon. 
-+ */ -+ bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff; -+ bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd); -+ bic->saved_last_wr_start_finish = jiffies; -+ } else { -+ bic->saved_wr_coeff = bfqq->wr_coeff; -+ bic->saved_wr_start_at_switch_to_srt = -+ bfqq->wr_start_at_switch_to_srt; -+ bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; -+ bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; -+ } - BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); - bfq_log_bfqq(bfqq->bfqd, bfqq, - "[%s] bic %p wr_coeff %d start_finish %lu max_time %lu", -@@ -4560,7 +4577,6 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) - bfqq->allocated); - - new_bfqq->ref++; -- bfq_clear_bfqq_just_created(bfqq); - /* - * If the bic associated with the process - * issuing this request still points to bfqq -@@ -4572,6 +4588,8 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) - if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) - bfq_merge_bfqqs(bfqd, RQ_BIC(rq), - bfqq, new_bfqq); -+ -+ bfq_clear_bfqq_just_created(bfqq); - /* - * rq is about to be enqueued into new_bfqq, - * release rq reference on bfqq -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index e07d5d1c0d40..0c48f527fe3f 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -2105,10 +2105,27 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) - bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); - bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); - bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); -- bic->saved_wr_coeff = bfqq->wr_coeff; -- bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; -- bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; -- bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; -+ if (unlikely(bfq_bfqq_just_created(bfqq) && -+ !bfq_bfqq_in_large_burst(bfqq))) { -+ /* -+ * bfqq being merged ritgh after being created: bfqq -+ * would have deserved 
interactive weight raising, but -+ * did not make it to be set in a weight-raised state, -+ * because of this early merge. Store directly the -+ * weight-raising state that would have been assigned -+ * to bfqq, so that to avoid that bfqq unjustly fails -+ * to enjoy weight raising if split soon. -+ */ -+ bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff; -+ bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd); -+ bic->saved_last_wr_start_finish = jiffies; -+ } else { -+ bic->saved_wr_coeff = bfqq->wr_coeff; -+ bic->saved_wr_start_at_switch_to_srt = -+ bfqq->wr_start_at_switch_to_srt; -+ bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; -+ bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; -+ } - BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); - } - -@@ -4383,10 +4400,11 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq) - new_bfqq->allocated[rq_data_dir(rq)]++; - bfqq->allocated[rq_data_dir(rq)]--; - new_bfqq->ref++; -- bfq_clear_bfqq_just_created(bfqq); - if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) - bfq_merge_bfqqs(bfqd, RQ_BIC(rq), - bfqq, new_bfqq); -+ -+ bfq_clear_bfqq_just_created(bfqq); - /* - * rq is about to be enqueued into new_bfqq, - * release rq reference on bfqq - -From cb05150675095cb97ab22e4955eb82e4fe2e9dbe Mon Sep 17 00:00:00 2001 -From: omcira <omcira@gmail.com> -Date: Mon, 18 Sep 2017 10:49:48 +0200 -Subject: [PATCH 46/51] bfq-sq, bfq-mq: decrease burst size when queues in - burst exit - -If many queues belonging to the same group happen to be created -shortly after each other, then the concurrent processes associated -with these queues have typically a common goal, and they get it done -as soon as possible if not hampered by device idling. Examples are -processes spawned by git grep, or by systemd during boot. As for -device idling, this mechanism is currently necessary for weight -raising to succeed in its goal: privileging I/O. 
In view of these -facts, BFQ does not provide the above queues with either weight -raising or device idling. - -On the other hand, a burst of queue creations may be caused also by -the start-up of a complex application. In this case, these queues need -usually to be served one after the other, and as quickly as possible, -to maximise responsiveness. Therefore, in this case the best strategy -is to weight-raise all the queues created during the burst, i.e., the -exact opposite of the strategy for the above case. - -To distinguish between the two cases, BFQ uses an empirical burst-size -threshold, found through extensive tests and monitoring of daily -usage. Only large bursts, i.e., bursts with a size above this -threshold, are considered as generated by a high number of parallel -processes. In this respect, upstart-based boot proved to be rather -hard to detect as generating a large burst of queue creations, because -with upstart most of the queues created in a burst exit *before* the -next queues in the same burst are created. To address this issue, I -changed the burst-detection mechanism so as to not decrease the size -of the current burst even if one of the queues in the burst is -eliminated. - -Unfortunately, this missing decrease causes false positives on very -fast systems: on the start-up of a complex application, such as -libreoffice writer, so many queues are created, served and exited -shortly after each other, that a large burst of queue creations is -wrongly detected as occurring. These false positives just disappear if -the size of a burst is decreased when one of the queues in the burst -exits. This commit restores the missing burst-size decrease, relying -on the fact that upstart is apparently unlikely to be used on systems -running this and future versions of the kernel. 
- -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> -Signed-off-by: Mauro Andreolini <mauro.andreolini@unimore.it> -Signed-off-by: Angelo Ruocco <angeloruocco90@gmail.com> -Tested-by: Mirko Montanari <mirkomontanari91@gmail.com> ---- - block/bfq-mq-iosched.c | 12 +++--------- - block/bfq-sq-iosched.c | 12 +++--------- - 2 files changed, 6 insertions(+), 18 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index af84e506e897..6e413d7236ce 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -4111,16 +4111,10 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - BUG_ON(bfqq->entity.tree); - BUG_ON(bfq_bfqq_busy(bfqq)); - -- if (bfq_bfqq_sync(bfqq)) -- /* -- * The fact that this queue is being destroyed does not -- * invalidate the fact that this queue may have been -- * activated during the current burst. As a consequence, -- * although the queue does not exist anymore, and hence -- * needs to be removed from the burst list if there, -- * the burst size has not to be decremented. -- */ -+ if (bfq_bfqq_sync(bfqq) && !hlist_unhashed(&bfqq->burst_list_node)) { - hlist_del_init(&bfqq->burst_list_node); -+ bfqq->bfqd->burst_size--; -+ } - - if (bfqq->bfqd) - bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index 0c48f527fe3f..93034dd7b801 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -3945,16 +3945,10 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - BUG_ON(bfqq->entity.tree); - BUG_ON(bfq_bfqq_busy(bfqq)); - -- if (bfq_bfqq_sync(bfqq)) -- /* -- * The fact that this queue is being destroyed does not -- * invalidate the fact that this queue may have been -- * activated during the current burst. As a consequence, -- * although the queue does not exist anymore, and hence -- * needs to be removed from the burst list if there, -- * the burst size has not to be decremented. 
-- */ -+ if (bfq_bfqq_sync(bfqq) && !hlist_unhashed(&bfqq->burst_list_node)) { - hlist_del_init(&bfqq->burst_list_node); -+ bfqq->bfqd->burst_size--; -+ } - - bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); - - -From 60de7307d5e3ed7f272f12c900f631bdfe114db2 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Fri, 6 Oct 2017 19:35:38 +0200 -Subject: [PATCH 47/51] bfq-sq, bfq-mq: fix unbalanced decrements of burst size -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -The commit "bfq-sq, bfq-mq: decrease burst size when queues in burst -exit" introduced the decrement of burst_size on the removal of a -bfq_queue from the burst list. Unfortunately, this decrement can -happen to be performed even when burst size is already equal to 0, -because of unbalanced decrements. A description follows of the cause -of these unbalanced decrements, namely a wrong assumption, and of the -way how this wrong assumption leads to unbalanced decrements. - -The wrong assumption is that a bfq_queue can exit only if the process -associated with the bfq_queue has exited. This is false, because a -bfq_queue, say Q, may exit also as a consequence of a merge with -another bfq_queue. In this case, Q exits because the I/O of its -associated process has been redirected to another bfq_queue. - -The decrement unbalance occurs because Q may then be re-created after -a split, and added back to the current burst list, *without* -incrementing burst_size. burst_size is not incremented because Q is -not a new bfq_queue added to the burst list, but a bfq_queue only -temporarily removed from the list, and, before the commit "bfq-sq, -bfq-mq: decrease burst size when queues in burst exit", burst_size was -not decremented when Q was removed. - -This commit addresses this issue by just checking whether the exiting -bfq_queue is a merged bfq_queue, and, in that case, not decrementing -burst_size. 
Unfortunately, this still leaves room for unbalanced -decrements, in the following rarer case: on a split, the bfq_queue -happens to be inserted into a different burst list than that it was -removed from when merged. If this happens, the number of elements in -the new burst list becomes higher than burst_size (by one). When the -bfq_queue then exits, it is of course not in a merged state any -longer, thus burst_size is decremented, which results in an unbalanced -decrement. To handle this sporadic, unlucky case in a simple way, -this commit also checks that burst_size is larger than 0 before -decrementing it. - -Finally, this commit removes an useless, extra check: the check that -the bfq_queue is sync, performed before checking whether the bfq_queue -is in the burst list. This extra check is redundant, because only sync -bfq_queues can be inserted into the burst list. - -Reported-by: Philip Müller <philm@manjaro.org> -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> -Signed-off-by: Angelo Ruocco <angeloruocco90@gmail.com> -Tested-by: Philip Müller <philm@manjaro.org> -Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name> -Tested-by: Lee Tibbert <lee.tibbert@gmail.com> ---- - block/bfq-mq-iosched.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++-- - block/bfq-sq-iosched.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++-- - 2 files changed, 114 insertions(+), 4 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 6e413d7236ce..816bac6cdd3d 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -4111,9 +4111,36 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - BUG_ON(bfqq->entity.tree); - BUG_ON(bfq_bfqq_busy(bfqq)); - -- if (bfq_bfqq_sync(bfqq) && !hlist_unhashed(&bfqq->burst_list_node)) { -+ if (!hlist_unhashed(&bfqq->burst_list_node)) { - hlist_del_init(&bfqq->burst_list_node); -- bfqq->bfqd->burst_size--; -+ /* -+ * Decrement also burst size after the removal, if the -+ * process 
associated with bfqq is exiting, and thus -+ * does not contribute to the burst any longer. This -+ * decrement helps filter out false positives of large -+ * bursts, when some short-lived process (often due to -+ * the execution of commands by some service) happens -+ * to start and exit while a complex application is -+ * starting, and thus spawning several processes that -+ * do I/O (and that *must not* be treated as a large -+ * burst, see comments on bfq_handle_burst). -+ * -+ * In particular, the decrement is performed only if: -+ * 1) bfqq is not a merged queue, because, if it is, -+ * then this free of bfqq is not triggered by the exit -+ * of the process bfqq is associated with, but exactly -+ * by the fact that bfqq has just been merged. -+ * 2) burst_size is greater than 0, to handle -+ * unbalanced decrements. Unbalanced decrements may -+ * happen in te following case: bfqq is inserted into -+ * the current burst list--without incrementing -+ * bust_size--because of a split, but the current -+ * burst list is not the burst list bfqq belonged to -+ * (see comments on the case of a split in -+ * bfq_set_request). -+ */ -+ if (bfqq->bic && bfqq->bfqd->burst_size > 0) -+ bfqq->bfqd->burst_size--; - } - - if (bfqq->bfqd) -@@ -4940,6 +4967,34 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, - "large burst"); - bfq_clear_bfqq_in_large_burst(bfqq); - if (bic->was_in_burst_list) -+ /* -+ * If bfqq was in the current -+ * burst list before being -+ * merged, then we have to add -+ * it back. And we do not need -+ * to increase burst_size, as -+ * we did not decrement -+ * burst_size when we removed -+ * bfqq from the burst list as -+ * a consequence of a merge -+ * (see comments in -+ * bfq_put_queue). In this -+ * respect, it would be rather -+ * costly to know whether the -+ * current burst list is still -+ * the same burst list from -+ * which bfqq was removed on -+ * the merge. 
To avoid this -+ * cost, if bfqq was in a -+ * burst list, then we add -+ * bfqq to the current burst -+ * list without any further -+ * check. This can cause -+ * inappropriate insertions, -+ * but rarely enough to not -+ * harm the detection of large -+ * bursts significantly. -+ */ - hlist_add_head(&bfqq->burst_list_node, - &bfqd->burst_list); - } -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index 93034dd7b801..4bbd7f4c0154 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -3945,9 +3945,36 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - BUG_ON(bfqq->entity.tree); - BUG_ON(bfq_bfqq_busy(bfqq)); - -- if (bfq_bfqq_sync(bfqq) && !hlist_unhashed(&bfqq->burst_list_node)) { -+ if (!hlist_unhashed(&bfqq->burst_list_node)) { - hlist_del_init(&bfqq->burst_list_node); -- bfqq->bfqd->burst_size--; -+ /* -+ * Decrement also burst size after the removal, if the -+ * process associated with bfqq is exiting, and thus -+ * does not contribute to the burst any longer. This -+ * decrement helps filter out false positives of large -+ * bursts, when some short-lived process (often due to -+ * the execution of commands by some service) happens -+ * to start and exit while a complex application is -+ * starting, and thus spawning several processes that -+ * do I/O (and that *must not* be treated as a large -+ * burst, see comments on bfq_handle_burst). -+ * -+ * In particular, the decrement is performed only if: -+ * 1) bfqq is not a merged queue, because, if it is, -+ * then this free of bfqq is not triggered by the exit -+ * of the process bfqq is associated with, but exactly -+ * by the fact that bfqq has just been merged. -+ * 2) burst_size is greater than 0, to handle -+ * unbalanced decrements. 
Unbalanced decrements may -+ * happen in te following case: bfqq is inserted into -+ * the current burst list--without incrementing -+ * bust_size--because of a split, but the current -+ * burst list is not the burst list bfqq belonged to -+ * (see comments on the case of a split in -+ * bfq_set_request). -+ */ -+ if (bfqq->bic && bfqq->bfqd->burst_size > 0) -+ bfqq->bfqd->burst_size--; - } - - bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); -@@ -4691,6 +4718,34 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, - "large burst"); - bfq_clear_bfqq_in_large_burst(bfqq); - if (bic->was_in_burst_list) -+ /* -+ * If bfqq was in the current -+ * burst list before being -+ * merged, then we have to add -+ * it back. And we do not need -+ * to increase burst_size, as -+ * we did not decrement -+ * burst_size when we removed -+ * bfqq from the burst list as -+ * a consequence of a merge -+ * (see comments in -+ * bfq_put_queue). In this -+ * respect, it would be rather -+ * costly to know whether the -+ * current burst list is still -+ * the same burst list from -+ * which bfqq was removed on -+ * the merge. To avoid this -+ * cost, if bfqq was in a -+ * burst list, then we add -+ * bfqq to the current burst -+ * list without any further -+ * check. This can cause -+ * inappropriate insertions, -+ * but rarely enough to not -+ * harm the detection of large -+ * bursts significantly. -+ */ - hlist_add_head(&bfqq->burst_list_node, - &bfqd->burst_list); - } - -From 09adbd0f46f4ba395964b35bf611b7cc3dd84b4d Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Mon, 30 Oct 2017 16:50:50 +0100 -Subject: [PATCH 48/51] doc, block, bfq-mq: update max IOPS sustainable with - BFQ - -We have investigated more deeply the performance of BFQ, in terms of -number of IOPS that can be processed by the CPU when BFQ is used as -I/O scheduler. 
In more detail, using the script [1], we have measured -the number of IOPS reached on top of a null block device configured -with zero latency, as a function of the workload (sequential read, -sequential write, random read, random write) and of the system (we -considered desktops, laptops and embedded systems). - -Based on the resulting figures, with this commit we update the -current, conservative IOPS range reported in BFQ documentation. In -particular, the documentation now reports, for each of three different -systems, the lowest number of IOPS obtained for that system with the -above test (namely, the value obtained with the workload leading to -the lowest IOPS). - -[1] https://github.com/Algodev-github/IOSpeed - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> -Signed-off-by: Luca Miccio <lucmiccio@gmail.com> ---- - Documentation/block/bfq-iosched.txt | 19 +++++++++++++------ - 1 file changed, 13 insertions(+), 6 deletions(-) - -diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt -index dcfe15523da3..595ff7a5ff34 100644 ---- a/Documentation/block/bfq-iosched.txt -+++ b/Documentation/block/bfq-iosched.txt -@@ -29,12 +29,19 @@ for that device, by setting low_latency to 0. See Section 3 for - details on how to configure BFQ for the desired tradeoff between - latency and throughput, or on how to maximize throughput. - --On average CPUs, the current version of BFQ can handle devices --performing at most ~30K IOPS; at most ~50 KIOPS on faster CPUs. As a --reference, 30-50 KIOPS correspond to very high bandwidths with --sequential I/O (e.g., 8-12 GB/s if I/O requests are 256 KB large), and --to 120-200 MB/s with 4KB random I/O. BFQ is currently being tested on --multi-queue devices too. -+BFQ has a non-null overhead, which limits the maximum IOPS that the -+CPU can process for a device scheduled with BFQ. 
To give an idea of -+the limits on slow or average CPUs, here are BFQ limits for three -+different CPUs, on, respectively, an average laptop, an old desktop, -+and a cheap embedded system, in case full hierarchical support is -+enabled (i.e., CONFIG_BFQ_SQ_GROUP_IOSCHED is set for bfq-sq, or -+CONFIG_MQ_BFQ_GROUP_IOSCHED is set for bfq-mq, or, finally, -+CONFIG_BFQ_GROUP_IOSCHED is set for bfq): -+- Intel i7-4850HQ: 250 KIOPS -+- AMD A8-3850: 170 KIOPS -+- ARM CortexTM-A53 Octa-core: 45 KIOPS -+ -+BFQ works for multi-queue devices too (bfq and bfq-mq instances). - - The table of contents follows. Impatients can just jump to Section 3. - - -From be94f97b577dc587593185224a7718aa59ac43f7 Mon Sep 17 00:00:00 2001 -From: Luca Miccio <lucmiccio@gmail.com> -Date: Tue, 31 Oct 2017 09:50:11 +0100 -Subject: [PATCH 49/51] block, bfq-mq: add missing invocations of - bfqg_stats_update_io_add/remove - -bfqg_stats_update_io_add and bfqg_stats_update_io_remove are to be -invoked, respectively, when an I/O request enters and when an I/O -request exits the scheduler. Unfortunately, bfq-mq does not fully comply -with this scheme, because it does not invoke these functions for -requests that are inserted into or extracted from its priority -dispatch list. This commit fixes this mistake. 
- -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> -Signed-off-by: Luca Miccio <lucmiccio@gmail.com> ---- - block/bfq-mq-iosched.c | 24 +++++++++++++++++++----- - 1 file changed, 19 insertions(+), 5 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 816bac6cdd3d..fbf28804c220 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -1394,7 +1394,6 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, - BUG_ON(bfqq->entity.budget < bfqq->entity.service); - - BUG_ON(bfqq == bfqd->in_service_queue); -- bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, rq->cmd_flags); - - /* - * bfqq deserves to be weight-raised if: -@@ -1734,7 +1733,6 @@ static void bfq_remove_request(struct request_queue *q, - BUG_ON(bfqq->meta_pending == 0); - bfqq->meta_pending--; - } -- bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); - } - - static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) -@@ -1879,6 +1877,7 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, - bfqq->next_rq = rq; - - bfq_remove_request(q, next); -+ bfqg_stats_update_io_remove(bfqq_group(bfqq), next->cmd_flags); - - spin_unlock_irq(&bfqq->bfqd->lock); - end: -@@ -4077,6 +4076,10 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - spin_lock_irq(&bfqd->lock); - - rq = __bfq_dispatch_request(hctx); -+ if (rq && RQ_BFQQ(rq)) -+ bfqg_stats_update_io_remove(bfqq_group(RQ_BFQQ(rq)), -+ rq->cmd_flags); -+ - spin_unlock_irq(&bfqd->lock); - - return rq; -@@ -4634,6 +4637,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - { - struct request_queue *q = hctx->queue; - struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); - - spin_lock_irq(&bfqd->lock); - if (blk_mq_sched_try_insert_merge(q, rq)) { -@@ -4647,8 +4651,6 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - - 
spin_lock_irq(&bfqd->lock); - if (at_head || blk_rq_is_passthrough(rq)) { -- struct bfq_queue *bfqq = RQ_BFQQ(rq); -- - if (at_head) - list_add(&rq->queuelist, &bfqd->dispatch); - else -@@ -4668,6 +4670,12 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - rq->rq_flags &= ~RQF_GOT; - - __bfq_insert_request(bfqd, rq); -+ /* -+ * Update bfqq, because, if a queue merge has occurred -+ * in __bfq_insert_request, then rq has been -+ * redirected into a new queue. -+ */ -+ bfqq = RQ_BFQQ(rq); - - if (rq_mergeable(rq)) { - elv_rqhash_add(q, rq); -@@ -4676,6 +4684,9 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - } - } - -+ if (bfqq) -+ bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, rq->cmd_flags); -+ - spin_unlock_irq(&bfqd->lock); - } - -@@ -4893,8 +4904,11 @@ static void bfq_finish_request(struct request *rq) - BUG_ON(in_interrupt()); - - assert_spin_locked(&bfqd->lock); -- if (!RB_EMPTY_NODE(&rq->rb_node)) -+ if (!RB_EMPTY_NODE(&rq->rb_node)) { - bfq_remove_request(rq->q, rq); -+ bfqg_stats_update_io_remove(bfqq_group(bfqq), -+ rq->cmd_flags); -+ } - bfq_put_rq_priv_body(bfqq); - } - - -From 8659a1549d2bf241129a0f7c90429bddd9c2bc53 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Wed, 8 Nov 2017 19:07:40 +0100 -Subject: [PATCH 50/51] block, bfq-mq: update blkio stats outside the scheduler - lock - -bfq-mq invokes various blkg_*stats_* functions to update the statistics -contained in the special files blkio.bfq-mq.* in the blkio controller -groups, i.e., the I/O accounting related to the proportional-share -policy provided by bfq-mq. The execution of these functions takes a -considerable percentage, about 40%, of the total per-request execution -time of bfq-mq (i.e., of the sum of the execution time of all the bfq-mq -functions that have to be executed to process an I/O request from its -creation to its destruction). 
This reduces the request-processing -rate sustainable by bfq-mq noticeably, even on a multicore CPU. In fact, -the bfq-mq functions that invoke blkg_*stats_* functions cannot be -executed in parallel with the rest of the code of bfq-mq, because -both are executed under the same per-device scheduler lock. - -To reduce this slowdown, this commit moves, wherever possible, the -invocation of these functions (more precisely, of the bfq-mq functions -that invoke blkg_*stats_* functions) outside the critical sections -protected by the scheduler lock. - -With this change, and with all blkio.bfq-mq.* statistics enabled, the -throughput grows, e.g., from 250 to 310 KIOPS (+25%) on an Intel -i7-4850HQ, in case of 8 threads doing random I/O in parallel on -null_blk, with the latter configured with 0 latency. We obtained the -same or higher throughput boosts, up to +30%, with other processors -(some figures are reported in the documentation). For our tests, we -used the script [1], with which our results can be easily reproduced. - -NOTE. This commit still protects the invocation of blkg_*stats_* -functions with the request_queue lock, because the group these -functions are invoked on may otherwise disappear before or while these -functions are executed. Fortunately, tests without even this lock -show, by difference, that the serialization caused by this lock has -little impact (at most ~5% of throughput reduction). 
- -[1] https://github.com/Algodev-github/IOSpeed - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> -Signed-off-by: Luca Miccio <lucmiccio@gmail.com> ---- - Documentation/block/bfq-iosched.txt | 18 ++++-- - block/bfq-mq-iosched.c | 112 +++++++++++++++++++++++++++++++----- - block/bfq-sched.c | 2 + - 3 files changed, 112 insertions(+), 20 deletions(-) - -diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt -index 595ff7a5ff34..c816c595082d 100644 ---- a/Documentation/block/bfq-iosched.txt -+++ b/Documentation/block/bfq-iosched.txt -@@ -31,16 +31,22 @@ latency and throughput, or on how to maximize throughput. - - BFQ has a non-null overhead, which limits the maximum IOPS that the - CPU can process for a device scheduled with BFQ. To give an idea of --the limits on slow or average CPUs, here are BFQ limits for three --different CPUs, on, respectively, an average laptop, an old desktop, --and a cheap embedded system, in case full hierarchical support is --enabled (i.e., CONFIG_BFQ_SQ_GROUP_IOSCHED is set for bfq-sq, or --CONFIG_MQ_BFQ_GROUP_IOSCHED is set for bfq-mq, or, finally, --CONFIG_BFQ_GROUP_IOSCHED is set for bfq): -+the limits on slow or average CPUs, here are, first, the limits of -+bfq-sq for three different CPUs, on, respectively, an average laptop, -+an old desktop, and a cheap embedded system, in case full hierarchical -+support is enabled (i.e., CONFIG_BFQ_SQ_GROUP_IOSCHED is set): - - Intel i7-4850HQ: 250 KIOPS - - AMD A8-3850: 170 KIOPS - - ARM CortexTM-A53 Octa-core: 45 KIOPS - -+bfq-mq and bfq instances reach, instead, a higher sustainable -+throughput. Their limits, on the same systems as above, are, with full -+hierarchical support enabled (i.e., CONFIG_MQ_BFQ_GROUP_IOSCHED set -+for bfq-mq, or CONFIG_BFQ_GROUP_IOSCHED set for bfq): -+- Intel i7-4850HQ: 310 KIOPS -+- AMD A8-3850: 200 KIOPS -+- ARM CortexTM-A53 Octa-core: 56 KIOPS -+ - BFQ works for multi-queue devices too (bfq and bfq-mq instances). 
- - The table of contents follows. Impatients can just jump to Section 3. -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index fbf28804c220..ab3b83d612c2 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -1822,7 +1822,7 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, - bfqq->next_rq = next_rq; - - bfq_log_bfqq(bfqd, bfqq, -- "requests_merged: req %p prev %p next_rq %p bfqq %p", -+ "request_merged: req %p prev %p next_rq %p bfqq %p", - req, prev, next_rq, bfqq); - - /* -@@ -2415,7 +2415,6 @@ static void __bfq_set_in_service_queue(struct bfq_data *bfqd, - struct bfq_queue *bfqq) - { - if (bfqq) { -- bfqg_stats_update_avg_queue_size(bfqq_group(bfqq)); - bfq_clear_bfqq_fifo_expire(bfqq); - - bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; -@@ -3784,7 +3783,6 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - */ - bfq_clear_bfqq_wait_request(bfqq); - hrtimer_try_to_cancel(&bfqd->idle_slice_timer); -- bfqg_stats_update_idle_time(bfqq_group(bfqq)); - } - goto keep_queue; - } -@@ -4072,16 +4070,67 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - { - struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; - struct request *rq; -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ struct bfq_queue *in_serv_queue, *bfqq; -+ bool waiting_rq, idle_timer_disabled; -+#endif - - spin_lock_irq(&bfqd->lock); - -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ in_serv_queue = bfqd->in_service_queue; -+ waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); -+ - rq = __bfq_dispatch_request(hctx); -- if (rq && RQ_BFQQ(rq)) -- bfqg_stats_update_io_remove(bfqq_group(RQ_BFQQ(rq)), -- rq->cmd_flags); - -+ idle_timer_disabled = -+ waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); -+ -+#else -+ rq = __bfq_dispatch_request(hctx); -+#endif - spin_unlock_irq(&bfqd->lock); - -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ bfqq = rq ? 
RQ_BFQQ(rq) : NULL; -+ if (!idle_timer_disabled && !bfqq) -+ return rq; -+ -+ /* -+ * rq and bfqq are guaranteed to exist until this function -+ * ends, for the following reasons. First, rq can be -+ * dispatched to the device, and then can be completed and -+ * freed, only after this function ends. Second, rq cannot be -+ * merged (and thus freed because of a merge) any longer, -+ * because it has already started. Thus rq cannot be freed -+ * before this function ends, and, since rq has a reference to -+ * bfqq, the same guarantee holds for bfqq too. -+ * -+ * In addition, the following queue lock guarantees that -+ * bfqq_group(bfqq) exists as well. -+ */ -+ spin_lock_irq(hctx->queue->queue_lock); -+ if (idle_timer_disabled) -+ /* -+ * Since the idle timer has been disabled, -+ * in_serv_queue contained some request when -+ * __bfq_dispatch_request was invoked above, which -+ * implies that rq was picked exactly from -+ * in_serv_queue. Thus in_serv_queue == bfqq, and is -+ * therefore guaranteed to exist because of the above -+ * arguments. 
-+ */ -+ bfqg_stats_update_idle_time(bfqq_group(in_serv_queue)); -+ if (bfqq) { -+ struct bfq_group *bfqg = bfqq_group(bfqq); -+ -+ bfqg_stats_update_avg_queue_size(bfqg); -+ bfqg_stats_set_start_empty_time(bfqg); -+ bfqg_stats_update_io_remove(bfqg, rq->cmd_flags); -+ } -+ spin_unlock_irq(hctx->queue->queue_lock); -+#endif -+ - return rq; - } - -@@ -4200,7 +4249,6 @@ static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) - unsigned long flags; - - spin_lock_irqsave(&bfqd->lock, flags); -- - bfq_exit_bfqq(bfqd, bfqq); - bic_set_bfqq(bic, NULL, is_sync); - spin_unlock_irqrestore(&bfqd->lock, flags); -@@ -4554,7 +4602,6 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, - */ - bfq_clear_bfqq_wait_request(bfqq); - hrtimer_try_to_cancel(&bfqd->idle_slice_timer); -- bfqg_stats_update_idle_time(bfqq_group(bfqq)); - - /* - * The queue is not empty, because a new request just -@@ -4569,9 +4616,11 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, - } - } - --static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) -+/* returns true if it causes the idle timer to be disabled */ -+static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) - { - struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; -+ bool waiting, idle_timer_disabled = false; - BUG_ON(!bfqq); - - assert_spin_locked(&bfqd->lock); -@@ -4624,12 +4673,16 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) - } - } - -+ waiting = bfqq && bfq_bfqq_wait_request(bfqq); - bfq_add_request(rq); -+ idle_timer_disabled = waiting && !bfq_bfqq_wait_request(bfqq); - - rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; - list_add_tail(&rq->queuelist, &bfqq->fifo); - - bfq_rq_enqueued(bfqd, bfqq, rq); -+ -+ return idle_timer_disabled; - } - - static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, -@@ -4638,6 +4691,10 @@ static void bfq_insert_request(struct 
blk_mq_hw_ctx *hctx, struct request *rq, - struct request_queue *q = hctx->queue; - struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_queue *bfqq = RQ_BFQQ(rq); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ bool idle_timer_disabled = false; -+ unsigned int cmd_flags; -+#endif - - spin_lock_irq(&bfqd->lock); - if (blk_mq_sched_try_insert_merge(q, rq)) { -@@ -4669,13 +4726,17 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - BUG_ON(!(rq->rq_flags & RQF_GOT)); - rq->rq_flags &= ~RQF_GOT; - -- __bfq_insert_request(bfqd, rq); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ idle_timer_disabled = __bfq_insert_request(bfqd, rq); - /* - * Update bfqq, because, if a queue merge has occurred - * in __bfq_insert_request, then rq has been - * redirected into a new queue. - */ - bfqq = RQ_BFQQ(rq); -+#else -+ __bfq_insert_request(bfqd, rq); -+#endif - - if (rq_mergeable(rq)) { - elv_rqhash_add(q, rq); -@@ -4683,11 +4744,34 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - q->last_merge = rq; - } - } -- -- if (bfqq) -- bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, rq->cmd_flags); -- -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ /* -+ * Cache cmd_flags before releasing scheduler lock, because rq -+ * may disappear afterwards (for example, because of a request -+ * merge). -+ */ -+ cmd_flags = rq->cmd_flags; -+#endif - spin_unlock_irq(&bfqd->lock); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ if (!bfqq) -+ return; -+ /* -+ * bfqq still exists, because it can disappear only after -+ * either it is merged with another queue, or the process it -+ * is associated with exits. But both actions must be taken by -+ * the same process currently executing this flow of -+ * instruction. -+ * -+ * In addition, the following queue lock guarantees that -+ * bfqq_group(bfqq) exists as well. 
-+ */ -+ spin_lock_irq(q->queue_lock); -+ bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags); -+ if (idle_timer_disabled) -+ bfqg_stats_update_idle_time(bfqq_group(bfqq)); -+ spin_unlock_irq(q->queue_lock); -+#endif - } - - static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index e4a2553a2d2c..616c0692335a 100644 ---- a/block/bfq-sched.c -+++ b/block/bfq-sched.c -@@ -949,9 +949,11 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) - st->vtime += bfq_delta(served, st->wsum); - bfq_forget_idle(st); - } -+#ifndef BFQ_MQ - #ifdef BFQ_GROUP_IOSCHED_ENABLED - bfqg_stats_set_start_empty_time(bfqq_group(bfqq)); - #endif -+#endif - st = bfq_entity_service_tree(&bfqq->entity); - bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs, vtime %llu on %p", - served, ((st->vtime>>10)*1000)>>12, st); - -From abdfb33a3325df55ec0261fd824ca61ddac13575 Mon Sep 17 00:00:00 2001 -From: Luca Miccio <lucmiccio@gmail.com> -Date: Wed, 8 Nov 2017 19:07:41 +0100 -Subject: [PATCH 51/51] block, bfq-sq, bfq-mq: move debug blkio stats behind - CONFIG_DEBUG_BLK_CGROUP - -BFQ (both bfq-mq and bfq-sq) currently creates, and updates, its own -instance of the whole set of blkio statistics that cfq creates. Yet, -from the comments of Tejun Heo in [1], it turned out that most of -these statistics are meant/useful only for debugging. This commit -makes BFQ create the latter, debugging statistics only if the option -CONFIG_DEBUG_BLK_CGROUP is set. - -By doing so, this commit also enables BFQ to enjoy a high perfomance -boost. The reason is that, if CONFIG_DEBUG_BLK_CGROUP is not set, then -BFQ has to update far fewer statistics, and, in particular, not the -heaviest to update. 
To give an idea of the benefits, if -CONFIG_DEBUG_BLK_CGROUP is not set, then, on an Intel i7-4850HQ, and -with 8 threads doing random I/O in parallel on null_blk (configured -with 0 latency), the throughput of bfq-mq grows from 310 to 400 KIOPS -(+30%). We have measured similar or even much higher boosts with other -CPUs: e.g., +45% with an ARM CortexTM-A53 Octa-core. Our results have -been obtained and can be reproduced very easily with the script in [1]. - -[1] https://www.spinics.net/lists/linux-block/msg18943.html - -Reported-by: Tejun Heo <tj@kernel.org> -Signed-off-by: Luca Miccio <lucmiccio@gmail.com> -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - Documentation/block/bfq-iosched.txt | 59 ++++++++++--- - block/bfq-cgroup-included.c | 163 ++++++++++++++++++++---------------- - block/bfq-mq-iosched.c | 14 ++-- - block/bfq-mq.h | 4 +- - block/bfq.h | 4 +- - 5 files changed, 147 insertions(+), 97 deletions(-) - -diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt -index c816c595082d..30ef2dba85ad 100644 ---- a/Documentation/block/bfq-iosched.txt -+++ b/Documentation/block/bfq-iosched.txt -@@ -29,24 +29,41 @@ for that device, by setting low_latency to 0. See Section 3 for - details on how to configure BFQ for the desired tradeoff between - latency and throughput, or on how to maximize throughput. - --BFQ has a non-null overhead, which limits the maximum IOPS that the --CPU can process for a device scheduled with BFQ. To give an idea of --the limits on slow or average CPUs, here are, first, the limits of --bfq-sq for three different CPUs, on, respectively, an average laptop, -+BFQ has a non-null overhead, which limits the maximum IOPS that a CPU -+can process for a device scheduled with BFQ. 
To give an idea of the -+limits on slow or average CPUs, here are, first, the limits of bfq-mq -+and bfq for three different CPUs, on, respectively, an average laptop, - an old desktop, and a cheap embedded system, in case full hierarchical --support is enabled (i.e., CONFIG_BFQ_SQ_GROUP_IOSCHED is set): --- Intel i7-4850HQ: 250 KIOPS --- AMD A8-3850: 170 KIOPS --- ARM CortexTM-A53 Octa-core: 45 KIOPS -- --bfq-mq and bfq instances reach, instead, a higher sustainable --throughput. Their limits, on the same systems as above, are, with full --hierarchical support enabled (i.e., CONFIG_MQ_BFQ_GROUP_IOSCHED set --for bfq-mq, or CONFIG_BFQ_GROUP_IOSCHED set for bfq): -+support is enabled (i.e., CONFIG_MQ_BFQ_GROUP_IOSCHED is set for -+bfq-mq, or CONFIG_BFQ_GROUP_IOSCHED is set for bfq), but -+CONFIG_DEBUG_BLK_CGROUP is not set (Section 4-2): -+- Intel i7-4850HQ: 400 KIOPS -+- AMD A8-3850: 250 KIOPS -+- ARM CortexTM-A53 Octa-core: 80 KIOPS -+ -+As for bfq-sq, it cannot reach the above IOPS, because of the -+inherent, lower parallelism of legacy blk and of the components within -+it (including bfq-sq itself). In particular, results with -+CONFIG_DEBUG_BLK_CGROUP unset are rather fluctuating. The limits -+reported below for the case CONFIG_DEBUG_BLK_CGROUP set will however -+provide a lower bound to the limits of bfq-sq. -+ -+Turning back to bfq-mq and bfq, If CONFIG_DEBUG_BLK_CGROUP is set (and -+of course full hierarchical support is enabled), then the sustainable -+throughput with bfq-mq and bfq decreases, because all blkio.bfq* -+statistics are created and updated (Section 4-2). 
For bfq-mq and bfq, -+this leads to the following maximum sustainable throughputs, on the -+same systems as above: - - Intel i7-4850HQ: 310 KIOPS - - AMD A8-3850: 200 KIOPS - - ARM CortexTM-A53 Octa-core: 56 KIOPS - -+Finally, if CONFIG_DEBUG_BLK_CGROUP is set (and full hierarchical -+support is enabled), then bfq-sq exhibits the following limits: -+- Intel i7-4850HQ: 250 KIOPS -+- AMD A8-3850: 170 KIOPS -+- ARM CortexTM-A53 Octa-core: 45 KIOPS -+ - BFQ works for multi-queue devices too (bfq and bfq-mq instances). - - The table of contents follows. Impatients can just jump to Section 3. -@@ -524,6 +541,22 @@ BFQ-specific files is "blkio.bfqX." or "io.bfqX.", where X can be "" - to set the weight of a group with the mainline BFQ is blkio.bfq.weight - or io.bfq.weight. - -+As for cgroups-v1 (blkio controller), the exact set of stat files -+created, and kept up-to-date by bfq*, depends on whether -+CONFIG_DEBUG_BLK_CGROUP is set. If it is set, then bfq* creates all -+the stat files documented in -+Documentation/cgroup-v1/blkio-controller.txt. If, instead, -+CONFIG_DEBUG_BLK_CGROUP is not set, then bfq* creates only the files -+blkio.bfq*.io_service_bytes -+blkio.bfq*.io_service_bytes_recursive -+blkio.bfq*.io_serviced -+blkio.bfq*.io_serviced_recursive -+ -+The value of CONFIG_DEBUG_BLK_CGROUP greatly influences the maximum -+throughput sustainable with bfq*, because updating the blkio.bfq* -+stats is rather costly, especially for some of the stats enabled by -+CONFIG_DEBUG_BLK_CGROUP. -+ - Parameters to set - ----------------- - -diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c -index 631e53d9150d..562b0ce581a7 100644 ---- a/block/bfq-cgroup-included.c -+++ b/block/bfq-cgroup-included.c -@@ -15,7 +15,7 @@ - * file. 
- */ - --#ifdef BFQ_GROUP_IOSCHED_ENABLED -+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) - - /* bfqg stats flags */ - enum bfqg_stats_flags { -@@ -155,6 +155,63 @@ static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) - bfqg_stats_update_group_wait_time(stats); - } - -+static void bfqg_stats_update_io_add(struct bfq_group *bfqg, -+ struct bfq_queue *bfqq, unsigned int op) -+{ -+ blkg_rwstat_add(&bfqg->stats.queued, op, 1); -+ bfqg_stats_end_empty_time(&bfqg->stats); -+ if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue)) -+ bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); -+} -+ -+static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) -+{ -+ blkg_rwstat_add(&bfqg->stats.queued, op, -1); -+} -+ -+static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) -+{ -+ blkg_rwstat_add(&bfqg->stats.merged, op, 1); -+} -+ -+static void bfqg_stats_update_completion(struct bfq_group *bfqg, -+ uint64_t start_time, uint64_t io_start_time, unsigned int op) -+{ -+ struct bfqg_stats *stats = &bfqg->stats; -+ unsigned long long now = sched_clock(); -+ -+ if (time_after64(now, io_start_time)) -+ blkg_rwstat_add(&stats->service_time, op, -+ now - io_start_time); -+ if (time_after64(io_start_time, start_time)) -+ blkg_rwstat_add(&stats->wait_time, op, -+ io_start_time - start_time); -+} -+ -+#else /* BFQ_GROUP_IOSCHED_ENABLED && CONFIG_DEBUG_BLK_CGROUP */ -+ -+static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg, -+ struct bfq_queue *bfqq, unsigned int op) { } -+static inline void -+bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { } -+static inline void -+bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { } -+static inline void bfqg_stats_update_completion(struct bfq_group *bfqg, -+ uint64_t start_time, uint64_t io_start_time, -+ unsigned int op) { } -+static inline void 
-+bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, -+ struct bfq_group *curr_bfqg) { } -+static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { } -+static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } -+static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { } -+static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { } -+static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } -+static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { } -+ -+#endif /* BFQ_GROUP_IOSCHED_ENABLED && CONFIG_DEBUG_BLK_CGROUP */ -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - static struct blkcg_policy blkcg_policy_bfq; - - /* -@@ -247,44 +304,10 @@ static void bfqg_and_blkg_put(struct bfq_group *bfqg) - } - #endif - --static void bfqg_stats_update_io_add(struct bfq_group *bfqg, -- struct bfq_queue *bfqq, -- unsigned int op) --{ -- blkg_rwstat_add(&bfqg->stats.queued, op, 1); -- bfqg_stats_end_empty_time(&bfqg->stats); -- if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue)) -- bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); --} -- --static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) --{ -- blkg_rwstat_add(&bfqg->stats.queued, op, -1); --} -- --static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) --{ -- blkg_rwstat_add(&bfqg->stats.merged, op, 1); --} -- --static void bfqg_stats_update_completion(struct bfq_group *bfqg, -- uint64_t start_time, uint64_t io_start_time, -- unsigned int op) --{ -- struct bfqg_stats *stats = &bfqg->stats; -- unsigned long long now = sched_clock(); -- -- if (time_after64(now, io_start_time)) -- blkg_rwstat_add(&stats->service_time, op, -- now - io_start_time); -- if (time_after64(io_start_time, start_time)) -- blkg_rwstat_add(&stats->wait_time, op, -- io_start_time - start_time); --} -- - /* @stats = 0 */ - static void bfqg_stats_reset(struct bfqg_stats 
*stats) - { -+#ifdef CONFIG_DEBUG_BLK_CGROUP - /* queued stats shouldn't be cleared */ - blkg_rwstat_reset(&stats->merged); - blkg_rwstat_reset(&stats->service_time); -@@ -296,6 +319,7 @@ static void bfqg_stats_reset(struct bfqg_stats *stats) - blkg_stat_reset(&stats->group_wait_time); - blkg_stat_reset(&stats->idle_time); - blkg_stat_reset(&stats->empty_time); -+#endif - } - - /* @to += @from */ -@@ -304,6 +328,7 @@ static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from) - if (!to || !from) - return; - -+#ifdef CONFIG_DEBUG_BLK_CGROUP - /* queued stats shouldn't be cleared */ - blkg_rwstat_add_aux(&to->merged, &from->merged); - blkg_rwstat_add_aux(&to->service_time, &from->service_time); -@@ -316,6 +341,7 @@ static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from) - blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time); - blkg_stat_add_aux(&to->idle_time, &from->idle_time); - blkg_stat_add_aux(&to->empty_time, &from->empty_time); -+#endif - } - - /* -@@ -367,6 +393,7 @@ static void bfq_init_entity(struct bfq_entity *entity, - - static void bfqg_stats_exit(struct bfqg_stats *stats) - { -+#ifdef CONFIG_DEBUG_BLK_CGROUP - blkg_rwstat_exit(&stats->merged); - blkg_rwstat_exit(&stats->service_time); - blkg_rwstat_exit(&stats->wait_time); -@@ -378,10 +405,12 @@ static void bfqg_stats_exit(struct bfqg_stats *stats) - blkg_stat_exit(&stats->group_wait_time); - blkg_stat_exit(&stats->idle_time); - blkg_stat_exit(&stats->empty_time); -+#endif - } - - static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) - { -+#ifdef CONFIG_DEBUG_BLK_CGROUP - if (blkg_rwstat_init(&stats->merged, gfp) || - blkg_rwstat_init(&stats->service_time, gfp) || - blkg_rwstat_init(&stats->wait_time, gfp) || -@@ -396,6 +425,7 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) - bfqg_stats_exit(stats); - return -ENOMEM; - } -+#endif - - return 0; - } -@@ -1003,6 +1033,7 @@ static ssize_t bfq_io_set_weight(struct 
kernfs_open_file *of, - return bfq_io_set_weight_legacy(of_css(of), NULL, weight); - } - -+#ifdef CONFIG_DEBUG_BLK_CGROUP - static int bfqg_print_stat(struct seq_file *sf, void *v) - { - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, -@@ -1108,6 +1139,7 @@ static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v) - 0, false); - return 0; - } -+#endif /* CONFIG_DEBUG_BLK_CGROUP */ - - static struct bfq_group * - bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) -@@ -1137,15 +1169,6 @@ static struct cftype bfq_blkcg_legacy_files[] = { - - /* statistics, covers only the tasks in the bfqg */ - { -- .name = BFQ_CGROUP_FNAME(time), -- .private = offsetof(struct bfq_group, stats.time), -- .seq_show = bfqg_print_stat, -- }, -- { -- .name = BFQ_CGROUP_FNAME(sectors), -- .seq_show = bfqg_print_stat_sectors, -- }, -- { - .name = BFQ_CGROUP_FNAME(io_service_bytes), - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_bytes, -@@ -1155,6 +1178,16 @@ static struct cftype bfq_blkcg_legacy_files[] = { - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_ios, - }, -+#ifdef CONFIG_DEBUG_BLK_CGROUP -+ { -+ .name = BFQ_CGROUP_FNAME(time), -+ .private = offsetof(struct bfq_group, stats.time), -+ .seq_show = bfqg_print_stat, -+ }, -+ { -+ .name = BFQ_CGROUP_FNAME(sectors), -+ .seq_show = bfqg_print_stat_sectors, -+ }, - { - .name = BFQ_CGROUP_FNAME(io_service_time), - .private = offsetof(struct bfq_group, stats.service_time), -@@ -1175,18 +1208,10 @@ static struct cftype bfq_blkcg_legacy_files[] = { - .private = offsetof(struct bfq_group, stats.queued), - .seq_show = bfqg_print_rwstat, - }, -+#endif /* CONFIG_DEBUG_BLK_CGROUP */ - - /* the same statictics which cover the bfqg and its descendants */ - { -- .name = BFQ_CGROUP_FNAME(time_recursive), -- .private = offsetof(struct bfq_group, stats.time), -- .seq_show = bfqg_print_stat_recursive, -- }, -- { -- .name = BFQ_CGROUP_FNAME(sectors_recursive), -- 
.seq_show = bfqg_print_stat_sectors_recursive, -- }, -- { - .name = BFQ_CGROUP_FNAME(io_service_bytes_recursive), - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_bytes_recursive, -@@ -1196,6 +1221,16 @@ static struct cftype bfq_blkcg_legacy_files[] = { - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_ios_recursive, - }, -+#ifdef CONFIG_DEBUG_BLK_CGROUP -+ { -+ .name = BFQ_CGROUP_FNAME(time_recursive), -+ .private = offsetof(struct bfq_group, stats.time), -+ .seq_show = bfqg_print_stat_recursive, -+ }, -+ { -+ .name = BFQ_CGROUP_FNAME(sectors_recursive), -+ .seq_show = bfqg_print_stat_sectors_recursive, -+ }, - { - .name = BFQ_CGROUP_FNAME(io_service_time_recursive), - .private = offsetof(struct bfq_group, stats.service_time), -@@ -1240,6 +1275,7 @@ static struct cftype bfq_blkcg_legacy_files[] = { - .private = offsetof(struct bfq_group, stats.dequeue), - .seq_show = bfqg_print_stat, - }, -+#endif /* CONFIG_DEBUG_BLK_CGROUP */ - { } /* terminate */ - }; - -@@ -1257,25 +1293,6 @@ static struct cftype bfq_blkg_files[] = { - - #else /* BFQ_GROUP_IOSCHED_ENABLED */ - --static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg, -- struct bfq_queue *bfqq, unsigned int op) { } --static inline void --bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { } --static inline void --bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { } --static inline void bfqg_stats_update_completion(struct bfq_group *bfqg, -- uint64_t start_time, uint64_t io_start_time, -- unsigned int op) { } --static inline void --bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, -- struct bfq_group *curr_bfqg) { } --static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { } --static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } --static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { } --static inline void 
bfqg_stats_update_idle_time(struct bfq_group *bfqg) { } --static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } --static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { } -- - static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, - struct bfq_group *bfqg) {} - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index ab3b83d612c2..0c09609a6099 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -4070,14 +4070,14 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - { - struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; - struct request *rq; --#ifdef BFQ_GROUP_IOSCHED_ENABLED -+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) - struct bfq_queue *in_serv_queue, *bfqq; - bool waiting_rq, idle_timer_disabled; - #endif - - spin_lock_irq(&bfqd->lock); - --#ifdef BFQ_GROUP_IOSCHED_ENABLED -+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) - in_serv_queue = bfqd->in_service_queue; - waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); - -@@ -4091,7 +4091,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - #endif - spin_unlock_irq(&bfqd->lock); - --#ifdef BFQ_GROUP_IOSCHED_ENABLED -+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) - bfqq = rq ? 
RQ_BFQQ(rq) : NULL; - if (!idle_timer_disabled && !bfqq) - return rq; -@@ -4691,7 +4691,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - struct request_queue *q = hctx->queue; - struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_queue *bfqq = RQ_BFQQ(rq); --#ifdef BFQ_GROUP_IOSCHED_ENABLED -+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) - bool idle_timer_disabled = false; - unsigned int cmd_flags; - #endif -@@ -4726,7 +4726,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - BUG_ON(!(rq->rq_flags & RQF_GOT)); - rq->rq_flags &= ~RQF_GOT; - --#ifdef BFQ_GROUP_IOSCHED_ENABLED -+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) - idle_timer_disabled = __bfq_insert_request(bfqd, rq); - /* - * Update bfqq, because, if a queue merge has occurred -@@ -4744,7 +4744,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - q->last_merge = rq; - } - } --#ifdef BFQ_GROUP_IOSCHED_ENABLED -+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) - /* - * Cache cmd_flags before releasing scheduler lock, because rq - * may disappear afterwards (for example, because of a request -@@ -4753,7 +4753,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - cmd_flags = rq->cmd_flags; - #endif - spin_unlock_irq(&bfqd->lock); --#ifdef BFQ_GROUP_IOSCHED_ENABLED -+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) - if (!bfqq) - return; - /* -diff --git a/block/bfq-mq.h b/block/bfq-mq.h -index 7ed2cc29be57..1cb05bb853d2 100644 ---- a/block/bfq-mq.h -+++ b/block/bfq-mq.h -@@ -784,7 +784,7 @@ enum bfqq_expiration { - - - struct bfqg_stats { --#ifdef BFQ_GROUP_IOSCHED_ENABLED -+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) - /* number of ios merged */ - struct blkg_rwstat merged; - /* total time spent on device in ns, may not be 
accurate w/ queueing */ -@@ -812,7 +812,7 @@ struct bfqg_stats { - uint64_t start_idle_time; - uint64_t start_empty_time; - uint16_t flags; --#endif -+#endif /* BFQ_GROUP_IOSCHED_ENABLED && CONFIG_DEBUG_BLK_CGROUP */ - }; - - #ifdef BFQ_GROUP_IOSCHED_ENABLED -diff --git a/block/bfq.h b/block/bfq.h -index 15d326f466b7..47cd4d5a8c32 100644 ---- a/block/bfq.h -+++ b/block/bfq.h -@@ -791,7 +791,7 @@ enum bfqq_expiration { - - - struct bfqg_stats { --#ifdef BFQ_GROUP_IOSCHED_ENABLED -+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) - /* number of ios merged */ - struct blkg_rwstat merged; - /* total time spent on device in ns, may not be accurate w/ queueing */ -@@ -819,7 +819,7 @@ struct bfqg_stats { - uint64_t start_idle_time; - uint64_t start_empty_time; - uint16_t flags; --#endif -+#endif /* BFQ_GROUP_IOSCHED_ENABLED && CONFIG_DEBUG_BLK_CGROUP */ - }; - - #ifdef BFQ_GROUP_IOSCHED_ENABLED diff --git a/sys-kernel/linux-image-redcore-lts/files/4.14-0002-BFQ-v8r12-20180404.patch b/sys-kernel/linux-image-redcore-lts/files/4.14-0002-BFQ-v8r12-20180404.patch deleted file mode 100644 index 104325d6..00000000 --- a/sys-kernel/linux-image-redcore-lts/files/4.14-0002-BFQ-v8r12-20180404.patch +++ /dev/null @@ -1,4611 +0,0 @@ -From 7bd365a925748767d7ed807e5498f90bae0ebc25 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Tue, 14 Nov 2017 08:28:45 +0100 -Subject: [PATCH 01/23] block, bfq-mq: turn BUG_ON on request-size into WARN_ON - -BFQ has many checks of internal and external consistency. One of them -checks that an I/O request has still sectors to serve, if it happens -to be retired without being served. If the request has no sector to -serve, a BUG_ON signals the failure and causes the kernel to -terminate. Yet, from a crash report by a user [1], this condition may -happen to hold, in apparently correct functioning, for I/O with a -CD/DVD. - -To address this issue, this commit turns the above BUG_ON into a -WARN_ON. 
This commit also adds a companion WARN_ON on request -insertion into the scheduler. - -[1] https://groups.google.com/d/msg/bfq-iosched/DDOTJBroBa4/VyU1zUFtCgAJ - -Reported-by: Alexandre Frade <admfrade@gmail.com> -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 4 +++- - 1 file changed, 3 insertions(+), 1 deletion(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 0c09609a6099..0fc757ae7a42 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -1540,6 +1540,8 @@ static void bfq_add_request(struct request *rq) - - BUG_ON(!RQ_BFQQ(rq)); - BUG_ON(RQ_BFQQ(rq) != bfqq); -+ WARN_ON(blk_rq_sectors(rq) == 0); -+ - elv_rb_add(&bfqq->sort_list, rq); - - /* -@@ -4962,7 +4964,7 @@ static void bfq_finish_request(struct request *rq) - rq_io_start_time_ns(rq), - rq->cmd_flags); - -- BUG_ON(blk_rq_sectors(rq) == 0 && !(rq->rq_flags & RQF_STARTED)); -+ WARN_ON(blk_rq_sectors(rq) == 0 && !(rq->rq_flags & RQF_STARTED)); - - if (likely(rq->rq_flags & RQF_STARTED)) { - unsigned long flags; - -From 1097d368a20456c88acd75b3184c68df38e8f7b8 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Sun, 12 Nov 2017 22:43:46 +0100 -Subject: [PATCH 02/23] block, bfq-sq, bfq-mq: consider also past I/O in soft - real-time detection - -BFQ privileges the I/O of soft real-time applications, such as video -players, to guarantee to these application a high bandwidth and a low -latency. In this respect, it is not easy to correctly detect when an -application is soft real-time. A particularly nasty false positive is -that of an I/O-bound application that occasionally happens to meet all -requirements to be deemed as soft real-time. After being detected as -soft real-time, such an application monopolizes the device. Fortunately, -BFQ will realize soon that the application is actually not soft -real-time and suspend every privilege. 
Yet, the application may happen -again to be wrongly detected as soft real-time, and so on. - -As highlighted by our tests, this problem causes BFQ to occasionally -fail to guarantee a high responsiveness, in the presence of heavy -background I/O workloads. The reason is that the background workload -happens to be detected as soft real-time, more or less frequently, -during the execution of the interactive task under test. To give an -idea, because of this problem, Libreoffice Writer occasionally takes 8 -seconds, instead of 3, to start up, if there are sequential reads and -writes in the background, on a Kingston SSDNow V300. - -This commit addresses this issue by leveraging the following facts. - -The reason why some applications are detected as soft real-time despite -all BFQ checks to avoid false positives, is simply that, during high -CPU or storage-device load, I/O-bound applications may happen to do -I/O slowly enough to meet all soft real-time requirements, and pass -all BFQ extra checks. Yet, this happens only for limited time periods: -slow-speed time intervals are usually interspersed between other time -intervals during which these applications do I/O at a very high speed. -To exploit these facts, this commit introduces a little change, in the -detection of soft real-time behavior, to systematically consider also -the recent past: the higher the speed was in the recent past, the -later next I/O should arrive for the application to be considered as -soft real-time. At the beginning of a slow-speed interval, the minimum -arrival time allowed for the next I/O usually happens to still be so -high, to fall *after* the end of the slow-speed period itself. As a -consequence, the application does not risk to be deemed as soft -real-time during the slow-speed interval. Then, during the next -high-speed interval, the application cannot, evidently, be deemed as -soft real-time (exactly because of its speed), and so on. 
- -This extra filtering proved to be rather effective: in the above test, -the frequency of false positives became so low that the start-up time -was 3 seconds in all iterations (apart from occasional outliers, -caused by page-cache-management issues, which are out of the scope of -this commit, and cannot be solved by an I/O scheduler). - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> -Signed-off-by: Angelo Ruocco <angeloruocco90@gmail.com> ---- - block/bfq-mq-iosched.c | 115 ++++++++++++++++++++++++++++++++++--------------- - block/bfq-sq-iosched.c | 115 ++++++++++++++++++++++++++++++++++--------------- - 2 files changed, 162 insertions(+), 68 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 0fc757ae7a42..4d06d900f45e 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -3201,37 +3201,78 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, - * whereas soft_rt_next_start is set to infinity for applications that do - * not. - * -- * Unfortunately, even a greedy application may happen to behave in an -- * isochronous way if the CPU load is high. In fact, the application may -- * stop issuing requests while the CPUs are busy serving other processes, -- * then restart, then stop again for a while, and so on. In addition, if -- * the disk achieves a low enough throughput with the request pattern -- * issued by the application (e.g., because the request pattern is random -- * and/or the device is slow), then the application may meet the above -- * bandwidth requirement too. To prevent such a greedy application to be -- * deemed as soft real-time, a further rule is used in the computation of -- * soft_rt_next_start: soft_rt_next_start must be higher than the current -- * time plus the maximum time for which the arrival of a request is waited -- * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. 
-- * This filters out greedy applications, as the latter issue instead their -- * next request as soon as possible after the last one has been completed -- * (in contrast, when a batch of requests is completed, a soft real-time -- * application spends some time processing data). -+ * Unfortunately, even a greedy (i.e., I/O-bound) application may -+ * happen to meet, occasionally or systematically, both the above -+ * bandwidth and isochrony requirements. This may happen at least in -+ * the following circumstances. First, if the CPU load is high. The -+ * application may stop issuing requests while the CPUs are busy -+ * serving other processes, then restart, then stop again for a while, -+ * and so on. The other circumstances are related to the storage -+ * device: the storage device is highly loaded or reaches a low-enough -+ * throughput with the I/O of the application (e.g., because the I/O -+ * is random and/or the device is slow). In all these cases, the -+ * I/O of the application may be simply slowed down enough to meet -+ * the bandwidth and isochrony requirements. To reduce the probability -+ * that greedy applications are deemed as soft real-time in these -+ * corner cases, a further rule is used in the computation of -+ * soft_rt_next_start: the return value of this function is forced to -+ * be higher than the maximum between the following two quantities. - * -- * Unfortunately, the last filter may easily generate false positives if -- * only bfqd->bfq_slice_idle is used as a reference time interval and one -- * or both the following cases occur: -- * 1) HZ is so low that the duration of a jiffy is comparable to or higher -- * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with -- * HZ=100. -+ * (a) Current time plus: (1) the maximum time for which the arrival -+ * of a request is waited for when a sync queue becomes idle, -+ * namely bfqd->bfq_slice_idle, and (2) a few extra jiffies. 
We -+ * postpone for a moment the reason for adding a few extra -+ * jiffies; we get back to it after next item (b). Lower-bounding -+ * the return value of this function with the current time plus -+ * bfqd->bfq_slice_idle tends to filter out greedy applications, -+ * because the latter issue their next request as soon as possible -+ * after the last one has been completed. In contrast, a soft -+ * real-time application spends some time processing data, after a -+ * batch of its requests has been completed. -+ * -+ * (b) Current value of bfqq->soft_rt_next_start. As pointed out -+ * above, greedy applications may happen to meet both the -+ * bandwidth and isochrony requirements under heavy CPU or -+ * storage-device load. In more detail, in these scenarios, these -+ * applications happen, only for limited time periods, to do I/O -+ * slowly enough to meet all the requirements described so far, -+ * including the filtering in above item (a). These slow-speed -+ * time intervals are usually interspersed between other time -+ * intervals during which these applications do I/O at a very high -+ * speed. Fortunately, exactly because of the high speed of the -+ * I/O in the high-speed intervals, the values returned by this -+ * function happen to be so high, near the end of any such -+ * high-speed interval, to be likely to fall *after* the end of -+ * the low-speed time interval that follows. These high values are -+ * stored in bfqq->soft_rt_next_start after each invocation of -+ * this function. As a consequence, if the last value of -+ * bfqq->soft_rt_next_start is constantly used to lower-bound the -+ * next value that this function may return, then, from the very -+ * beginning of a low-speed interval, bfqq->soft_rt_next_start is -+ * likely to be constantly kept so high that any I/O request -+ * issued during the low-speed interval is considered as arriving -+ * to soon for the application to be deemed as soft -+ * real-time. 
Then, in the high-speed interval that follows, the -+ * application will not be deemed as soft real-time, just because -+ * it will do I/O at a high speed. And so on. -+ * -+ * Getting back to the filtering in item (a), in the following two -+ * cases this filtering might be easily passed by a greedy -+ * application, if the reference quantity was just -+ * bfqd->bfq_slice_idle: -+ * 1) HZ is so low that the duration of a jiffy is comparable to or -+ * higher than bfqd->bfq_slice_idle. This happens, e.g., on slow -+ * devices with HZ=100. The time granularity may be so coarse -+ * that the approximation, in jiffies, of bfqd->bfq_slice_idle -+ * is rather lower than the exact value. - * 2) jiffies, instead of increasing at a constant rate, may stop increasing - * for a while, then suddenly 'jump' by several units to recover the lost - * increments. This seems to happen, e.g., inside virtual machines. -- * To address this issue, we do not use as a reference time interval just -- * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In -- * particular we add the minimum number of jiffies for which the filter -- * seems to be quite precise also in embedded systems and KVM/QEMU virtual -- * machines. -+ * To address this issue, in the filtering in (a) we do not use as a -+ * reference time interval just bfqd->bfq_slice_idle, but -+ * bfqd->bfq_slice_idle plus a few jiffies. In particular, we add the -+ * minimum number of jiffies for which the filter seems to be quite -+ * precise also in embedded systems and KVM/QEMU virtual machines. 
- */ - static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, - struct bfq_queue *bfqq) -@@ -3243,10 +3284,11 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, - jiffies_to_msecs(HZ * bfqq->service_from_backlogged / - bfqd->bfq_wr_max_softrt_rate)); - -- return max(bfqq->last_idle_bklogged + -- HZ * bfqq->service_from_backlogged / -- bfqd->bfq_wr_max_softrt_rate, -- jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); -+ return max3(bfqq->soft_rt_next_start, -+ bfqq->last_idle_bklogged + -+ HZ * bfqq->service_from_backlogged / -+ bfqd->bfq_wr_max_softrt_rate, -+ jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); - } - - /** -@@ -4395,10 +4437,15 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bfqq->split_time = bfq_smallest_from_now(); - - /* -- * Set to the value for which bfqq will not be deemed as -- * soft rt when it becomes backlogged. -+ * To not forget the possibly high bandwidth consumed by a -+ * process/queue in the recent past, -+ * bfq_bfqq_softrt_next_start() returns a value at least equal -+ * to the current value of bfqq->soft_rt_next_start (see -+ * comments on bfq_bfqq_softrt_next_start). Set -+ * soft_rt_next_start to now, to mean that bfqq has consumed -+ * no bandwidth so far. - */ -- bfqq->soft_rt_next_start = bfq_greatest_from_now(); -+ bfqq->soft_rt_next_start = jiffies; - - /* first request is almost certainly seeky */ - bfqq->seek_history = 1; -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index 4bbd7f4c0154..987dc255c82c 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -3089,37 +3089,78 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, - * whereas soft_rt_next_start is set to infinity for applications that do - * not. - * -- * Unfortunately, even a greedy application may happen to behave in an -- * isochronous way if the CPU load is high. 
In fact, the application may -- * stop issuing requests while the CPUs are busy serving other processes, -- * then restart, then stop again for a while, and so on. In addition, if -- * the disk achieves a low enough throughput with the request pattern -- * issued by the application (e.g., because the request pattern is random -- * and/or the device is slow), then the application may meet the above -- * bandwidth requirement too. To prevent such a greedy application to be -- * deemed as soft real-time, a further rule is used in the computation of -- * soft_rt_next_start: soft_rt_next_start must be higher than the current -- * time plus the maximum time for which the arrival of a request is waited -- * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. -- * This filters out greedy applications, as the latter issue instead their -- * next request as soon as possible after the last one has been completed -- * (in contrast, when a batch of requests is completed, a soft real-time -- * application spends some time processing data). -+ * Unfortunately, even a greedy (i.e., I/O-bound) application may -+ * happen to meet, occasionally or systematically, both the above -+ * bandwidth and isochrony requirements. This may happen at least in -+ * the following circumstances. First, if the CPU load is high. The -+ * application may stop issuing requests while the CPUs are busy -+ * serving other processes, then restart, then stop again for a while, -+ * and so on. The other circumstances are related to the storage -+ * device: the storage device is highly loaded or reaches a low-enough -+ * throughput with the I/O of the application (e.g., because the I/O -+ * is random and/or the device is slow). In all these cases, the -+ * I/O of the application may be simply slowed down enough to meet -+ * the bandwidth and isochrony requirements. 
To reduce the probability -+ * that greedy applications are deemed as soft real-time in these -+ * corner cases, a further rule is used in the computation of -+ * soft_rt_next_start: the return value of this function is forced to -+ * be higher than the maximum between the following two quantities. - * -- * Unfortunately, the last filter may easily generate false positives if -- * only bfqd->bfq_slice_idle is used as a reference time interval and one -- * or both the following cases occur: -- * 1) HZ is so low that the duration of a jiffy is comparable to or higher -- * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with -- * HZ=100. -+ * (a) Current time plus: (1) the maximum time for which the arrival -+ * of a request is waited for when a sync queue becomes idle, -+ * namely bfqd->bfq_slice_idle, and (2) a few extra jiffies. We -+ * postpone for a moment the reason for adding a few extra -+ * jiffies; we get back to it after next item (b). Lower-bounding -+ * the return value of this function with the current time plus -+ * bfqd->bfq_slice_idle tends to filter out greedy applications, -+ * because the latter issue their next request as soon as possible -+ * after the last one has been completed. In contrast, a soft -+ * real-time application spends some time processing data, after a -+ * batch of its requests has been completed. -+ * -+ * (b) Current value of bfqq->soft_rt_next_start. As pointed out -+ * above, greedy applications may happen to meet both the -+ * bandwidth and isochrony requirements under heavy CPU or -+ * storage-device load. In more detail, in these scenarios, these -+ * applications happen, only for limited time periods, to do I/O -+ * slowly enough to meet all the requirements described so far, -+ * including the filtering in above item (a). These slow-speed -+ * time intervals are usually interspersed between other time -+ * intervals during which these applications do I/O at a very high -+ * speed. 
Fortunately, exactly because of the high speed of the -+ * I/O in the high-speed intervals, the values returned by this -+ * function happen to be so high, near the end of any such -+ * high-speed interval, to be likely to fall *after* the end of -+ * the low-speed time interval that follows. These high values are -+ * stored in bfqq->soft_rt_next_start after each invocation of -+ * this function. As a consequence, if the last value of -+ * bfqq->soft_rt_next_start is constantly used to lower-bound the -+ * next value that this function may return, then, from the very -+ * beginning of a low-speed interval, bfqq->soft_rt_next_start is -+ * likely to be constantly kept so high that any I/O request -+ * issued during the low-speed interval is considered as arriving -+ * to soon for the application to be deemed as soft -+ * real-time. Then, in the high-speed interval that follows, the -+ * application will not be deemed as soft real-time, just because -+ * it will do I/O at a high speed. And so on. -+ * -+ * Getting back to the filtering in item (a), in the following two -+ * cases this filtering might be easily passed by a greedy -+ * application, if the reference quantity was just -+ * bfqd->bfq_slice_idle: -+ * 1) HZ is so low that the duration of a jiffy is comparable to or -+ * higher than bfqd->bfq_slice_idle. This happens, e.g., on slow -+ * devices with HZ=100. The time granularity may be so coarse -+ * that the approximation, in jiffies, of bfqd->bfq_slice_idle -+ * is rather lower than the exact value. - * 2) jiffies, instead of increasing at a constant rate, may stop increasing - * for a while, then suddenly 'jump' by several units to recover the lost - * increments. This seems to happen, e.g., inside virtual machines. -- * To address this issue, we do not use as a reference time interval just -- * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. 
In -- * particular we add the minimum number of jiffies for which the filter -- * seems to be quite precise also in embedded systems and KVM/QEMU virtual -- * machines. -+ * To address this issue, in the filtering in (a) we do not use as a -+ * reference time interval just bfqd->bfq_slice_idle, but -+ * bfqd->bfq_slice_idle plus a few jiffies. In particular, we add the -+ * minimum number of jiffies for which the filter seems to be quite -+ * precise also in embedded systems and KVM/QEMU virtual machines. - */ - static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, - struct bfq_queue *bfqq) -@@ -3131,10 +3172,11 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, - jiffies_to_msecs(HZ * bfqq->service_from_backlogged / - bfqd->bfq_wr_max_softrt_rate)); - -- return max(bfqq->last_idle_bklogged + -- HZ * bfqq->service_from_backlogged / -- bfqd->bfq_wr_max_softrt_rate, -- jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); -+ return max3(bfqq->soft_rt_next_start, -+ bfqq->last_idle_bklogged + -+ HZ * bfqq->service_from_backlogged / -+ bfqd->bfq_wr_max_softrt_rate, -+ jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); - } - - /** -@@ -4167,10 +4209,15 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bfqq->split_time = bfq_smallest_from_now(); - - /* -- * Set to the value for which bfqq will not be deemed as -- * soft rt when it becomes backlogged. -+ * To not forget the possibly high bandwidth consumed by a -+ * process/queue in the recent past, -+ * bfq_bfqq_softrt_next_start() returns a value at least equal -+ * to the current value of bfqq->soft_rt_next_start (see -+ * comments on bfq_bfqq_softrt_next_start). Set -+ * soft_rt_next_start to now, to mean that bfqq has consumed -+ * no bandwidth so far. 
- */ -- bfqq->soft_rt_next_start = bfq_greatest_from_now(); -+ bfqq->soft_rt_next_start = jiffies; - - /* first request is almost certainly seeky */ - bfqq->seek_history = 1; - -From 2a09b505660c81dbb80a5d68c9bc558c326d041f Mon Sep 17 00:00:00 2001 -From: Chiara Bruschi <bruschi.chiara@outlook.it> -Date: Thu, 7 Dec 2017 09:57:19 +0100 -Subject: [PATCH 03/23] block, bfq-mq: fix occurrences of request - prepare/finish methods' old names - -Commits 'b01f1fa3bb19' (Port of "blk-mq-sched: unify request prepare -methods") and 'cc10d2d7d2c1' (Port of "blk-mq-sched: unify request -finished methods") changed the old names of current bfq_prepare_request -and bfq_finish_request methods, but left them unchanged elsewhere in -the code (related comments, part of function name bfq_put_rq_priv_body). - -This commit fixes every occurrence of the old names of these methods -by changing them into the current names. - -Fixes: b01f1fa3bb19 (Port of "blk-mq-sched: unify request prepare methods") -Fixes: cc10d2d7d2c1 (Port of "blk-mq-sched: unify request finished methods") -Reviewed-by: Paolo Valente <paolo.valente@linaro.org> -Signed-off-by: Federico Motta <federico@willer.it> -Signed-off-by: Chiara Bruschi <bruschi.chiara@outlook.it> ---- - block/bfq-mq-iosched.c | 38 +++++++++++++++++++------------------- - 1 file changed, 19 insertions(+), 19 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 4d06d900f45e..8f8d5eccb016 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -4018,20 +4018,20 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - /* - * TESTING: reset DISP_LIST flag, because: 1) - * this rq this request has passed through -- * get_rq_private, 2) then it will have -- * put_rq_private invoked on it, and 3) in -- * put_rq_private we use this flag to check -- * that put_rq_private is not invoked on -- * requests for which get_rq_private has been -- * invoked. 
-+ * bfq_prepare_request, 2) then it will have -+ * bfq_finish_request invoked on it, and 3) in -+ * bfq_finish_request we use this flag to check -+ * that bfq_finish_request is not invoked on -+ * requests for which bfq_prepare_request has -+ * been invoked. - */ - rq->rq_flags &= ~RQF_DISP_LIST; - goto inc_in_driver_start_rq; - } - - /* -- * We exploit the put_rq_private hook to decrement -- * rq_in_driver, but put_rq_private will not be -+ * We exploit the bfq_finish_request hook to decrement -+ * rq_in_driver, but bfq_finish_request will not be - * invoked on this request. So, to avoid unbalance, - * just start this request, without incrementing - * rq_in_driver. As a negative consequence, -@@ -4040,14 +4040,14 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - * bfq_schedule_dispatch to be invoked uselessly. - * - * As for implementing an exact solution, the -- * put_request hook, if defined, is probably invoked -- * also on this request. So, by exploiting this hook, -- * we could 1) increment rq_in_driver here, and 2) -- * decrement it in put_request. Such a solution would -- * let the value of the counter be always accurate, -- * but it would entail using an extra interface -- * function. This cost seems higher than the benefit, -- * being the frequency of non-elevator-private -+ * bfq_finish_request hook, if defined, is probably -+ * invoked also on this request. So, by exploiting -+ * this hook, we could 1) increment rq_in_driver here, -+ * and 2) decrement it in bfq_finish_request. Such a -+ * solution would let the value of the counter be -+ * always accurate, but it would entail using an extra -+ * interface function. This cost seems higher than the -+ * benefit, being the frequency of non-elevator-private - * requests very low. 
- */ - goto start_rq; -@@ -4963,7 +4963,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) - } - } - --static void bfq_put_rq_priv_body(struct bfq_queue *bfqq) -+static void bfq_finish_request_body(struct bfq_queue *bfqq) - { - bfq_log_bfqq(bfqq->bfqd, bfqq, - "put_request_body: allocated %d", bfqq->allocated); -@@ -5019,7 +5019,7 @@ static void bfq_finish_request(struct request *rq) - spin_lock_irqsave(&bfqd->lock, flags); - - bfq_completed_request(bfqq, bfqd); -- bfq_put_rq_priv_body(bfqq); -+ bfq_finish_request_body(bfqq); - - spin_unlock_irqrestore(&bfqd->lock, flags); - } else { -@@ -5042,7 +5042,7 @@ static void bfq_finish_request(struct request *rq) - bfqg_stats_update_io_remove(bfqq_group(bfqq), - rq->cmd_flags); - } -- bfq_put_rq_priv_body(bfqq); -+ bfq_finish_request_body(bfqq); - } - - rq->elv.priv[0] = NULL; - -From 4df19943c3a767df453abea3d2ac3433c3326ce0 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Thu, 16 Nov 2017 18:38:13 +0100 -Subject: [PATCH 04/23] block, bfq-sq, bfq-mq: add missing rq_pos_tree update - on rq removal - -If two processes do I/O close to each other, then BFQ merges the -bfq_queues associated with these processes, to get a more sequential -I/O, and thus a higher throughput. In this respect, to detect whether -two processes are doing I/O close to each other, BFQ keeps a list of -the head-of-line I/O requests of all active bfq_queues. The list is -ordered by initial sectors, and implemented through a red-black tree -(rq_pos_tree). - -Unfortunately, the update of the rq_pos_tree was incomplete, because -the tree was not updated on the removal of the head-of-line I/O -request of a bfq_queue, in case the queue did not remain empty. This -commit adds the missing update. 
- -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> -Signed-off-by: Angelo Ruocco <angeloruocco90@gmail.com> ---- - block/bfq-mq-iosched.c | 3 +++ - block/bfq-sq-iosched.c | 3 +++ - 2 files changed, 6 insertions(+) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 8f8d5eccb016..603191c9008f 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -1729,6 +1729,9 @@ static void bfq_remove_request(struct request_queue *q, - rb_erase(&bfqq->pos_node, bfqq->pos_root); - bfqq->pos_root = NULL; - } -+ } else { -+ BUG_ON(!bfqq->next_rq); -+ bfq_pos_tree_add_move(bfqd, bfqq); - } - - if (rq->cmd_flags & REQ_META) { -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index 987dc255c82c..ea90ace79e49 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -1669,6 +1669,9 @@ static void bfq_remove_request(struct request *rq) - rb_erase(&bfqq->pos_node, bfqq->pos_root); - bfqq->pos_root = NULL; - } -+ } else { -+ BUG_ON(!bfqq->next_rq); -+ bfq_pos_tree_add_move(bfqd, bfqq); - } - - if (rq->cmd_flags & REQ_META) { - -From b844e345140aaea957d84a21d2aa67588b020cd5 Mon Sep 17 00:00:00 2001 -From: Angelo Ruocco <angeloruocco90@gmail.com> -Date: Mon, 18 Dec 2017 08:28:08 +0100 -Subject: [PATCH 05/23] block, bfq-sq, bfq-mq: check low_latency flag in - bfq_bfqq_save_state() - -A just-created bfq_queue will certainly be deemed as interactive on -the arrival of its first I/O request, if the low_latency flag is -set. Yet, if the queue is merged with another queue on the arrival of -its first I/O request, it will not have the chance to be flagged as -interactive. Nevertheless, if the queue is then split soon enough, it -has to be flagged as interactive after the split. - -To handle this early-merge scenario correctly, BFQ saves the state of -the queue, on the merge, as if the latter had already been deemed -interactive. 
So, if the queue is split soon, it will get -weight-raised, because the previous state of the queue is resumed on -the split. - -Unfortunately, in the act of saving the state of the newly-created -queue, BFQ doesn't check whether the low_latency flag is set, and this -causes early-merged queues to be then weight-raised, on queue splits, -even if low_latency is off. This commit addresses this problem by -adding the missing check. - -Signed-off-by: Angelo Ruocco <angeloruocco90@gmail.com> -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 3 ++- - block/bfq-sq-iosched.c | 3 ++- - 2 files changed, 4 insertions(+), 2 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 603191c9008f..ff9776c8836a 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -2231,7 +2231,8 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) - bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); - bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); - if (unlikely(bfq_bfqq_just_created(bfqq) && -- !bfq_bfqq_in_large_burst(bfqq))) { -+ !bfq_bfqq_in_large_burst(bfqq) && -+ bfqq->bfqd->low_latency)) { - /* - * bfqq being merged ritgh after being created: bfqq - * would have deserved interactive weight raising, but -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index ea90ace79e49..3a2d764e760c 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -2109,7 +2109,8 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) - bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); - bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); - if (unlikely(bfq_bfqq_just_created(bfqq) && -- !bfq_bfqq_in_large_burst(bfqq))) { -+ !bfq_bfqq_in_large_burst(bfqq) && -+ bfqq->bfqd->low_latency)) { - /* - * bfqq being merged ritgh after being created: bfqq - * would have deserved interactive weight raising, but - -From 4cc6896fe1de2e0b4de151a6e70658f10b9ec2fa 
Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Fri, 27 Oct 2017 11:12:14 +0200 -Subject: [PATCH 06/23] block, bfq-sq, bfq-mq: let a queue be merged only - shortly after starting I/O - -In BFQ and CFQ, two processes are said to be cooperating if they do -I/O in such a way that the union of their I/O requests yields a -sequential I/O pattern. To get such a sequential I/O pattern out of -the non-sequential pattern of each cooperating process, BFQ and CFQ -merge the queues associated with these processes. In more detail, -cooperating processes, and thus their associated queues, usually -start, or restart, to do I/O shortly after each other. This is the -case, e.g., for the I/O threads of KVM/QEMU and of the dump -utility. Basing on this assumption, this commit allows a bfq_queue to -be merged only during a short time interval (100ms) after it starts, -or re-starts, to do I/O. This filtering provides two important -benefits. - -First, it greatly reduces the probability that two non-cooperating -processes have their queues merged by mistake, if they just happen to -do I/O close to each other for a short time interval. These spurious -merges cause loss of service guarantees. A low-weight bfq_queue may -unjustly get more than its expected share of the throughput: if such a -low-weight queue is merged with a high-weight queue, then the I/O for -the low-weight queue is served as if the queue had a high weight. This -may damage other high-weight queues unexpectedly. For instance, -because of this issue, lxterminal occasionally took 7.5 seconds to -start, instead of 6.5 seconds, when some sequential readers and -writers did I/O in the background on a FUJITSU MHX2300BT HDD. The -reason is that the bfq_queues associated with some of the readers or -the writers were merged with the high-weight queues of some processes -that had to do some urgent but little I/O. 
The readers then exploited -the inherited high weight for all or most of their I/O, during the -start-up of terminal. The filtering introduced by this commit -eliminated any outlier caused by spurious queue merges in our start-up -time tests. - -This filtering also provides a little boost of the throughput -sustainable by BFQ: 3-4%, depending on the CPU. The reason is that, -once a bfq_queue cannot be merged any longer, this commit makes BFQ -stop updating the data needed to handle merging for the queue. - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> -Signed-off-by: Angelo Ruocco <angeloruocco90@gmail.com> ---- - block/bfq-mq-iosched.c | 64 +++++++++++++++++++++++++++++++++++++++++--------- - block/bfq-mq.h | 1 + - block/bfq-sched.c | 4 ++++ - block/bfq-sq-iosched.c | 64 +++++++++++++++++++++++++++++++++++++++++--------- - block/bfq.h | 2 ++ - 5 files changed, 113 insertions(+), 22 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index ff9776c8836a..8b17b25a3c30 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -119,6 +119,20 @@ static const int bfq_async_charge_factor = 10; - /* Default timeout values, in jiffies, approximating CFQ defaults. */ - static const int bfq_timeout = (HZ / 8); - -+/* -+ * Time limit for merging (see comments in bfq_setup_cooperator). Set -+ * to the slowest value that, in our tests, proved to be effective in -+ * removing false positives, while not causing true positives to miss -+ * queue merging. -+ * -+ * As can be deduced from the low time limit below, queue merging, if -+ * successful, happens at the very beggining of the I/O of the involved -+ * cooperating processes, as a consequence of the arrival of the very -+ * first requests from each cooperator. After that, there is very -+ * little chance to find cooperators. 
-+ */ -+static const unsigned long bfq_merge_time_limit = HZ/10; -+ - static struct kmem_cache *bfq_pool; - - /* Below this threshold (in ns), we consider thinktime immediate. */ -@@ -389,6 +403,13 @@ bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, - return bfqq; - } - -+static bool bfq_too_late_for_merging(struct bfq_queue *bfqq) -+{ -+ return bfqq->service_from_backlogged > 0 && -+ time_is_before_jiffies(bfqq->first_IO_time + -+ bfq_merge_time_limit); -+} -+ - static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) - { - struct rb_node **p, *parent; -@@ -399,6 +420,14 @@ static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) - bfqq->pos_root = NULL; - } - -+ /* -+ * bfqq cannot be merged any longer (see comments in -+ * bfq_setup_cooperator): no point in adding bfqq into the -+ * position tree. -+ */ -+ if (bfq_too_late_for_merging(bfqq)) -+ return; -+ - if (bfq_class_idle(bfqq)) - return; - if (!bfqq->next_rq) -@@ -2081,6 +2110,13 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) - static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, - struct bfq_queue *new_bfqq) - { -+ if (bfq_too_late_for_merging(new_bfqq)) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "[%s] too late for bfq%d to be merged", -+ __func__, new_bfqq->pid); -+ return false; -+ } -+ - if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) || - (bfqq->ioprio_class != new_bfqq->ioprio_class)) - return false; -@@ -2149,6 +2185,23 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, - { - struct bfq_queue *in_service_bfqq, *new_bfqq; - -+ /* -+ * Prevent bfqq from being merged if it has been created too -+ * long ago. The idea is that true cooperating processes, and -+ * thus their associated bfq_queues, are supposed to be -+ * created shortly after each other. This is the case, e.g., -+ * for KVM/QEMU and dump I/O threads. 
Basing on this -+ * assumption, the following filtering greatly reduces the -+ * probability that two non-cooperating processes, which just -+ * happen to do close I/O for some short time interval, have -+ * their queues merged by mistake. -+ */ -+ if (bfq_too_late_for_merging(bfqq)) { -+ bfq_log_bfqq(bfqd, bfqq, -+ "would have looked for coop, but too late"); -+ return NULL; -+ } -+ - if (bfqq->new_bfqq) - return bfqq->new_bfqq; - -@@ -3338,17 +3391,6 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, - */ - slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); - -- /* -- * Increase service_from_backlogged before next statement, -- * because the possible next invocation of -- * bfq_bfqq_charge_time would likely inflate -- * entity->service. In contrast, service_from_backlogged must -- * contain real service, to enable the soft real-time -- * heuristic to correctly compute the bandwidth consumed by -- * bfqq. -- */ -- bfqq->service_from_backlogged += entity->service; -- - /* - * As above explained, charge slow (typically seeky) and - * timed-out queues with the time and not the service -diff --git a/block/bfq-mq.h b/block/bfq-mq.h -index 1cb05bb853d2..a5947b203ef2 100644 ---- a/block/bfq-mq.h -+++ b/block/bfq-mq.h -@@ -337,6 +337,7 @@ struct bfq_queue { - unsigned long wr_start_at_switch_to_srt; - - unsigned long split_time; /* time of last split */ -+ unsigned long first_IO_time; /* time of first I/O for this queue */ - }; - - /** -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index 616c0692335a..9d261dd428e4 100644 ---- a/block/bfq-sched.c -+++ b/block/bfq-sched.c -@@ -939,6 +939,10 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) - struct bfq_entity *entity = &bfqq->entity; - struct bfq_service_tree *st; - -+ if (!bfqq->service_from_backlogged) -+ bfqq->first_IO_time = jiffies; -+ -+ bfqq->service_from_backlogged += served; - for_each_entity(entity) { - st = bfq_entity_service_tree(entity); - -diff --git 
a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index 3a2d764e760c..cd00a41ca35d 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -113,6 +113,20 @@ static const int bfq_async_charge_factor = 10; - /* Default timeout values, in jiffies, approximating CFQ defaults. */ - static const int bfq_timeout = (HZ / 8); - -+/* -+ * Time limit for merging (see comments in bfq_setup_cooperator). Set -+ * to the slowest value that, in our tests, proved to be effective in -+ * removing false positives, while not causing true positives to miss -+ * queue merging. -+ * -+ * As can be deduced from the low time limit below, queue merging, if -+ * successful, happens at the very beggining of the I/O of the involved -+ * cooperating processes, as a consequence of the arrival of the very -+ * first requests from each cooperator. After that, there is very -+ * little chance to find cooperators. -+ */ -+static const unsigned long bfq_merge_time_limit = HZ/10; -+ - static struct kmem_cache *bfq_pool; - - /* Below this threshold (in ns), we consider thinktime immediate. */ -@@ -351,6 +365,13 @@ bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, - return bfqq; - } - -+static bool bfq_too_late_for_merging(struct bfq_queue *bfqq) -+{ -+ return bfqq->service_from_backlogged > 0 && -+ time_is_before_jiffies(bfqq->first_IO_time + -+ bfq_merge_time_limit); -+} -+ - static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) - { - struct rb_node **p, *parent; -@@ -361,6 +382,14 @@ static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) - bfqq->pos_root = NULL; - } - -+ /* -+ * bfqq cannot be merged any longer (see comments in -+ * bfq_setup_cooperator): no point in adding bfqq into the -+ * position tree. 
-+ */ -+ if (bfq_too_late_for_merging(bfqq)) -+ return; -+ - if (bfq_class_idle(bfqq)) - return; - if (!bfqq->next_rq) -@@ -1960,6 +1989,13 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) - static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, - struct bfq_queue *new_bfqq) - { -+ if (bfq_too_late_for_merging(new_bfqq)) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "[%s] too late for bfq%d to be merged", -+ __func__, new_bfqq->pid); -+ return false; -+ } -+ - if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) || - (bfqq->ioprio_class != new_bfqq->ioprio_class)) - return false; -@@ -2028,6 +2064,23 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, - { - struct bfq_queue *in_service_bfqq, *new_bfqq; - -+ /* -+ * Prevent bfqq from being merged if it has been created too -+ * long ago. The idea is that true cooperating processes, and -+ * thus their associated bfq_queues, are supposed to be -+ * created shortly after each other. This is the case, e.g., -+ * for KVM/QEMU and dump I/O threads. Basing on this -+ * assumption, the following filtering greatly reduces the -+ * probability that two non-cooperating processes, which just -+ * happen to do close I/O for some short time interval, have -+ * their queues merged by mistake. -+ */ -+ if (bfq_too_late_for_merging(bfqq)) { -+ bfq_log_bfqq(bfqd, bfqq, -+ "would have looked for coop, but too late"); -+ return NULL; -+ } -+ - if (bfqq->new_bfqq) - return bfqq->new_bfqq; - -@@ -3226,17 +3279,6 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, - */ - slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); - -- /* -- * Increase service_from_backlogged before next statement, -- * because the possible next invocation of -- * bfq_bfqq_charge_time would likely inflate -- * entity->service. In contrast, service_from_backlogged must -- * contain real service, to enable the soft real-time -- * heuristic to correctly compute the bandwidth consumed by -- * bfqq. 
-- */ -- bfqq->service_from_backlogged += entity->service; -- - /* - * As above explained, charge slow (typically seeky) and - * timed-out queues with the time and not the service -diff --git a/block/bfq.h b/block/bfq.h -index 47cd4d5a8c32..59539adc00a5 100644 ---- a/block/bfq.h -+++ b/block/bfq.h -@@ -329,6 +329,8 @@ struct bfq_queue { - unsigned long wr_start_at_switch_to_srt; - - unsigned long split_time; /* time of last split */ -+ -+ unsigned long first_IO_time; /* time of first I/O for this queue */ - }; - - /** - -From 157f39c43ab182280634cd4f6335d0187b3741a0 Mon Sep 17 00:00:00 2001 -From: Angelo Ruocco <angeloruocco90@gmail.com> -Date: Mon, 11 Dec 2017 14:19:54 +0100 -Subject: [PATCH 07/23] block, bfq-sq, bfq-mq: remove superfluous check in - queue-merging setup - -When two or more processes do I/O in a way that the their requests are -sequential in respect to one another, BFQ merges the bfq_queues associated -with the processes. This way the overall I/O pattern becomes sequential, -and thus there is a boost in througput. -These cooperating processes usually start or restart to do I/O shortly -after each other. So, in order to avoid merging non-cooperating processes, -BFQ ensures that none of these queues has been in weight raising for too -long. - -In this respect, from commit "block, bfq-sq, bfq-mq: let a queue be merged -only shortly after being created", BFQ checks whether any queue (and not -only weight-raised ones) is doing I/O continuously from too long to be -merged. - -This new additional check makes the first one useless: a queue doing -I/O from long enough, if being weight-raised, is also a queue in -weight raising for too long to be merged. Accordingly, this commit -removes the first check. 
- -Signed-off-by: Angelo Ruocco <angeloruocco90@gmail.com> -Signed-off-by: Paolo Valente <paolo.valente@linaro.com> ---- - block/bfq-mq-iosched.c | 53 ++++---------------------------------------------- - block/bfq-sq-iosched.c | 53 ++++---------------------------------------------- - 2 files changed, 8 insertions(+), 98 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 8b17b25a3c30..f5db8613a70f 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -2140,20 +2140,6 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, - return true; - } - --/* -- * If this function returns true, then bfqq cannot be merged. The idea -- * is that true cooperation happens very early after processes start -- * to do I/O. Usually, late cooperations are just accidental false -- * positives. In case bfqq is weight-raised, such false positives -- * would evidently degrade latency guarantees for bfqq. -- */ --static bool wr_from_too_long(struct bfq_queue *bfqq) --{ -- return bfqq->wr_coeff > 1 && -- time_is_before_jiffies(bfqq->last_wr_start_finish + -- msecs_to_jiffies(100)); --} -- - /* - * Attempt to schedule a merge of bfqq with the currently in-service - * queue or with a close queue among the scheduled queues. Return -@@ -2167,11 +2153,6 @@ static bool wr_from_too_long(struct bfq_queue *bfqq) - * to maintain. Besides, in such a critical condition as an out of memory, - * the benefits of queue merging may be little relevant, or even negligible. - * -- * Weight-raised queues can be merged only if their weight-raising -- * period has just started. In fact cooperating processes are usually -- * started together. Thus, with this filter we avoid false positives -- * that would jeopardize low-latency guarantees. 
-- * - * WARNING: queue merging may impair fairness among non-weight raised - * queues, for at least two reasons: 1) the original weight of a - * merged queue may change during the merged state, 2) even being the -@@ -2205,15 +2186,7 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, - if (bfqq->new_bfqq) - return bfqq->new_bfqq; - -- if (io_struct && wr_from_too_long(bfqq) && -- likely(bfqq != &bfqd->oom_bfqq)) -- bfq_log_bfqq(bfqd, bfqq, -- "would have looked for coop, but bfq%d wr", -- bfqq->pid); -- -- if (!io_struct || -- wr_from_too_long(bfqq) || -- unlikely(bfqq == &bfqd->oom_bfqq)) -+ if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) - return NULL; - - /* If there is only one backlogged queue, don't search. */ -@@ -2223,17 +2196,8 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, - in_service_bfqq = bfqd->in_service_queue; - - if (in_service_bfqq && in_service_bfqq != bfqq && -- wr_from_too_long(in_service_bfqq) -- && likely(in_service_bfqq == &bfqd->oom_bfqq)) -- bfq_log_bfqq(bfqd, bfqq, -- "would have tried merge with in-service-queue, but wr"); -- -- if (!in_service_bfqq || in_service_bfqq == bfqq -- || wr_from_too_long(in_service_bfqq) || -- unlikely(in_service_bfqq == &bfqd->oom_bfqq)) -- goto check_scheduled; -- -- if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && -+ likely(in_service_bfqq != &bfqd->oom_bfqq) && -+ bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && - bfqq->entity.parent == in_service_bfqq->entity.parent && - bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) { - new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); -@@ -2245,21 +2209,12 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, - * queues. The only thing we need is that the bio/request is not - * NULL, as we need it to establish whether a cooperator exists. 
- */ --check_scheduled: - new_bfqq = bfq_find_close_cooperator(bfqd, bfqq, - bfq_io_struct_pos(io_struct, request)); - - BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent); - -- if (new_bfqq && wr_from_too_long(new_bfqq) && -- likely(new_bfqq != &bfqd->oom_bfqq) && -- bfq_may_be_close_cooperator(bfqq, new_bfqq)) -- bfq_log_bfqq(bfqd, bfqq, -- "would have merged with bfq%d, but wr", -- new_bfqq->pid); -- -- if (new_bfqq && !wr_from_too_long(new_bfqq) && -- likely(new_bfqq != &bfqd->oom_bfqq) && -+ if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) && - bfq_may_be_close_cooperator(bfqq, new_bfqq)) - return bfq_setup_merge(bfqq, new_bfqq); - -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index cd00a41ca35d..d8a358e5e284 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -2019,20 +2019,6 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, - return true; - } - --/* -- * If this function returns true, then bfqq cannot be merged. The idea -- * is that true cooperation happens very early after processes start -- * to do I/O. Usually, late cooperations are just accidental false -- * positives. In case bfqq is weight-raised, such false positives -- * would evidently degrade latency guarantees for bfqq. -- */ --static bool wr_from_too_long(struct bfq_queue *bfqq) --{ -- return bfqq->wr_coeff > 1 && -- time_is_before_jiffies(bfqq->last_wr_start_finish + -- msecs_to_jiffies(100)); --} -- - /* - * Attempt to schedule a merge of bfqq with the currently in-service - * queue or with a close queue among the scheduled queues. Return -@@ -2046,11 +2032,6 @@ static bool wr_from_too_long(struct bfq_queue *bfqq) - * to maintain. Besides, in such a critical condition as an out of memory, - * the benefits of queue merging may be little relevant, or even negligible. - * -- * Weight-raised queues can be merged only if their weight-raising -- * period has just started. 
In fact cooperating processes are usually -- * started together. Thus, with this filter we avoid false positives -- * that would jeopardize low-latency guarantees. -- * - * WARNING: queue merging may impair fairness among non-weight raised - * queues, for at least two reasons: 1) the original weight of a - * merged queue may change during the merged state, 2) even being the -@@ -2084,15 +2065,7 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, - if (bfqq->new_bfqq) - return bfqq->new_bfqq; - -- if (io_struct && wr_from_too_long(bfqq) && -- likely(bfqq != &bfqd->oom_bfqq)) -- bfq_log_bfqq(bfqd, bfqq, -- "would have looked for coop, but bfq%d wr", -- bfqq->pid); -- -- if (!io_struct || -- wr_from_too_long(bfqq) || -- unlikely(bfqq == &bfqd->oom_bfqq)) -+ if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) - return NULL; - - /* If there is only one backlogged queue, don't search. */ -@@ -2102,17 +2075,8 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, - in_service_bfqq = bfqd->in_service_queue; - - if (in_service_bfqq && in_service_bfqq != bfqq && -- bfqd->in_service_bic && wr_from_too_long(in_service_bfqq) -- && likely(in_service_bfqq == &bfqd->oom_bfqq)) -- bfq_log_bfqq(bfqd, bfqq, -- "would have tried merge with in-service-queue, but wr"); -- -- if (!in_service_bfqq || in_service_bfqq == bfqq || -- !bfqd->in_service_bic || wr_from_too_long(in_service_bfqq) || -- unlikely(in_service_bfqq == &bfqd->oom_bfqq)) -- goto check_scheduled; -- -- if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && -+ likely(in_service_bfqq != &bfqd->oom_bfqq) && -+ bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && - bfqq->entity.parent == in_service_bfqq->entity.parent && - bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) { - new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); -@@ -2124,21 +2088,12 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, - * queues. 
The only thing we need is that the bio/request is not - * NULL, as we need it to establish whether a cooperator exists. - */ --check_scheduled: - new_bfqq = bfq_find_close_cooperator(bfqd, bfqq, - bfq_io_struct_pos(io_struct, request)); - - BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent); - -- if (new_bfqq && wr_from_too_long(new_bfqq) && -- likely(new_bfqq != &bfqd->oom_bfqq) && -- bfq_may_be_close_cooperator(bfqq, new_bfqq)) -- bfq_log_bfqq(bfqd, bfqq, -- "would have merged with bfq%d, but wr", -- new_bfqq->pid); -- -- if (new_bfqq && !wr_from_too_long(new_bfqq) && -- likely(new_bfqq != &bfqd->oom_bfqq) && -+ if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) && - bfq_may_be_close_cooperator(bfqq, new_bfqq)) - return bfq_setup_merge(bfqq, new_bfqq); - - -From b82eb91d87f172aba7eb5eb98e8d5e2a621adf51 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Thu, 30 Nov 2017 17:48:28 +0100 -Subject: [PATCH 08/23] block, bfq-sq, bfq-mq: increase threshold to deem I/O - as random - -If two processes do I/O close to each other, i.e., are cooperating -processes in BFQ (and CFQ'S) nomenclature, then BFQ merges their -associated bfq_queues, so as to get sequential I/O from the union of -the I/O requests of the processes, and thus reach a higher -throughput. A merged queue is then split if its I/O stops being -sequential. In this respect, BFQ deems the I/O of a bfq_queue as -(mostly) sequential only if less than 4 I/O requests are random, out -of the last 32 requests inserted into the queue. - -Unfortunately, extensive testing (with the interleaved_io benchmark of -the S suite [1], and with real applications spawning cooperating -processes) has clearly shown that, with such a low threshold, only a -rather low I/O throughput may be reached when several cooperating -processes do I/O. 
In particular, the outcome of each test run was -bimodal: if queue merging occurred and was stable during the test, -then the throughput was close to the peak rate of the storage device, -otherwise the throughput was arbitrarily low (usually around 1/10 of -the peak rate with a rotational device). The probability to get the -unlucky outcomes grew with the number of cooperating processes: it was -already significant with 5 processes, and close to one with 7 or more -processes. - -The cause of the low throughput in the unlucky runs was that the -merged queues containing the I/O of these cooperating processes were -soon split, because they contained more random I/O requests than those -tolerated by the 4/32 threshold, but -- that I/O would have however allowed the storage device to reach - peak throughput or almost peak throughput; -- in contrast, the I/O of these processes, if served individually - (from separate queues) yielded a rather low throughput. - -So we repeated our tests with increasing values of the threshold, -until we found the minimum value (19) for which we obtained maximum -throughput, reliably, with at least up to 9 cooperating -processes. Then we checked that the use of that higher threshold value -did not cause any regression for any other benchmark in the suite [1]. -This commit raises the threshold to such a higher value. 
- -[1] https://github.com/Algodev-github/S - -Signed-off-by: Angelo Ruocco <angeloruocco90@gmail.com> -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 2 +- - block/bfq-sq-iosched.c | 2 +- - 2 files changed, 2 insertions(+), 2 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index f5db8613a70f..cb5f49ddecb6 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -145,7 +145,7 @@ static struct kmem_cache *bfq_pool; - #define BFQQ_SEEK_THR (sector_t)(8 * 100) - #define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32) - #define BFQQ_CLOSE_THR (sector_t)(8 * 1024) --#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8) -+#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 19) - - /* Min number of samples required to perform peak-rate update */ - #define BFQ_RATE_MIN_SAMPLES 32 -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index d8a358e5e284..e1c6dc651be1 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -139,7 +139,7 @@ static struct kmem_cache *bfq_pool; - #define BFQQ_SEEK_THR (sector_t)(8 * 100) - #define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32) - #define BFQQ_CLOSE_THR (sector_t)(8 * 1024) --#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8) -+#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 19) - - /* Min number of samples required to perform peak-rate update */ - #define BFQ_RATE_MIN_SAMPLES 32 - -From b739dda4e4b3a1cbbc905f86f9fbb0860b068ce7 Mon Sep 17 00:00:00 2001 -From: Chiara Bruschi <bruschi.chiara@outlook.it> -Date: Mon, 11 Dec 2017 18:55:26 +0100 -Subject: [PATCH 09/23] block, bfq-sq, bfq-mq: specify usage condition of - delta_us in bfq_log_bfqq call - -Inside the function bfq_completed_request the value of a variable -called delta_us is computed as current request completion time. 
-delta_us is used inside a call to the function bfq_log_bfqq as divisor -in a division operation to compute a rate value, but no check makes -sure that delta_us has non-zero value. A divisor with value 0 leads -to a division error that could result in a kernel oops (therefore -unstable/unreliable system state) and consequently cause kernel panic -if resources are unavailable after the system fault. - -This commit fixes this call to bfq_log_bfqq specifying the condition -that allows delta_us to be safely used as divisor. - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> -Signed-off-by: Chiara Bruschi <bruschi.chiara@outlook.it> ---- - block/bfq-mq-iosched.c | 5 ++++- - block/bfq-sq-iosched.c | 5 ++++- - 2 files changed, 8 insertions(+), 2 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index cb5f49ddecb6..6ce2c0789046 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -4904,9 +4904,12 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) - bfq_log_bfqq(bfqd, bfqq, - "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu", - delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size, -+ delta_us > 0 ? - (USEC_PER_SEC* - (u64)((bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us)) -- >>BFQ_RATE_SHIFT, -+ >>BFQ_RATE_SHIFT : -+ (USEC_PER_SEC* -+ (u64)(bfqd->last_rq_max_size<<BFQ_RATE_SHIFT))>>BFQ_RATE_SHIFT, - (USEC_PER_SEC*(u64)(1UL<<(BFQ_RATE_SHIFT-10)))>>BFQ_RATE_SHIFT); - - /* -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index e1c6dc651be1..eff4c4edf5a0 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -4565,9 +4565,12 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) - - bfq_log(bfqd, "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu", - delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size, -+ delta_us > 0 ? 
- (USEC_PER_SEC* - (u64)((bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us)) -- >>BFQ_RATE_SHIFT, -+ >>BFQ_RATE_SHIFT : -+ (USEC_PER_SEC* -+ (u64)(bfqd->last_rq_max_size<<BFQ_RATE_SHIFT))>>BFQ_RATE_SHIFT, - (USEC_PER_SEC*(u64)(1UL<<(BFQ_RATE_SHIFT-10)))>>BFQ_RATE_SHIFT); - - /* - -From ae4310c13eca762644734d53074d8456c85e2dec Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Tue, 19 Dec 2017 12:07:12 +0100 -Subject: [PATCH 10/23] block, bfq-mq: limit tags for writes and async I/O - -Asynchronous I/O can easily starve synchronous I/O (both sync reads -and sync writes), by consuming all request tags. Similarly, storms of -synchronous writes, such as those that sync(2) may trigger, can starve -synchronous reads. In their turn, these two problems may also cause -BFQ to loose control on latency for interactive and soft real-time -applications. For example, on a PLEXTOR PX-256M5S SSD, LibreOffice -Writer takes 0.6 seconds to start if the device is idle, but it takes -more than 45 seconds (!) if there are sequential writes in the -background. - -This commit addresses this issue by limiting the maximum percentage of -tags that asynchronous I/O requests and synchronous write requests can -consume. In particular, this commit grants a higher threshold to -synchronous writes, to prevent the latter from being starved by -asynchronous I/O. - -According to the above test, LibreOffice Writer now starts in about -1.2 seconds on average, regardless of the background workload, and -apart from some rare outlier. To check this improvement, run, e.g., -sudo ./comm_startup_lat.sh bfq-mq 5 5 seq 10 "lowriter --terminate_after_init" -for the comm_startup_lat benchmark in the S suite [1]. 
- -[1] https://github.com/Algodev-github/S - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++ - block/bfq-mq.h | 12 ++++++++ - 2 files changed, 89 insertions(+) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 6ce2c0789046..f384f5566672 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -362,6 +362,82 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd, - } - } - -+/* -+ * See the comments on bfq_limit_depth for the purpose of -+ * the depths set in the function. -+ */ -+static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt) -+{ -+ bfqd->sb_shift = bt->sb.shift; -+ -+ /* -+ * In-word depths if no bfq_queue is being weight-raised: -+ * leaving 25% of tags only for sync reads. -+ * -+ * In next formulas, right-shift the value -+ * (1U<<bfqd->sb_shift), instead of computing directly -+ * (1U<<(bfqd->sb_shift - something)), to be robust against -+ * any possible value of bfqd->sb_shift, without having to -+ * limit 'something'. -+ */ -+ /* no more than 50% of tags for async I/O */ -+ bfqd->word_depths[0][0] = max((1U<<bfqd->sb_shift)>>1, 1U); -+ /* -+ * no more than 75% of tags for sync writes (25% extra tags -+ * w.r.t. async I/O, to prevent async I/O from starving sync -+ * writes) -+ */ -+ bfqd->word_depths[0][1] = max(((1U<<bfqd->sb_shift) * 3)>>2, 1U); -+ -+ /* -+ * In-word depths in case some bfq_queue is being weight- -+ * raised: leaving ~63% of tags for sync reads. This is the -+ * highest percentage for which, in our tests, application -+ * start-up times didn't suffer from any regression due to tag -+ * shortage. 
-+ */ -+ /* no more than ~18% of tags for async I/O */ -+ bfqd->word_depths[1][0] = max(((1U<<bfqd->sb_shift) * 3)>>4, 1U); -+ /* no more than ~37% of tags for sync writes (~20% extra tags) */ -+ bfqd->word_depths[1][1] = max(((1U<<bfqd->sb_shift) * 6)>>4, 1U); -+} -+ -+/* -+ * Async I/O can easily starve sync I/O (both sync reads and sync -+ * writes), by consuming all tags. Similarly, storms of sync writes, -+ * such as those that sync(2) may trigger, can starve sync reads. -+ * Limit depths of async I/O and sync writes so as to counter both -+ * problems. -+ */ -+static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) -+{ -+ struct blk_mq_tags *tags = blk_mq_tags_from_data(data); -+ struct bfq_data *bfqd = data->q->elevator->elevator_data; -+ struct sbitmap_queue *bt; -+ -+ if (op_is_sync(op) && !op_is_write(op)) -+ return; -+ -+ if (data->flags & BLK_MQ_REQ_RESERVED) { -+ if (unlikely(!tags->nr_reserved_tags)) { -+ WARN_ON_ONCE(1); -+ return; -+ } -+ bt = &tags->breserved_tags; -+ } else -+ bt = &tags->bitmap_tags; -+ -+ if (unlikely(bfqd->sb_shift != bt->sb.shift)) -+ bfq_update_depths(bfqd, bt); -+ -+ data->shallow_depth = -+ bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)]; -+ -+ bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u", -+ __func__, bfqd->wr_busy_queues, op_is_sync(op), -+ data->shallow_depth); -+} -+ - static struct bfq_queue * - bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, - sector_t sector, struct rb_node **ret_parent, -@@ -5812,6 +5888,7 @@ static struct elv_fs_entry bfq_attrs[] = { - - static struct elevator_type iosched_bfq_mq = { - .ops.mq = { -+ .limit_depth = bfq_limit_depth, - .prepare_request = bfq_prepare_request, - .finish_request = bfq_finish_request, - .exit_icq = bfq_exit_icq, -diff --git a/block/bfq-mq.h b/block/bfq-mq.h -index a5947b203ef2..458099ee0308 100644 ---- a/block/bfq-mq.h -+++ b/block/bfq-mq.h -@@ -619,6 +619,18 @@ struct bfq_data { - struct bfq_queue *bio_bfqq; - /* 
Extra flag used only for TESTING */ - bool bio_bfqq_set; -+ -+ /* -+ * Cached sbitmap shift, used to compute depth limits in -+ * bfq_update_depths. -+ */ -+ unsigned int sb_shift; -+ -+ /* -+ * Depth limits used in bfq_limit_depth (see comments on the -+ * function) -+ */ -+ unsigned int word_depths[2][2]; - }; - - enum bfqq_state_flags { - -From 402e5f6b59662d290ab2b3c10b0016207a63ad21 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Thu, 21 Dec 2017 15:51:39 +0100 -Subject: [PATCH 11/23] bfq-sq, bfq-mq: limit sectors served with interactive - weight raising - -To maximise responsiveness, BFQ raises the weight, and performs device -idling, for bfq_queues associated with processes deemed as -interactive. In particular, weight raising has a maximum duration, -equal to the time needed to start a large application. If a -weight-raised process goes on doing I/O beyond this maximum duration, -it loses weight-raising. - -This mechanism is evidently vulnerable to the following false -positives: I/O-bound applications that will go on doing I/O for much -longer than the duration of weight-raising. These applications have -basically no benefit from being weight-raised at the beginning of -their I/O. On the opposite end, while being weight-raised, these -applications -a) unjustly steal throughput to applications that may truly need -low latency; -b) make BFQ uselessly perform device idling; device idling results -in loss of device throughput with most flash-based storage, and may -increase latencies when used purposelessly. - -This commit adds a countermeasure to reduce both the above -problems. To introduce this countermeasure, we provide the following -extra piece of information (full details in the comments added by this -commit). During the start-up of the large application used as a -reference to set the duration of weight-raising, involved processes -transfer at most ~110K sectors each. 
Accordingly, a process initially -deemed as interactive has no right to be weight-raised any longer, -once transferred 110K sectors or more. - -Basing on this consideration, this commit early-ends weight-raising -for a bfq_queue if the latter happens to have received an amount of -service at least equal to 110K sectors (actually, a little bit more, -to keep a safety margin). I/O-bound applications that reach a high -throughput, such as file copy, get to this threshold much before the -allowed weight-raising period finishes. Thus this early ending of -weight-raising reduces the amount of time during which these -applications cause the problems described above. - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 84 ++++++++++++++++++++++++++++++++++++++++++++------ - block/bfq-mq.h | 5 +++ - block/bfq-sched.c | 3 ++ - block/bfq-sq-iosched.c | 84 ++++++++++++++++++++++++++++++++++++++++++++------ - block/bfq.h | 5 +++ - 5 files changed, 163 insertions(+), 18 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index f384f5566672..63fdd16dec3c 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -162,15 +162,17 @@ static struct kmem_cache *bfq_pool; - * interactive applications automatically, using the following formula: - * duration = (R / r) * T, where r is the peak rate of the device, and - * R and T are two reference parameters. -- * In particular, R is the peak rate of the reference device (see below), -- * and T is a reference time: given the systems that are likely to be -- * installed on the reference device according to its speed class, T is -- * about the maximum time needed, under BFQ and while reading two files in -- * parallel, to load typical large applications on these systems. -- * In practice, the slower/faster the device at hand is, the more/less it -- * takes to load applications with respect to the reference device. 
-- * Accordingly, the longer/shorter BFQ grants weight raising to interactive -- * applications. -+ * In particular, R is the peak rate of the reference device (see -+ * below), and T is a reference time: given the systems that are -+ * likely to be installed on the reference device according to its -+ * speed class, T is about the maximum time needed, under BFQ and -+ * while reading two files in parallel, to load typical large -+ * applications on these systems (see the comments on -+ * max_service_from_wr below, for more details on how T is obtained). -+ * In practice, the slower/faster the device at hand is, the more/less -+ * it takes to load applications with respect to the reference device. -+ * Accordingly, the longer/shorter BFQ grants weight raising to -+ * interactive applications. - * - * BFQ uses four different reference pairs (R, T), depending on: - * . whether the device is rotational or non-rotational; -@@ -207,6 +209,60 @@ static int T_slow[2]; - static int T_fast[2]; - static int device_speed_thresh[2]; - -+/* -+ * BFQ uses the above-detailed, time-based weight-raising mechanism to -+ * privilege interactive tasks. This mechanism is vulnerable to the -+ * following false positives: I/O-bound applications that will go on -+ * doing I/O for much longer than the duration of weight -+ * raising. These applications have basically no benefit from being -+ * weight-raised at the beginning of their I/O. On the opposite end, -+ * while being weight-raised, these applications -+ * a) unjustly steal throughput to applications that may actually need -+ * low latency; -+ * b) make BFQ uselessly perform device idling; device idling results -+ * in loss of device throughput with most flash-based storage, and may -+ * increase latencies when used purposelessly. -+ * -+ * BFQ tries to reduce these problems, by adopting the following -+ * countermeasure. 
To introduce this countermeasure, we need first to -+ * finish explaining how the duration of weight-raising for -+ * interactive tasks is computed. -+ * -+ * For a bfq_queue deemed as interactive, the duration of weight -+ * raising is dynamically adjusted, as a function of the estimated -+ * peak rate of the device, so as to be equal to the time needed to -+ * execute the 'largest' interactive task we benchmarked so far. By -+ * largest task, we mean the task for which each involved process has -+ * to do more I/O than for any of the other tasks we benchmarked. This -+ * reference interactive task is the start-up of LibreOffice Writer, -+ * and in this task each process/bfq_queue needs to have at most ~110K -+ * sectors transferred. -+ * -+ * This last piece of information enables BFQ to reduce the actual -+ * duration of weight-raising for at least one class of I/O-bound -+ * applications: those doing sequential or quasi-sequential I/O. An -+ * example is file copy. In fact, once started, the main I/O-bound -+ * processes of these applications usually consume the above 110K -+ * sectors in much less time than the processes of an application that -+ * is starting, because these I/O-bound processes will greedily devote -+ * almost all their CPU cycles only to their target, -+ * throughput-friendly I/O operations. This is even more true if BFQ -+ * happens to be underestimating the device peak rate, and thus -+ * overestimating the duration of weight raising. But, according to -+ * our measurements, once transferred 110K sectors, these processes -+ * have no right to be weight-raised any longer. -+ * -+ * Basing on the last consideration, BFQ ends weight-raising for a -+ * bfq_queue if the latter happens to have received an amount of -+ * service at least equal to the following constant. The constant is -+ * set to slightly more than 110K, to have a minimum safety margin. 
-+ * -+ * This early ending of weight-raising reduces the amount of time -+ * during which interactive false positives cause the two problems -+ * described at the beginning of these comments. -+ */ -+static const unsigned long max_service_from_wr = 120000; -+ - #define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ - { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) - -@@ -1361,6 +1417,7 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, - if (old_wr_coeff == 1 && wr_or_deserves_wr) { - /* start a weight-raising period */ - if (interactive) { -+ bfqq->service_from_wr = 0; - bfqq->wr_coeff = bfqd->bfq_wr_coeff; - bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); - } else { -@@ -3980,6 +4037,15 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) - "back to interactive wr"); - } - } -+ if (bfqq->wr_coeff > 1 && -+ bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time && -+ bfqq->service_from_wr > max_service_from_wr) { -+ /* see comments on max_service_from_wr */ -+ bfq_bfqq_end_wr(bfqq); -+ bfq_log_bfqq(bfqd, bfqq, -+ "[%s] too much service", -+ __func__); -+ } - } - /* - * To improve latency (for this or other queues), immediately -diff --git a/block/bfq-mq.h b/block/bfq-mq.h -index 458099ee0308..9a5ce1168ff5 100644 ---- a/block/bfq-mq.h -+++ b/block/bfq-mq.h -@@ -331,6 +331,11 @@ struct bfq_queue { - * last transition from idle to backlogged. - */ - unsigned long service_from_backlogged; -+ /* -+ * Cumulative service received from the @bfq_queue since its -+ * last transition to weight-raised state. 
-+ */ -+ unsigned long service_from_wr; - /* - * Value of wr start time when switching to soft rt - */ -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index 9d261dd428e4..4e6c5232e2fb 100644 ---- a/block/bfq-sched.c -+++ b/block/bfq-sched.c -@@ -942,6 +942,9 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) - if (!bfqq->service_from_backlogged) - bfqq->first_IO_time = jiffies; - -+ if (bfqq->wr_coeff > 1) -+ bfqq->service_from_wr += served; -+ - bfqq->service_from_backlogged += served; - for_each_entity(entity) { - st = bfq_entity_service_tree(entity); -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index eff4c4edf5a0..486493aafaf8 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -156,15 +156,17 @@ static struct kmem_cache *bfq_pool; - * interactive applications automatically, using the following formula: - * duration = (R / r) * T, where r is the peak rate of the device, and - * R and T are two reference parameters. -- * In particular, R is the peak rate of the reference device (see below), -- * and T is a reference time: given the systems that are likely to be -- * installed on the reference device according to its speed class, T is -- * about the maximum time needed, under BFQ and while reading two files in -- * parallel, to load typical large applications on these systems. -- * In practice, the slower/faster the device at hand is, the more/less it -- * takes to load applications with respect to the reference device. -- * Accordingly, the longer/shorter BFQ grants weight raising to interactive -- * applications. 
-+ * In particular, R is the peak rate of the reference device (see -+ * below), and T is a reference time: given the systems that are -+ * likely to be installed on the reference device according to its -+ * speed class, T is about the maximum time needed, under BFQ and -+ * while reading two files in parallel, to load typical large -+ * applications on these systems (see the comments on -+ * max_service_from_wr below, for more details on how T is obtained). -+ * In practice, the slower/faster the device at hand is, the more/less -+ * it takes to load applications with respect to the reference device. -+ * Accordingly, the longer/shorter BFQ grants weight raising to -+ * interactive applications. - * - * BFQ uses four different reference pairs (R, T), depending on: - * . whether the device is rotational or non-rotational; -@@ -201,6 +203,60 @@ static int T_slow[2]; - static int T_fast[2]; - static int device_speed_thresh[2]; - -+/* -+ * BFQ uses the above-detailed, time-based weight-raising mechanism to -+ * privilege interactive tasks. This mechanism is vulnerable to the -+ * following false positives: I/O-bound applications that will go on -+ * doing I/O for much longer than the duration of weight -+ * raising. These applications have basically no benefit from being -+ * weight-raised at the beginning of their I/O. On the opposite end, -+ * while being weight-raised, these applications -+ * a) unjustly steal throughput to applications that may actually need -+ * low latency; -+ * b) make BFQ uselessly perform device idling; device idling results -+ * in loss of device throughput with most flash-based storage, and may -+ * increase latencies when used purposelessly. -+ * -+ * BFQ tries to reduce these problems, by adopting the following -+ * countermeasure. To introduce this countermeasure, we need first to -+ * finish explaining how the duration of weight-raising for -+ * interactive tasks is computed. 
-+ * -+ * For a bfq_queue deemed as interactive, the duration of weight -+ * raising is dynamically adjusted, as a function of the estimated -+ * peak rate of the device, so as to be equal to the time needed to -+ * execute the 'largest' interactive task we benchmarked so far. By -+ * largest task, we mean the task for which each involved process has -+ * to do more I/O than for any of the other tasks we benchmarked. This -+ * reference interactive task is the start-up of LibreOffice Writer, -+ * and in this task each process/bfq_queue needs to have at most ~110K -+ * sectors transfered. -+ * -+ * This last piece of information enables BFQ to reduce the actual -+ * duration of weight-raising for at least one class of I/O-bound -+ * applications: those doing sequential or quasi-sequential I/O. An -+ * example is file copy. In fact, once started, the main I/O-bound -+ * processes of these applications usually consume the above 110K -+ * sectors in much less time than the processes of an application that -+ * is starting, because these I/O-bound processes will greedily devote -+ * almost all their CPU cycles only to their target, -+ * throughput-friendly I/O operations. This is even more true if BFQ -+ * happens to be underestimating the device peak rate, and thus -+ * overestimating the duration of weight raising. But, according to -+ * our measurements, once transferred 110K sectors, these processes -+ * have no right to be weight-raised any longer. -+ * -+ * Basing on the last consideration, BFQ ends weight-raising for a -+ * bfq_queue if the latter happens to have received an amount of -+ * service at least equal to the following constant. The constant is -+ * set to slightly more than 110K, to have a minimum safety margin. -+ * -+ * This early ending of weight-raising reduces the amount of time -+ * during which interactive false positives cause the two problems -+ * described at the beginning of these comments. 
-+ */ -+static const unsigned long max_service_from_wr = 120000; -+ - #define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ - { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) - -@@ -1246,6 +1302,7 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, - if (old_wr_coeff == 1 && wr_or_deserves_wr) { - /* start a weight-raising period */ - if (interactive) { -+ bfqq->service_from_wr = 0; - bfqq->wr_coeff = bfqd->bfq_wr_coeff; - bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); - } else { -@@ -3794,6 +3851,15 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) - "back to interactive wr"); - } - } -+ if (bfqq->wr_coeff > 1 && -+ bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time && -+ bfqq->service_from_wr > max_service_from_wr) { -+ /* see comments on max_service_from_wr */ -+ bfq_bfqq_end_wr(bfqq); -+ bfq_log_bfqq(bfqd, bfqq, -+ "[%s] too much service", -+ __func__); -+ } - } - /* - * To improve latency (for this or other queues), immediately -diff --git a/block/bfq.h b/block/bfq.h -index 59539adc00a5..0cd7a3f251a7 100644 ---- a/block/bfq.h -+++ b/block/bfq.h -@@ -323,6 +323,11 @@ struct bfq_queue { - * last transition from idle to backlogged. - */ - unsigned long service_from_backlogged; -+ /* -+ * Cumulative service received from the @bfq_queue since its -+ * last transition to weight-raised state. -+ */ -+ unsigned long service_from_wr; - /* - * Value of wr start time when switching to soft rt - */ - -From 59efebb94b2f9bac653faf62dadb45b83bd27fa7 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Thu, 4 Jan 2018 16:29:58 +0100 -Subject: [PATCH 12/23] bfq-sq, bfq-mq: put async queues for root bfq groups - too -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -For each pair [device for which bfq is selected as I/O scheduler, -group in blkio/io], bfq maintains a corresponding bfq group. 
Each such -bfq group contains a set of async queues, with each async queue -created on demand, i.e., when some I/O request arrives for it. On -creation, an async queue gets an extra reference, to make sure that -the queue is not freed as long as its bfq group exists. Accordingly, -to allow the queue to be freed after the group exited, this extra -reference must be released on group exit. - -The above holds also for a bfq root group, i.e., for the bfq group -corresponding to the root blkio/io root for a given device. Yet, by -mistake, the references to the existing async queues of a root group -are not released when the latter exits. This causes a memory leak when -the instance of bfq for a given device exits. In a similar vein, -bfqg_stats_xfer_dead is not executed for a root group. - -This commit fixes bfq_pd_offline so that the latter executes the above -missing operations for a root group too. - -Reported-by: Holger Hoffstätte <holger@applied-asynchrony.com> -Reported-by: Guoqing Jiang <gqjiang@suse.com> -Signed-off-by: Davide Ferrari <davideferrari8@gmail.com> -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-cgroup-included.c | 8 +++++--- - 1 file changed, 5 insertions(+), 3 deletions(-) - -diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c -index 562b0ce581a7..45fefb2e2d57 100644 ---- a/block/bfq-cgroup-included.c -+++ b/block/bfq-cgroup-included.c -@@ -885,13 +885,13 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) - - entity = bfqg->my_entity; - -- if (!entity) /* root group */ -- return; -- - #ifdef BFQ_MQ - spin_lock_irqsave(&bfqd->lock, flags); - #endif - -+ if (!entity) /* root group */ -+ goto put_async_queues; -+ - /* - * Empty all service_trees belonging to this group before - * deactivating the group itself.
-@@ -926,6 +926,8 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) - BUG_ON(bfqg->sched_data.in_service_entity); - - __bfq_deactivate_entity(entity, false); -+ -+put_async_queues: - bfq_put_async_queues(bfqd, bfqg); - - #ifdef BFQ_MQ - -From 2dfbaaaf95054e2da3ededc0deb1ba5a4f589e53 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Mon, 8 Jan 2018 19:38:45 +0100 -Subject: [PATCH 13/23] bfq-sq, bfq-mq: release oom-queue ref to root group on - exit - -On scheduler init, a reference to the root group, and a reference to -its corresponding blkg are taken for the oom queue. Yet these -references are not released on scheduler exit, which prevents these -objects from being freed. This commit adds the missing reference -releases. - -Reported-by: Davide Ferrari <davideferrari8@gmail.com> -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 3 +++ - block/bfq-sq-iosched.c | 3 +++ - 2 files changed, 6 insertions(+) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 63fdd16dec3c..b82c52fabf91 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -5507,6 +5507,9 @@ static void bfq_exit_queue(struct elevator_queue *e) - - BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); - -+ /* release oom-queue reference to root group */ -+ bfqg_and_blkg_put(bfqd->root_group); -+ - #ifdef BFQ_GROUP_IOSCHED_ENABLED - blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq); - #else -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index 486493aafaf8..851af055664d 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -5052,6 +5052,9 @@ static void bfq_exit_queue(struct elevator_queue *e) - - BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); - -+ /* release oom-queue reference to root group */ -+ bfqg_put(bfqd->root_group); -+ - #ifdef BFQ_GROUP_IOSCHED_ENABLED - blkcg_deactivate_policy(q, &blkcg_policy_bfq); - #else - -From
13efe00c8292d78d223e1090a7f36426e360eb38 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Mon, 8 Jan 2018 19:40:38 +0100 -Subject: [PATCH 14/23] block, bfq-sq, bfq-mq: trace get and put of bfq groups - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-cgroup-included.c | 15 +++++++++++++++ - block/bfq-mq-iosched.c | 3 ++- - 2 files changed, 17 insertions(+), 1 deletion(-) - -diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c -index 45fefb2e2d57..f94743fb2e7d 100644 ---- a/block/bfq-cgroup-included.c -+++ b/block/bfq-cgroup-included.c -@@ -267,6 +267,8 @@ static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) - - static void bfqg_get(struct bfq_group *bfqg) - { -+ trace_printk("bfqg %p\n", bfqg); -+ - #ifdef BFQ_MQ - bfqg->ref++; - #else -@@ -280,6 +282,9 @@ static void bfqg_put(struct bfq_group *bfqg) - bfqg->ref--; - - BUG_ON(bfqg->ref < 0); -+ trace_printk("putting bfqg %p %s\n", bfqg, -+ bfqg->ref == 0 ? "and freeing it" : ""); -+ - if (bfqg->ref == 0) - kfree(bfqg); - #else -@@ -293,6 +298,7 @@ static void bfqg_and_blkg_get(struct bfq_group *bfqg) - /* see comments in bfq_bic_update_cgroup for why refcounting bfqg */ - bfqg_get(bfqg); - -+ trace_printk("getting blkg for bfqg %p\n", bfqg); - blkg_get(bfqg_to_blkg(bfqg)); - } - -@@ -300,6 +306,7 @@ static void bfqg_and_blkg_put(struct bfq_group *bfqg) - { - bfqg_put(bfqg); - -+ trace_printk("putting blkg for bfqg %p\n", bfqg); - blkg_put(bfqg_to_blkg(bfqg)); - } - #endif -@@ -382,6 +389,8 @@ static void bfq_init_entity(struct bfq_entity *entity, - * Make sure that bfqg and its associated blkg do not - * disappear before entity. 
- */ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "[%s] getting bfqg %p and blkg\n", __func__, bfqg); -+ - bfqg_and_blkg_get(bfqg); - #else - bfqg_get(bfqg); -@@ -475,6 +484,7 @@ static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) - kfree(bfqg); - return NULL; - } -+ trace_printk("bfqg %p\n", bfqg); - - #ifdef BFQ_MQ - /* see comments in bfq_bic_update_cgroup for why refcounting */ -@@ -513,6 +523,7 @@ static void bfq_pd_init(struct blkg_policy_data *pd) - static void bfq_pd_free(struct blkg_policy_data *pd) - { - struct bfq_group *bfqg = pd_to_bfqg(pd); -+ trace_printk("bfqg %p\n", bfqg); - - bfqg_stats_exit(&bfqg->stats); - #ifdef BFQ_MQ -@@ -650,6 +661,8 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); - } - #ifdef BFQ_MQ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "[%s] putting blkg and bfqg %p\n", __func__, bfqg); -+ - bfqg_and_blkg_put(bfqq_group(bfqq)); - #else - bfqg_put(bfqq_group(bfqq)); -@@ -658,6 +671,8 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, - entity->parent = bfqg->my_entity; - entity->sched_data = &bfqg->sched_data; - #ifdef BFQ_MQ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "[%s] getting blkg and bfqg %p\n", __func__, bfqg); -+ - /* pin down bfqg and its associated blkg */ - bfqg_and_blkg_get(bfqg); - #else -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index b82c52fabf91..d5b7a6b985d7 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -4385,10 +4385,11 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - if (bfqq->bfqd) - bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); - -- kmem_cache_free(bfq_pool, bfqq); - #ifdef BFQ_GROUP_IOSCHED_ENABLED -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "[%s] putting blkg and bfqg %p\n", __func__, bfqg); - bfqg_and_blkg_put(bfqg); - #endif -+ kmem_cache_free(bfq_pool, bfqq); - } - - static void bfq_put_cooperator(struct bfq_queue *bfqq) - -From 
816b77fba966171974eb5ee25d81bc4e19eaf1b4 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Wed, 10 Jan 2018 09:08:22 +0100 -Subject: [PATCH 15/23] bfq-sq, bfq-mq: compile group put for oom queue only if - BFQ_GROUP_IOSCHED is set - -Commit ("bfq-sq, bfq-mq: release oom-queue ref to root group on exit") -added a missing put of the root bfq group for the oom queue. That put -has to be, and can be, performed only if CONFIG_BFQ_GROUP_IOSCHED is -defined: the function doing the put is even not defined at all if -CONFIG_BFQ_GROUP_IOSCHED is not defined. But that commit makes that -put be invoked regardless of whether CONFIG_BFQ_GROUP_IOSCHED is -defined. This commit fixes this mistake, by making that invocation be -compiled only if CONFIG_BFQ_GROUP_IOSCHED is actually defined. - -Fixes ("block, bfq: release oom-queue ref to root group on exit") -Reported-by: Jan Alexander Steffens <jan.steffens@gmail.com> -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 2 +- - block/bfq-sq-iosched.c | 2 +- - 2 files changed, 2 insertions(+), 2 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index d5b7a6b985d7..2581fe0f6f2f 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -5508,10 +5508,10 @@ static void bfq_exit_queue(struct elevator_queue *e) - - BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); - -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - /* release oom-queue reference to root group */ - bfqg_and_blkg_put(bfqd->root_group); - --#ifdef BFQ_GROUP_IOSCHED_ENABLED - blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq); - #else - spin_lock_irq(&bfqd->lock); -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index 851af055664d..c4df156b1fb4 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -5052,10 +5052,10 @@ static void bfq_exit_queue(struct elevator_queue *e) - - BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); - -+#ifdef 
BFQ_GROUP_IOSCHED_ENABLED - /* release oom-queue reference to root group */ - bfqg_put(bfqd->root_group); - --#ifdef BFQ_GROUP_IOSCHED_ENABLED - blkcg_deactivate_policy(q, &blkcg_policy_bfq); - #else - bfq_put_async_queues(bfqd, bfqd->root_group); - -From 643a89c659172b2c9ae16adfe03af4e3e88e1326 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Sat, 13 Jan 2018 18:48:41 +0100 -Subject: [PATCH 16/23] block, bfq-sq, bfq-mq: remove trace_printks - -Commit ("block, bfq-sq, bfq-mq: trace get and put of bfq groups") -unwisely added some invocations of the function trace_printk, which -is inappropriate in production kernels. This commit removes those -invocations. - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-cgroup-included.c | 10 ---------- - 1 file changed, 10 deletions(-) - -diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c -index f94743fb2e7d..a4f8a03edfc9 100644 ---- a/block/bfq-cgroup-included.c -+++ b/block/bfq-cgroup-included.c -@@ -267,8 +267,6 @@ static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) - - static void bfqg_get(struct bfq_group *bfqg) - { -- trace_printk("bfqg %p\n", bfqg); -- - #ifdef BFQ_MQ - bfqg->ref++; - #else -@@ -282,9 +280,6 @@ static void bfqg_put(struct bfq_group *bfqg) - bfqg->ref--; - - BUG_ON(bfqg->ref < 0); -- trace_printk("putting bfqg %p %s\n", bfqg, -- bfqg->ref == 0 ? 
"and freeing it" : ""); -- - if (bfqg->ref == 0) - kfree(bfqg); - #else -@@ -298,7 +293,6 @@ static void bfqg_and_blkg_get(struct bfq_group *bfqg) - /* see comments in bfq_bic_update_cgroup for why refcounting bfqg */ - bfqg_get(bfqg); - -- trace_printk("getting blkg for bfqg %p\n", bfqg); - blkg_get(bfqg_to_blkg(bfqg)); - } - -@@ -306,7 +300,6 @@ static void bfqg_and_blkg_put(struct bfq_group *bfqg) - { - bfqg_put(bfqg); - -- trace_printk("putting blkg for bfqg %p\n", bfqg); - blkg_put(bfqg_to_blkg(bfqg)); - } - #endif -@@ -484,8 +477,6 @@ static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) - kfree(bfqg); - return NULL; - } -- trace_printk("bfqg %p\n", bfqg); -- - #ifdef BFQ_MQ - /* see comments in bfq_bic_update_cgroup for why refcounting */ - bfqg_get(bfqg); -@@ -523,7 +514,6 @@ static void bfq_pd_init(struct blkg_policy_data *pd) - static void bfq_pd_free(struct blkg_policy_data *pd) - { - struct bfq_group *bfqg = pd_to_bfqg(pd); -- trace_printk("bfqg %p\n", bfqg); - - bfqg_stats_exit(&bfqg->stats); - #ifdef BFQ_MQ - -From ce050275e24fecec800f346c09d9494563e9fc8a Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Mon, 15 Jan 2018 15:07:05 +0100 -Subject: [PATCH 17/23] block, bfq-mq: add requeue-request hook - -Commit 'a6a252e64914 ("blk-mq-sched: decide how to handle flush rq via -RQF_FLUSH_SEQ")' makes all non-flush re-prepared requests for a device -be re-inserted into the active I/O scheduler for that device. As a -consequence, I/O schedulers may get the same request inserted again, -even several times, without a finish_request invoked on that request -before each re-insertion. - -This fact is the cause of the failure reported in [1]. For an I/O -scheduler, every re-insertion of the same re-prepared request is -equivalent to the insertion of a new request. For schedulers like -mq-deadline or kyber, this fact causes no harm. 
In contrast, it -confuses a stateful scheduler like BFQ, which keeps state for an I/O -request, until the finish_request hook is invoked on the request. In -particular, BFQ may get stuck, waiting forever for the number of -request dispatches, of the same request, to be balanced by an equal -number of request completions (while there will be one completion for -that request). In this state, BFQ may refuse to serve I/O requests -from other bfq_queues. The hang reported in [1] then follows. - -However, the above re-prepared requests undergo a requeue, thus the -requeue_request hook of the active elevator is invoked for these -requests, if set. This commit then addresses the above issue by -properly implementing the hook requeue_request in BFQ. - -[1] https://marc.info/?l=linux-block&m=151211117608676 - -Reported-by: Ivan Kozik <ivan@ludios.org> -Reported-by: Alban Browaeys <alban.browaeys@gmail.com> -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> -Signed-off-by: Serena Ziviani <ziviani.serena@gmail.com> ---- - block/bfq-mq-iosched.c | 90 ++++++++++++++++++++++++++++++++++++++++---------- - 1 file changed, 73 insertions(+), 17 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 2581fe0f6f2f..bb7ccc2f1165 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -4162,9 +4162,9 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - * TESTING: reset DISP_LIST flag, because: 1) - * this rq this request has passed through - * bfq_prepare_request, 2) then it will have -- * bfq_finish_request invoked on it, and 3) in -- * bfq_finish_request we use this flag to check -- * that bfq_finish_request is not invoked on -+ * bfq_finish_requeue_request invoked on it, and 3) in -+ * bfq_finish_requeue_request we use this flag to check -+ * that bfq_finish_requeue_request is not invoked on - * requests for which bfq_prepare_request has - * been invoked. 
- */ -@@ -4173,8 +4173,8 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - } - - /* -- * We exploit the bfq_finish_request hook to decrement -- * rq_in_driver, but bfq_finish_request will not be -+ * We exploit the bfq_finish_requeue_request hook to decrement -+ * rq_in_driver, but bfq_finish_requeue_request will not be - * invoked on this request. So, to avoid unbalance, - * just start this request, without incrementing - * rq_in_driver. As a negative consequence, -@@ -4183,10 +4183,10 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - * bfq_schedule_dispatch to be invoked uselessly. - * - * As for implementing an exact solution, the -- * bfq_finish_request hook, if defined, is probably -+ * bfq_finish_requeue_request hook, if defined, is probably - * invoked also on this request. So, by exploiting - * this hook, we could 1) increment rq_in_driver here, -- * and 2) decrement it in bfq_finish_request. Such a -+ * and 2) decrement it in bfq_finish_requeue_request. Such a - * solution would let the value of the counter be - * always accurate, but it would entail using an extra - * interface function. This cost seems higher than the -@@ -4878,6 +4878,8 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) - return idle_timer_disabled; - } - -+static void bfq_prepare_request(struct request *rq, struct bio *bio); -+ - static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - bool at_head) - { -@@ -4919,6 +4921,20 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - BUG_ON(!(rq->rq_flags & RQF_GOT)); - rq->rq_flags &= ~RQF_GOT; - -+ if (!bfqq) { -+ /* -+ * This should never happen. Most likely rq is -+ * a requeued regular request, being -+ * re-inserted without being first -+ * re-prepared. Do a prepare, to avoid -+ * failure. 
-+ */ -+ pr_warn("Regular request associated with no queue"); -+ WARN_ON(1); -+ bfq_prepare_request(rq, rq->bio); -+ bfqq = RQ_BFQQ(rq); -+ } -+ - #if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) - idle_timer_disabled = __bfq_insert_request(bfqd, rq); - /* -@@ -5110,7 +5126,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) - } - } - --static void bfq_finish_request_body(struct bfq_queue *bfqq) -+static void bfq_finish_requeue_request_body(struct bfq_queue *bfqq) - { - bfq_log_bfqq(bfqq->bfqd, bfqq, - "put_request_body: allocated %d", bfqq->allocated); -@@ -5120,7 +5136,13 @@ static void bfq_finish_request_body(struct bfq_queue *bfqq) - bfq_put_queue(bfqq); - } - --static void bfq_finish_request(struct request *rq) -+/* -+ * Handle either a requeue or a finish for rq. The things to do are -+ * the same in both cases: all references to rq are to be dropped. In -+ * particular, rq is considered completed from the point of view of -+ * the scheduler. -+ */ -+static void bfq_finish_requeue_request(struct request *rq) - { - struct bfq_queue *bfqq; - struct bfq_data *bfqd; -@@ -5128,11 +5150,27 @@ static void bfq_finish_request(struct request *rq) - - BUG_ON(!rq); - -- if (!rq->elv.icq) -+ bfqq = RQ_BFQQ(rq); -+ -+ /* -+ * Requeue and finish hooks are invoked in blk-mq without -+ * checking whether the involved request is actually still -+ * referenced in the scheduler. To handle this fact, the -+ * following two checks make this function exit in case of -+ * spurious invocations, for which there is nothing to do. -+ * -+ * First, check whether rq has nothing to do with an elevator. -+ */ -+ if (unlikely(!(rq->rq_flags & RQF_ELVPRIV))) - return; - -- bfqq = RQ_BFQQ(rq); -- BUG_ON(!bfqq); -+ /* -+ * rq either is not associated with any icq, or is an already -+ * requeued request that has not (yet) been re-inserted into -+ * a bfq_queue. 
-+ */ -+ if (!rq->elv.icq || !bfqq) -+ return; - - bic = RQ_BIC(rq); - BUG_ON(!bic); -@@ -5145,7 +5183,6 @@ static void bfq_finish_request(struct request *rq) - BUG(); - } - BUG_ON(rq->rq_flags & RQF_QUEUED); -- BUG_ON(!(rq->rq_flags & RQF_ELVPRIV)); - - bfq_log_bfqq(bfqd, bfqq, - "putting rq %p with %u sects left, STARTED %d", -@@ -5166,13 +5203,14 @@ static void bfq_finish_request(struct request *rq) - spin_lock_irqsave(&bfqd->lock, flags); - - bfq_completed_request(bfqq, bfqd); -- bfq_finish_request_body(bfqq); -+ bfq_finish_requeue_request_body(bfqq); - - spin_unlock_irqrestore(&bfqd->lock, flags); - } else { - /* - * Request rq may be still/already in the scheduler, -- * in which case we need to remove it. And we cannot -+ * in which case we need to remove it (this should -+ * never happen in case of requeue). And we cannot - * defer such a check and removal, to avoid - * inconsistencies in the time interval from the end - * of this function to the start of the deferred work. -@@ -5189,9 +5227,26 @@ static void bfq_finish_request(struct request *rq) - bfqg_stats_update_io_remove(bfqq_group(bfqq), - rq->cmd_flags); - } -- bfq_finish_request_body(bfqq); -+ bfq_finish_requeue_request_body(bfqq); - } - -+ /* -+ * Reset private fields. In case of a requeue, this allows -+ * this function to correctly do nothing if it is spuriously -+ * invoked again on this same request (see the check at the -+ * beginning of the function). Probably, a better general -+ * design would be to prevent blk-mq from invoking the requeue -+ * or finish hooks of an elevator, for a request that is not -+ * referred by that elevator. -+ * -+ * Resetting the following fields would break the -+ * request-insertion logic if rq is re-inserted into a bfq -+ * internal queue, without a re-preparation. 
Here we assume -+ * that re-insertions of requeued requests, without -+ * re-preparation, can happen only for pass_through or at_head -+ * requests (which are not re-inserted into bfq internal -+ * queues). -+ */ - rq->elv.priv[0] = NULL; - rq->elv.priv[1] = NULL; - } -@@ -5960,7 +6015,8 @@ static struct elevator_type iosched_bfq_mq = { - .ops.mq = { - .limit_depth = bfq_limit_depth, - .prepare_request = bfq_prepare_request, -- .finish_request = bfq_finish_request, -+ .requeue_request = bfq_finish_requeue_request, -+ .finish_request = bfq_finish_requeue_request, - .exit_icq = bfq_exit_icq, - .insert_requests = bfq_insert_requests, - .dispatch_request = bfq_dispatch_request, - -From 3e4f292191cc62b3844316b9741534c3f1b36f0a Mon Sep 17 00:00:00 2001 -From: Davide Paganelli <paga.david@gmail.com> -Date: Thu, 8 Feb 2018 12:19:24 +0100 -Subject: [PATCH 18/23] block, bfq-mq, bfq-sq: make log functions print names - of calling functions - -Add the macro __func__ as a parameter to the invocations of the functions -pr_crit, blk_add_trace_msg and blk_add_cgroup_trace_msg in bfq_log* -functions, in order to include automatically in the log messages -the names of the functions that call the log functions. -The programmer can then avoid doing it. 
- -Signed-off-by: Davide Paganelli <paga.david@gmail.com> -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-cgroup-included.c | 9 +-- - block/bfq-mq-iosched.c | 167 ++++++++++++++++++++++---------------------- - block/bfq-mq.h | 33 ++++----- - block/bfq-sched.c | 54 +++++++------- - block/bfq-sq-iosched.c | 134 +++++++++++++++++------------------ - block/bfq.h | 33 ++++----- - 6 files changed, 214 insertions(+), 216 deletions(-) - -diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c -index a4f8a03edfc9..613f154e9da5 100644 ---- a/block/bfq-cgroup-included.c -+++ b/block/bfq-cgroup-included.c -@@ -382,7 +382,8 @@ static void bfq_init_entity(struct bfq_entity *entity, - * Make sure that bfqg and its associated blkg do not - * disappear before entity. - */ -- bfq_log_bfqq(bfqq->bfqd, bfqq, "[%s] getting bfqg %p and blkg\n", __func__, bfqg); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "getting bfqg %p and blkg\n", -+ bfqg); - - bfqg_and_blkg_get(bfqg); - #else -@@ -651,7 +652,7 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); - } - #ifdef BFQ_MQ -- bfq_log_bfqq(bfqq->bfqd, bfqq, "[%s] putting blkg and bfqg %p\n", __func__, bfqg); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "putting blkg and bfqg %p\n", bfqg); - - bfqg_and_blkg_put(bfqq_group(bfqq)); - #else -@@ -661,7 +662,7 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, - entity->parent = bfqg->my_entity; - entity->sched_data = &bfqg->sched_data; - #ifdef BFQ_MQ -- bfq_log_bfqq(bfqq->bfqd, bfqq, "[%s] getting blkg and bfqg %p\n", __func__, bfqg); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "getting blkg and bfqg %p\n", bfqg); - - /* pin down bfqg and its associated blkg */ - bfqg_and_blkg_get(bfqg); -@@ -721,7 +722,7 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, - if (entity->sched_data != &bfqg->sched_data) { - bic_set_bfqq(bic, NULL, 0); - 
bfq_log_bfqq(bfqd, async_bfqq, -- "bic_change_group: %p %d", -+ "%p %d", - async_bfqq, - async_bfqq->ref); - bfq_put_queue(async_bfqq); -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index bb7ccc2f1165..edc93b6af186 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -310,7 +310,7 @@ static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, - static void bfq_schedule_dispatch(struct bfq_data *bfqd) - { - if (bfqd->queued != 0) { -- bfq_log(bfqd, "schedule dispatch"); -+ bfq_log(bfqd, ""); - blk_mq_run_hw_queues(bfqd->queue, true); - } - } -@@ -489,8 +489,8 @@ static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) - data->shallow_depth = - bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)]; - -- bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u", -- __func__, bfqd->wr_busy_queues, op_is_sync(op), -+ bfq_log(bfqd, "wr_busy %d sync %d depth %u", -+ bfqd->wr_busy_queues, op_is_sync(op), - data->shallow_depth); - } - -@@ -528,7 +528,7 @@ bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, - if (rb_link) - *rb_link = p; - -- bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d", -+ bfq_log(bfqd, "%llu: returning %d", - (unsigned long long) sector, - bfqq ? 
bfqq->pid : 0); - -@@ -749,7 +749,7 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq, - if (rq == last || ktime_get_ns() < rq->fifo_time) - return NULL; - -- bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "returned %p", rq); - BUG_ON(RB_EMPTY_NODE(&rq->rb_node)); - return rq; - } -@@ -842,7 +842,7 @@ static void bfq_updated_next_req(struct bfq_data *bfqd, - bfq_serv_to_charge(next_rq, bfqq)); - if (entity->budget != new_budget) { - entity->budget = new_budget; -- bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", -+ bfq_log_bfqq(bfqd, bfqq, "new budget %lu", - new_budget); - bfq_requeue_bfqq(bfqd, bfqq, false); - } -@@ -915,8 +915,7 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, - BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); - - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "[%s] bic %p wr_coeff %d start_finish %lu max_time %lu", -- __func__, -+ "bic %p wr_coeff %d start_finish %lu max_time %lu", - bic, bfqq->wr_coeff, bfqq->last_wr_start_finish, - bfqq->wr_cur_max_time); - -@@ -929,11 +928,11 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, - bfq_wr_duration(bfqd))) { - switch_back_to_interactive_wr(bfqq, bfqd); - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "resume state: switching back to interactive"); -+ "switching back to interactive"); - } else { - bfqq->wr_coeff = 1; - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "resume state: switching off wr (%lu + %lu < %lu)", -+ "switching off wr (%lu + %lu < %lu)", - bfqq->last_wr_start_finish, bfqq->wr_cur_max_time, - jiffies); - } -@@ -985,7 +984,7 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) - /* Increment burst size to take into account also bfqq */ - bfqd->burst_size++; - -- bfq_log_bfqq(bfqd, bfqq, "add_to_burst %d", bfqd->burst_size); -+ bfq_log_bfqq(bfqd, bfqq, "%d", bfqd->burst_size); - - BUG_ON(bfqd->burst_size > bfqd->bfq_large_burst_thresh); - -@@ -998,7 +997,7 @@ static 
void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) - * other to consider this burst as large. - */ - bfqd->large_burst = true; -- bfq_log_bfqq(bfqd, bfqq, "add_to_burst: large burst started"); -+ bfq_log_bfqq(bfqd, bfqq, "large burst started"); - - /* - * We can now mark all queues in the burst list as -@@ -1170,7 +1169,7 @@ static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) - bfqd->large_burst = false; - bfq_reset_burst_list(bfqd, bfqq); - bfq_log_bfqq(bfqd, bfqq, -- "handle_burst: late activation or different group"); -+ "late activation or different group"); - goto end; - } - -@@ -1180,7 +1179,7 @@ static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) - * bfqq as belonging to this large burst immediately. - */ - if (bfqd->large_burst) { -- bfq_log_bfqq(bfqd, bfqq, "handle_burst: marked in burst"); -+ bfq_log_bfqq(bfqd, bfqq, "marked in burst"); - bfq_mark_bfqq_in_large_burst(bfqq); - goto end; - } -@@ -1686,7 +1685,7 @@ static void bfq_add_request(struct request *rq) - unsigned int old_wr_coeff = bfqq->wr_coeff; - bool interactive = false; - -- bfq_log_bfqq(bfqd, bfqq, "add_request: size %u %s", -+ bfq_log_bfqq(bfqd, bfqq, "size %u %s", - blk_rq_sectors(rq), rq_is_sync(rq) ? 
"S" : "A"); - - if (bfqq->wr_coeff > 1) /* queue is being weight-raised */ -@@ -1952,7 +1951,7 @@ static int bfq_request_merge(struct request_queue *q, struct request **req, - __rq = bfq_find_rq_fmerge(bfqd, bio, q); - if (__rq && elv_bio_merge_ok(__rq, bio)) { - *req = __rq; -- bfq_log(bfqd, "request_merge: req %p", __rq); -+ bfq_log(bfqd, "req %p", __rq); - - return ELEVATOR_FRONT_MERGE; - } -@@ -1989,7 +1988,7 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, - bfqq->next_rq = next_rq; - - bfq_log_bfqq(bfqd, bfqq, -- "request_merged: req %p prev %p next_rq %p bfqq %p", -+ "req %p prev %p next_rq %p bfqq %p", - req, prev, next_rq, bfqq); - - /* -@@ -2018,7 +2017,7 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, - goto end; - - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "requests_merged: rq %p next %p bfqq %p next_bfqq %p", -+ "rq %p next %p bfqq %p next_bfqq %p", - rq, next, bfqq, next_bfqq); - - spin_lock_irq(&bfqq->bfqd->lock); -@@ -2069,10 +2068,10 @@ static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) - */ - bfqq->entity.prio_changed = 1; - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "end_wr: wrais ending at %lu, rais_max_time %u", -+ "wrais ending at %lu, rais_max_time %u", - bfqq->last_wr_start_finish, - jiffies_to_msecs(bfqq->wr_cur_max_time)); -- bfq_log_bfqq(bfqq->bfqd, bfqq, "end_wr: wr_busy %d", -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "wr_busy %d", - bfqq->bfqd->wr_busy_queues); - } - -@@ -2245,8 +2244,8 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, - { - if (bfq_too_late_for_merging(new_bfqq)) { - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "[%s] too late for bfq%d to be merged", -- __func__, new_bfqq->pid); -+ "too late for bfq%d to be merged", -+ new_bfqq->pid); - return false; - } - -@@ -2395,8 +2394,7 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) - } - BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "[%s] bic %p wr_coeff %d start_finish %lu 
max_time %lu", -- __func__, -+ "bic %p wr_coeff %d start_finish %lu max_time %lu", - bic, bfqq->wr_coeff, bfqq->last_wr_start_finish, - bfqq->wr_cur_max_time); - } -@@ -2453,7 +2451,7 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, - - } - -- bfq_log_bfqq(bfqd, new_bfqq, "merge_bfqqs: wr_busy %d", -+ bfq_log_bfqq(bfqd, new_bfqq, "wr_busy %d", - bfqd->wr_busy_queues); - - /* -@@ -2554,7 +2552,7 @@ static void bfq_set_budget_timeout(struct bfq_data *bfqd, - bfqq->budget_timeout = jiffies + - bfqd->bfq_timeout * timeout_coeff; - -- bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", -+ bfq_log_bfqq(bfqd, bfqq, "%u", - jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff)); - } - -@@ -2620,10 +2618,10 @@ static void __bfq_set_in_service_queue(struct bfq_data *bfqd, - - bfq_set_budget_timeout(bfqd, bfqq); - bfq_log_bfqq(bfqd, bfqq, -- "set_in_service_queue, cur-budget = %d", -+ "cur-budget = %d", - bfqq->entity.budget); - } else -- bfq_log(bfqd, "set_in_service_queue: NULL"); -+ bfq_log(bfqd, "NULL"); - - bfqd->in_service_queue = bfqq; - } -@@ -2746,7 +2744,7 @@ static void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq - bfqd->peak_rate_samples = 0; /* full re-init on next disp. 
*/ - - bfq_log(bfqd, -- "reset_rate_computation at end, sample %u/%u tot_sects %llu", -+ "at end, sample %u/%u tot_sects %llu", - bfqd->peak_rate_samples, bfqd->sequential_samples, - bfqd->tot_sectors_dispatched); - } -@@ -2766,7 +2764,7 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) - if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES || - bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL) { - bfq_log(bfqd, -- "update_rate_reset: only resetting, delta_first %lluus samples %d", -+ "only resetting, delta_first %lluus samples %d", - bfqd->delta_from_first>>10, bfqd->peak_rate_samples); - goto reset_computation; - } -@@ -2790,7 +2788,7 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) - div_u64(bfqd->delta_from_first, NSEC_PER_USEC)); - - bfq_log(bfqd, --"update_rate_reset: tot_sects %llu delta_first %lluus rate %llu sects/s (%d)", -+"tot_sects %llu delta_first %lluus rate %llu sects/s (%d)", - bfqd->tot_sectors_dispatched, bfqd->delta_from_first>>10, - ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), - rate > 20<<BFQ_RATE_SHIFT); -@@ -2805,14 +2803,14 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) - rate <= bfqd->peak_rate) || - rate > 20<<BFQ_RATE_SHIFT) { - bfq_log(bfqd, -- "update_rate_reset: goto reset, samples %u/%u rate/peak %llu/%llu", -+ "goto reset, samples %u/%u rate/peak %llu/%llu", - bfqd->peak_rate_samples, bfqd->sequential_samples, - ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), - ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); - goto reset_computation; - } else { - bfq_log(bfqd, -- "update_rate_reset: do update, samples %u/%u rate/peak %llu/%llu", -+ "do update, samples %u/%u rate/peak %llu/%llu", - bfqd->peak_rate_samples, bfqd->sequential_samples, - ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), - ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); -@@ -2868,7 +2866,7 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) - rate /= 
divisor; /* smoothing constant alpha = 1/divisor */ - - bfq_log(bfqd, -- "update_rate_reset: divisor %d tmp_peak_rate %llu tmp_rate %u", -+ "divisor %d tmp_peak_rate %llu tmp_rate %u", - divisor, - ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT), - (u32)((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT)); -@@ -2922,7 +2920,7 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) - - if (bfqd->peak_rate_samples == 0) { /* first dispatch */ - bfq_log(bfqd, -- "update_peak_rate: goto reset, samples %d", -+ "goto reset, samples %d", - bfqd->peak_rate_samples) ; - bfq_reset_rate_computation(bfqd, rq); - goto update_last_values; /* will add one sample */ -@@ -2943,7 +2941,7 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) - if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC && - bfqd->rq_in_driver == 0) { - bfq_log(bfqd, --"update_peak_rate: jumping to updating&resetting delta_last %lluus samples %d", -+"jumping to updating&resetting delta_last %lluus samples %d", - (now_ns - bfqd->last_dispatch)>>10, - bfqd->peak_rate_samples) ; - goto update_rate_and_reset; -@@ -2969,7 +2967,7 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) - bfqd->delta_from_first = now_ns - bfqd->first_dispatch; - - bfq_log(bfqd, -- "update_peak_rate: added samples %u/%u tot_sects %llu delta_first %lluus", -+ "added samples %u/%u tot_sects %llu delta_first %lluus", - bfqd->peak_rate_samples, bfqd->sequential_samples, - bfqd->tot_sectors_dispatched, - bfqd->delta_from_first>>10); -@@ -2985,12 +2983,12 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) - bfqd->last_dispatch = now_ns; - - bfq_log(bfqd, -- "update_peak_rate: delta_first %lluus last_pos %llu peak_rate %llu", -+ "delta_first %lluus last_pos %llu peak_rate %llu", - (now_ns - bfqd->first_dispatch)>>10, - (unsigned long long) bfqd->last_position, - ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); - bfq_log(bfqd, -- 
"update_peak_rate: samples at end %d", bfqd->peak_rate_samples); -+ "samples at end %d", bfqd->peak_rate_samples); - } - - /* -@@ -3088,11 +3086,11 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, - */ - budget = 2 * min_budget; - -- bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d", -+ bfq_log_bfqq(bfqd, bfqq, "last budg %d, budg left %d", - bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); -- bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d", -+ bfq_log_bfqq(bfqd, bfqq, "last max_budg %d, min budg %d", - budget, bfq_min_budget(bfqd)); -- bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", -+ bfq_log_bfqq(bfqd, bfqq, "sync %d, seeky %d", - bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); - - if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) { -@@ -3294,7 +3292,7 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, - else /* charge at least one seek */ - *delta_ms = bfq_slice_idle / NSEC_PER_MSEC; - -- bfq_log(bfqd, "bfq_bfqq_is_slow: too short %u", delta_usecs); -+ bfq_log(bfqd, "too short %u", delta_usecs); - - return slow; - } -@@ -3317,11 +3315,11 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, - * peak rate. 
- */ - slow = bfqq->entity.service < bfqd->bfq_max_budget / 2; -- bfq_log(bfqd, "bfq_bfqq_is_slow: relative rate %d/%d", -+ bfq_log(bfqd, "relative rate %d/%d", - bfqq->entity.service, bfqd->bfq_max_budget); - } - -- bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow); -+ bfq_log_bfqq(bfqd, bfqq, "slow %d", slow); - - return slow; - } -@@ -3423,7 +3421,7 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, - struct bfq_queue *bfqq) - { - bfq_log_bfqq(bfqd, bfqq, --"softrt_next_start: service_blkg %lu soft_rate %u sects/sec interval %u", -+"service_blkg %lu soft_rate %u sects/sec interval %u", - bfqq->service_from_backlogged, - bfqd->bfq_wr_max_softrt_rate, - jiffies_to_msecs(HZ * bfqq->service_from_backlogged / -@@ -3602,7 +3600,7 @@ static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) - static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) - { - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "may_budget_timeout: wait_request %d left %d timeout %d", -+ "wait_request %d left %d timeout %d", - bfq_bfqq_wait_request(bfqq), - bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, - bfq_bfqq_budget_timeout(bfqq)); -@@ -3863,11 +3861,11 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) - * either boosts the throughput (without issues), or is - * necessary to preserve service guarantees. 
- */ -- bfq_log_bfqq(bfqd, bfqq, "may_idle: sync %d idling_boosts_thr %d", -+ bfq_log_bfqq(bfqd, bfqq, "sync %d idling_boosts_thr %d", - bfq_bfqq_sync(bfqq), idling_boosts_thr); - - bfq_log_bfqq(bfqd, bfqq, -- "may_idle: wr_busy %d boosts %d IO-bound %d guar %d", -+ "wr_busy %d boosts %d IO-bound %d guar %d", - bfqd->wr_busy_queues, - idling_boosts_thr_without_issues, - bfq_bfqq_IO_bound(bfqq), -@@ -3907,7 +3905,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - if (!bfqq) - goto new_queue; - -- bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); -+ bfq_log_bfqq(bfqd, bfqq, "already in-service queue"); - - if (bfq_may_expire_for_budg_timeout(bfqq) && - !bfq_bfqq_wait_request(bfqq) && -@@ -3983,14 +3981,14 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - new_queue: - bfqq = bfq_set_in_service_queue(bfqd); - if (bfqq) { -- bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue"); -+ bfq_log_bfqq(bfqd, bfqq, "checking new queue"); - goto check_queue; - } - keep_queue: - if (bfqq) -- bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue"); -+ bfq_log_bfqq(bfqd, bfqq, "returned this queue"); - else -- bfq_log(bfqd, "select_queue: no queue returned"); -+ bfq_log(bfqd, "no queue returned"); - - return bfqq; - } -@@ -4043,8 +4041,7 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) - /* see comments on max_service_from_wr */ - bfq_bfqq_end_wr(bfqq); - bfq_log_bfqq(bfqd, bfqq, -- "[%s] too much service", -- __func__); -+ "too much service"); - } - } - /* -@@ -4122,7 +4119,7 @@ static bool bfq_has_work(struct blk_mq_hw_ctx *hctx) - { - struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; - -- bfq_log(bfqd, "has_work, dispatch_non_empty %d busy_queues %d", -+ bfq_log(bfqd, "dispatch_non_empty %d busy_queues %d", - !list_empty_careful(&bfqd->dispatch), bfqd->busy_queues > 0); - - /* -@@ -4146,7 +4143,7 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx 
*hctx) - rq->rq_flags &= ~RQF_DISP_LIST; - - bfq_log(bfqd, -- "dispatch requests: picked %p from dispatch list", rq); -+ "picked %p from dispatch list", rq); - bfqq = RQ_BFQQ(rq); - - if (bfqq) { -@@ -4196,7 +4193,7 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - goto start_rq; - } - -- bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); -+ bfq_log(bfqd, "%d busy queues", bfqd->busy_queues); - - if (bfqd->busy_queues == 0) - goto exit; -@@ -4236,13 +4233,13 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - rq->rq_flags |= RQF_STARTED; - if (bfqq) - bfq_log_bfqq(bfqd, bfqq, -- "dispatched %s request %p, rq_in_driver %d", -+ "%s request %p, rq_in_driver %d", - bfq_bfqq_sync(bfqq) ? "sync" : "async", - rq, - bfqd->rq_in_driver); - else - bfq_log(bfqd, -- "dispatched request %p from dispatch list, rq_in_driver %d", -+ "request %p from dispatch list, rq_in_driver %d", - rq, bfqd->rq_in_driver); - } else - bfq_log(bfqd, -@@ -4339,7 +4336,7 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - BUG_ON(bfqq->ref <= 0); - - if (bfqq->bfqd) -- bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "%p %d", bfqq, bfqq->ref); - - bfqq->ref--; - if (bfqq->ref) -@@ -4383,10 +4380,10 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - } - - if (bfqq->bfqd) -- bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "%p freed", bfqq); - - #ifdef BFQ_GROUP_IOSCHED_ENABLED -- bfq_log_bfqq(bfqq->bfqd, bfqq, "[%s] putting blkg and bfqg %p\n", __func__, bfqg); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "putting blkg and bfqg %p\n", bfqg); - bfqg_and_blkg_put(bfqg); - #endif - kmem_cache_free(bfq_pool, bfqq); -@@ -4418,7 +4415,7 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) - bfq_schedule_dispatch(bfqd); - } - -- bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref); -+ bfq_log_bfqq(bfqd, 
bfqq, "%p, %d", bfqq, bfqq->ref); - - bfq_put_cooperator(bfqq); - -@@ -4502,7 +4499,7 @@ static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, - bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); - bfqq->entity.prio_changed = 1; - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "set_next_ioprio_data: bic_class %d prio %d class %d", -+ "bic_class %d prio %d class %d", - ioprio_class, bfqq->new_ioprio, bfqq->new_ioprio_class); - } - -@@ -4529,7 +4526,7 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) - bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); - bic_set_bfqq(bic, bfqq, false); - bfq_log_bfqq(bfqd, bfqq, -- "check_ioprio_change: bfqq %p %d", -+ "bfqq %p %d", - bfqq, bfqq->ref); - } - -@@ -4667,14 +4664,14 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, - * guarantee that this queue is not freed - * until its group goes away. - */ -- bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", -+ bfq_log_bfqq(bfqd, bfqq, "bfqq not in async: %p, %d", - bfqq, bfqq->ref); - *async_bfqq = bfqq; - } - - out: - bfqq->ref++; /* get a process reference to this queue */ -- bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref); -+ bfq_log_bfqq(bfqd, bfqq, "at end: %p, %d", bfqq, bfqq->ref); - rcu_read_unlock(); - return bfqq; - } -@@ -4733,7 +4730,7 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd, - bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle)) - has_short_ttime = false; - -- bfq_log_bfqq(bfqd, bfqq, "update_has_short_ttime: has_short_ttime %d", -+ bfq_log_bfqq(bfqd, bfqq, "has_short_ttime %d", - has_short_ttime); - - if (has_short_ttime) -@@ -4759,7 +4756,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bfq_update_io_seektime(bfqd, bfqq, rq); - - bfq_log_bfqq(bfqd, bfqq, -- "rq_enqueued: has_short_ttime=%d (seeky %d)", -+ "has_short_ttime=%d (seeky %d)", - bfq_bfqq_has_short_ttime(bfqq), BFQQ_SEEKY(bfqq)); - - bfqq->last_request_pos = 
blk_rq_pos(rq) + blk_rq_sectors(rq); -@@ -4818,7 +4815,7 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) - - assert_spin_locked(&bfqd->lock); - -- bfq_log_bfqq(bfqd, bfqq, "__insert_req: rq %p bfqq %p", rq, bfqq); -+ bfq_log_bfqq(bfqd, bfqq, "rq %p bfqq %p", rq, bfqq); - - /* - * An unplug may trigger a requeue of a request from the device -@@ -4837,9 +4834,9 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) - new_bfqq->allocated++; - bfqq->allocated--; - bfq_log_bfqq(bfqd, bfqq, -- "insert_request: new allocated %d", bfqq->allocated); -+ "new allocated %d", bfqq->allocated); - bfq_log_bfqq(bfqd, new_bfqq, -- "insert_request: new_bfqq new allocated %d", -+ "new_bfqq new allocated %d", - bfqq->allocated); - - new_bfqq->ref++; -@@ -4911,11 +4908,11 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - rq->rq_flags |= RQF_DISP_LIST; - if (bfqq) - bfq_log_bfqq(bfqd, bfqq, -- "insert_request %p in disp: at_head %d", -+ "%p in disp: at_head %d", - rq, at_head); - else - bfq_log(bfqd, -- "insert_request %p in disp: at_head %d", -+ "%p in disp: at_head %d", - rq, at_head); - } else { - BUG_ON(!(rq->rq_flags & RQF_GOT)); -@@ -5033,7 +5030,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) - bfqq->dispatched--; - - bfq_log_bfqq(bfqd, bfqq, -- "completed_requests: new disp %d, new rq_in_driver %d", -+ "new disp %d, new rq_in_driver %d", - bfqq->dispatched, bfqd->rq_in_driver); - - if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { -@@ -5061,7 +5058,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) - delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC); - - bfq_log_bfqq(bfqd, bfqq, -- "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu", -+ "delta %uus/%luus max_size %u rate %llu/%llu", - delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size, - delta_us > 0 ? 
- (USEC_PER_SEC* -@@ -5129,7 +5126,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) - static void bfq_finish_requeue_request_body(struct bfq_queue *bfqq) - { - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "put_request_body: allocated %d", bfqq->allocated); -+ "allocated %d", bfqq->allocated); - BUG_ON(!bfqq->allocated); - bfqq->allocated--; - -@@ -5406,10 +5403,10 @@ static void bfq_prepare_request(struct request *rq, struct bio *bio) - - bfqq->allocated++; - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "get_request: new allocated %d", bfqq->allocated); -+ "new allocated %d", bfqq->allocated); - - bfqq->ref++; -- bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d", rq, bfqq, bfqq->ref); -+ bfq_log_bfqq(bfqd, bfqq, "%p: bfqq %p, %d", rq, bfqq, bfqq->ref); - - rq->elv.priv[0] = bic; - rq->elv.priv[1] = bfqq; -@@ -5493,7 +5490,7 @@ static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) - idle_slice_timer); - struct bfq_queue *bfqq = bfqd->in_service_queue; - -- bfq_log(bfqd, "slice_timer expired"); -+ bfq_log(bfqd, "expired"); - - /* - * Theoretical race here: the in-service queue can be NULL or -@@ -5515,10 +5512,10 @@ static void __bfq_put_async_bfqq(struct bfq_data *bfqd, - struct bfq_group *root_group = bfqd->root_group; - struct bfq_queue *bfqq = *bfqq_ptr; - -- bfq_log(bfqd, "put_async_bfqq: %p", bfqq); -+ bfq_log(bfqd, "%p", bfqq); - if (bfqq) { - bfq_bfqq_move(bfqd, bfqq, root_group); -- bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", -+ bfq_log_bfqq(bfqd, bfqq, "putting %p, %d", - bfqq, bfqq->ref); - bfq_put_queue(bfqq); - *bfqq_ptr = NULL; -@@ -5547,7 +5544,7 @@ static void bfq_exit_queue(struct elevator_queue *e) - struct bfq_data *bfqd = e->elevator_data; - struct bfq_queue *bfqq, *n; - -- bfq_log(bfqd, "exit_queue: starting ..."); -+ bfq_log(bfqd, "starting ..."); - - hrtimer_cancel(&bfqd->idle_slice_timer); - -@@ -5575,7 +5572,7 @@ static void bfq_exit_queue(struct elevator_queue *e) - 
spin_unlock_irq(&bfqd->lock); - #endif - -- bfq_log(bfqd, "exit_queue: finished ..."); -+ bfq_log(bfqd, "finished ..."); - kfree(bfqd); - } - -diff --git a/block/bfq-mq.h b/block/bfq-mq.h -index 9a5ce1168ff5..e2ae11bf8f76 100644 ---- a/block/bfq-mq.h -+++ b/block/bfq-mq.h -@@ -712,34 +712,34 @@ static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); - static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - - #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ -- pr_crit("%s bfq%d%c %s " fmt "\n", \ -+ pr_crit("%s bfq%d%c %s [%s] " fmt "\n", \ - checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ - (bfqq)->pid, \ - bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -- bfqq_group(bfqq)->blkg_path, ##args); \ -+ bfqq_group(bfqq)->blkg_path, __func__, ##args); \ - } while (0) - - #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ -- pr_crit("%s %s " fmt "\n", \ -+ pr_crit("%s %s [%s] " fmt "\n", \ - checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -- bfqg->blkg_path, ##args); \ -+ bfqg->blkg_path, __func__, ##args); \ - } while (0) - - #else /* BFQ_GROUP_IOSCHED_ENABLED */ - - #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ -- pr_crit("%s bfq%d%c " fmt "\n", \ -+ pr_crit("%s bfq%d%c [%s] " fmt "\n", \ - checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ - (bfqq)->pid, bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -- ##args) -+ __func__, ##args) - #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) - - #endif /* BFQ_GROUP_IOSCHED_ENABLED */ - - #define bfq_log(bfqd, fmt, args...) \ -- pr_crit("%s bfq " fmt "\n", \ -+ pr_crit("%s bfq [%s] " fmt "\n", \ - checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -- ##args) -+ __func__, ##args) - - #else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ - -@@ -762,28 +762,29 @@ static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); - static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - - #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) 
do { \ -- blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, \ -+ blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s [%s] " fmt, \ - (bfqq)->pid, \ - bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -- bfqq_group(bfqq)->blkg_path, ##args); \ -+ bfqq_group(bfqq)->blkg_path, __func__, ##args); \ - } while (0) - - #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ -- blk_add_trace_msg((bfqd)->queue, "%s " fmt, bfqg->blkg_path, ##args);\ -+ blk_add_trace_msg((bfqd)->queue, "%s [%s] " fmt, bfqg->blkg_path, \ -+ __func__, ##args);\ - } while (0) - - #else /* BFQ_GROUP_IOSCHED_ENABLED */ - - #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ -- blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \ -+ blk_add_trace_msg((bfqd)->queue, "bfq%d%c [%s] " fmt, (bfqq)->pid, \ - bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -- ##args) -+ __func__, ##args) - #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) - - #endif /* BFQ_GROUP_IOSCHED_ENABLED */ - - #define bfq_log(bfqd, fmt, args...) \ -- blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) -+ blk_add_trace_msg((bfqd)->queue, "bfq [%s] " fmt, __func__, ##args) - - #endif /* CONFIG_BLK_DEV_IO_TRACE */ - #endif /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ -@@ -938,7 +939,7 @@ bfq_entity_service_tree(struct bfq_entity *entity) - - if (bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "entity_service_tree %p %d", -+ "%p %d", - sched_data->service_tree + idx, idx); - #ifdef BFQ_GROUP_IOSCHED_ENABLED - else { -@@ -946,7 +947,7 @@ bfq_entity_service_tree(struct bfq_entity *entity) - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -- "entity_service_tree %p %d", -+ "%p %d", - sched_data->service_tree + idx, idx); - } - #endif -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index 4e6c5232e2fb..ead34c30a7c2 100644 ---- a/block/bfq-sched.c -+++ b/block/bfq-sched.c -@@ -119,7 +119,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, - - if (bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, 
-- "update_next_in_service: chose without lookup"); -+ "chose without lookup"); - #ifdef BFQ_GROUP_IOSCHED_ENABLED - else { - struct bfq_group *bfqg = -@@ -127,7 +127,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, - struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data*)bfqg->bfqd, bfqg, -- "update_next_in_service: chose without lookup"); -+ "chose without lookup"); - } - #endif - } -@@ -148,7 +148,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, - bfqq = bfq_entity_to_bfqq(next_in_service); - if (bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "update_next_in_service: chosen this queue"); -+ "chosen this queue"); - #ifdef BFQ_GROUP_IOSCHED_ENABLED - else { - struct bfq_group *bfqg = -@@ -156,7 +156,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, - struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -- "update_next_in_service: chosen this entity"); -+ "chosen this entity"); - } - #endif - return parent_sched_may_change; -@@ -331,10 +331,10 @@ static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service) - - if (bfqq) { - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "calc_finish: serv %lu, w %d", -+ "serv %lu, w %d", - service, entity->weight); - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "calc_finish: start %llu, finish %llu, delta %llu", -+ "start %llu, finish %llu, delta %llu", - start, finish, delta); - #ifdef BFQ_GROUP_IOSCHED_ENABLED - } else { -@@ -342,10 +342,10 @@ static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service) - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -- "calc_finish group: serv %lu, w %d", -+ "group: serv %lu, w %d", - service, entity->weight); - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -- "calc_finish group: start %llu, finish %llu, delta %llu", -+ "group: start %llu, finish %llu, delta %llu", - start, finish, delta); - #endif - } -@@ -484,7 +484,7 @@ static 
void bfq_update_active_node(struct rb_node *node) - - if (bfqq) { - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "update_active_node: new min_start %llu", -+ "new min_start %llu", - ((entity->min_start>>10)*1000)>>12); - #ifdef BFQ_GROUP_IOSCHED_ENABLED - } else { -@@ -492,7 +492,7 @@ static void bfq_update_active_node(struct rb_node *node) - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -- "update_active_node: new min_start %llu", -+ "new min_start %llu", - ((entity->min_start>>10)*1000)>>12); - #endif - } -@@ -620,7 +620,7 @@ static void bfq_get_entity(struct bfq_entity *entity) - - if (bfqq) { - bfqq->ref++; -- bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "%p %d", - bfqq, bfqq->ref); - } - } -@@ -748,7 +748,7 @@ static void bfq_forget_entity(struct bfq_service_tree *st, - entity->on_st = false; - st->wsum -= entity->weight; - if (bfqq && !is_in_service) { -- bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity (before): %p %d", -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "(before): %p %d", - bfqq, bfqq->ref); - bfq_put_queue(bfqq); - } -@@ -1008,7 +1008,7 @@ static void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq, - tot_serv_to_charge = entity->service; - - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "charge_time: %lu/%u ms, %d/%d/%d sectors", -+ "%lu/%u ms, %d/%d/%d sectors", - time_ms, timeout_ms, entity->service, - tot_serv_to_charge, entity->budget); - -@@ -1080,7 +1080,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, - - if (bfqq) { - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "update_fin_time_enqueue: new queue finish %llu", -+ "new queue finish %llu", - ((entity->finish>>10)*1000)>>12); - #ifdef BFQ_GROUP_IOSCHED_ENABLED - } else { -@@ -1088,7 +1088,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -- "update_fin_time_enqueue: new group 
finish %llu", -+ "new group finish %llu", - ((entity->finish>>10)*1000)>>12); - #endif - } -@@ -1098,7 +1098,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, - - if (bfqq) { - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "update_fin_time_enqueue: queue %seligible in st %p", -+ "queue %seligible in st %p", - entity->start <= st->vtime ? "" : "non ", st); - #ifdef BFQ_GROUP_IOSCHED_ENABLED - } else { -@@ -1106,7 +1106,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -- "update_fin_time_enqueue: group %seligible in st %p", -+ "group %seligible in st %p", - entity->start <= st->vtime ? "" : "non ", st); - #endif - } -@@ -1550,7 +1550,7 @@ static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st) - - if (bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "calc_vtime_jump: new value %llu", -+ "new value %llu", - ((root_entity->min_start>>10)*1000)>>12); - #ifdef BFQ_GROUP_IOSCHED_ENABLED - else { -@@ -1559,7 +1559,7 @@ static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st) - entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -- "calc_vtime_jump: new value %llu", -+ "new value %llu", - ((root_entity->min_start>>10)*1000)>>12); - } - #endif -@@ -1677,7 +1677,7 @@ __bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service) - bfqq = bfq_entity_to_bfqq(entity); - if (bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "__lookup_next: start %llu vtime %llu st %p", -+ "start %llu vtime %llu st %p", - ((entity->start>>10)*1000)>>12, - ((new_vtime>>10)*1000)>>12, st); - #ifdef BFQ_GROUP_IOSCHED_ENABLED -@@ -1686,7 +1686,7 @@ __bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service) - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -- "__lookup_next: start %llu vtime %llu (%llu) st %p", -+ "start %llu vtime %llu (%llu) st %p", - ((entity->start>>10)*1000)>>12, - 
((st->vtime>>10)*1000)>>12, - ((new_vtime>>10)*1000)>>12, st); -@@ -1821,14 +1821,14 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg(bfqd, bfqg, -- "get_next_queue: lookup in this group"); -+ "lookup in this group"); - if (!sd->next_in_service) -- pr_crit("get_next_queue: lookup in this group"); -+ pr_crit("lookup in this group"); - } else { - bfq_log_bfqg(bfqd, bfqd->root_group, -- "get_next_queue: lookup in root group"); -+ "lookup in root group"); - if (!sd->next_in_service) -- pr_crit("get_next_queue: lookup in root group"); -+ pr_crit("lookup in root group"); - } - #endif - -@@ -1903,7 +1903,7 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) - bfqq = bfq_entity_to_bfqq(entity); - if (bfqq) - bfq_log_bfqq(bfqd, bfqq, -- "get_next_queue: this queue, finish %llu", -+ "this queue, finish %llu", - (((entity->finish>>10)*1000)>>10)>>2); - #ifdef BFQ_GROUP_IOSCHED_ENABLED - else { -@@ -1911,7 +1911,7 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg(bfqd, bfqg, -- "get_next_queue: this entity, finish %llu", -+ "this entity, finish %llu", - (((entity->finish>>10)*1000)>>10)>>2); - } - #endif -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index c4df156b1fb4..e49e8ac882b3 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -281,7 +281,7 @@ static void bfq_schedule_dispatch(struct bfq_data *bfqd); - static void bfq_schedule_dispatch(struct bfq_data *bfqd) - { - if (bfqd->queued != 0) { -- bfq_log(bfqd, "schedule dispatch"); -+ bfq_log(bfqd, ""); - kblockd_schedule_work(&bfqd->unplug_work); - } - } -@@ -414,7 +414,7 @@ bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, - if (rb_link) - *rb_link = p; - -- bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d", -+ bfq_log(bfqd, "%llu: returning %d", - (unsigned long long) sector, 
- bfqq ? bfqq->pid : 0); - -@@ -635,7 +635,7 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq, - if (rq == last || ktime_get_ns() < rq->fifo_time) - return NULL; - -- bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "returned %p", rq); - BUG_ON(RB_EMPTY_NODE(&rq->rb_node)); - return rq; - } -@@ -728,7 +728,7 @@ static void bfq_updated_next_req(struct bfq_data *bfqd, - bfq_serv_to_charge(next_rq, bfqq)); - if (entity->budget != new_budget) { - entity->budget = new_budget; -- bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", -+ bfq_log_bfqq(bfqd, bfqq, "new budget %lu", - new_budget); - bfq_requeue_bfqq(bfqd, bfqq, false); - } -@@ -800,8 +800,7 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, - BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); - - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "[%s] bic %p wr_coeff %d start_finish %lu max_time %lu", -- __func__, -+ "bic %p wr_coeff %d start_finish %lu max_time %lu", - bic, bfqq->wr_coeff, bfqq->last_wr_start_finish, - bfqq->wr_cur_max_time); - -@@ -814,11 +813,11 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, - bfq_wr_duration(bfqd))) { - switch_back_to_interactive_wr(bfqq, bfqd); - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "resume state: switching back to interactive"); -+ "switching back to interactive"); - } else { - bfqq->wr_coeff = 1; - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "resume state: switching off wr (%lu + %lu < %lu)", -+ "switching off wr (%lu + %lu < %lu)", - bfqq->last_wr_start_finish, bfqq->wr_cur_max_time, - jiffies); - } -@@ -870,7 +869,7 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) - /* Increment burst size to take into account also bfqq */ - bfqd->burst_size++; - -- bfq_log_bfqq(bfqd, bfqq, "add_to_burst %d", bfqd->burst_size); -+ bfq_log_bfqq(bfqd, bfqq, "%d", bfqd->burst_size); - - BUG_ON(bfqd->burst_size > bfqd->bfq_large_burst_thresh); - -@@ -883,7 +882,7 @@ 
static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) - * other to consider this burst as large. - */ - bfqd->large_burst = true; -- bfq_log_bfqq(bfqd, bfqq, "add_to_burst: large burst started"); -+ bfq_log_bfqq(bfqd, bfqq, "large burst started"); - - /* - * We can now mark all queues in the burst list as -@@ -1055,7 +1054,7 @@ static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) - bfqd->large_burst = false; - bfq_reset_burst_list(bfqd, bfqq); - bfq_log_bfqq(bfqd, bfqq, -- "handle_burst: late activation or different group"); -+ "late activation or different group"); - goto end; - } - -@@ -1065,7 +1064,7 @@ static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) - * bfqq as belonging to this large burst immediately. - */ - if (bfqd->large_burst) { -- bfq_log_bfqq(bfqd, bfqq, "handle_burst: marked in burst"); -+ bfq_log_bfqq(bfqd, bfqq, "marked in burst"); - bfq_mark_bfqq_in_large_burst(bfqq); - goto end; - } -@@ -1572,7 +1571,7 @@ static void bfq_add_request(struct request *rq) - unsigned int old_wr_coeff = bfqq->wr_coeff; - bool interactive = false; - -- bfq_log_bfqq(bfqd, bfqq, "add_request: size %u %s", -+ bfq_log_bfqq(bfqd, bfqq, "size %u %s", - blk_rq_sectors(rq), rq_is_sync(rq) ? 
"S" : "A"); - - if (bfqq->wr_coeff > 1) /* queue is being weight-raised */ -@@ -1870,10 +1869,10 @@ static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) - */ - bfqq->entity.prio_changed = 1; - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "end_wr: wrais ending at %lu, rais_max_time %u", -+ "wrais ending at %lu, rais_max_time %u", - bfqq->last_wr_start_finish, - jiffies_to_msecs(bfqq->wr_cur_max_time)); -- bfq_log_bfqq(bfqq->bfqd, bfqq, "end_wr: wr_busy %d", -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "wr_busy %d", - bfqq->bfqd->wr_busy_queues); - } - -@@ -2048,8 +2047,8 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, - { - if (bfq_too_late_for_merging(new_bfqq)) { - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "[%s] too late for bfq%d to be merged", -- __func__, new_bfqq->pid); -+ "too late for bfq%d to be merged", -+ new_bfqq->pid); - return false; - } - -@@ -2258,7 +2257,7 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, - - } - -- bfq_log_bfqq(bfqd, new_bfqq, "merge_bfqqs: wr_busy %d", -+ bfq_log_bfqq(bfqd, new_bfqq, "wr_busy %d", - bfqd->wr_busy_queues); - - /* -@@ -2359,7 +2358,7 @@ static void bfq_set_budget_timeout(struct bfq_data *bfqd, - bfqq->budget_timeout = jiffies + - bfqd->bfq_timeout * timeout_coeff; - -- bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", -+ bfq_log_bfqq(bfqd, bfqq, "%u", - jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff)); - } - -@@ -2427,10 +2426,10 @@ static void __bfq_set_in_service_queue(struct bfq_data *bfqd, - - bfq_set_budget_timeout(bfqd, bfqq); - bfq_log_bfqq(bfqd, bfqq, -- "set_in_service_queue, cur-budget = %d", -+ "cur-budget = %d", - bfqq->entity.budget); - } else -- bfq_log(bfqd, "set_in_service_queue: NULL"); -+ bfq_log(bfqd, "NULL"); - - bfqd->in_service_queue = bfqq; - } -@@ -2559,7 +2558,7 @@ static void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq - bfqd->peak_rate_samples = 0; /* full re-init on next disp. 
*/ - - bfq_log(bfqd, -- "reset_rate_computation at end, sample %u/%u tot_sects %llu", -+ "at end, sample %u/%u tot_sects %llu", - bfqd->peak_rate_samples, bfqd->sequential_samples, - bfqd->tot_sectors_dispatched); - } -@@ -2579,7 +2578,7 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) - if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES || - bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL) { - bfq_log(bfqd, -- "update_rate_reset: only resetting, delta_first %lluus samples %d", -+ "only resetting, delta_first %lluus samples %d", - bfqd->delta_from_first>>10, bfqd->peak_rate_samples); - goto reset_computation; - } -@@ -2603,7 +2602,7 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) - div_u64(bfqd->delta_from_first, NSEC_PER_USEC)); - - bfq_log(bfqd, --"update_rate_reset: tot_sects %llu delta_first %lluus rate %llu sects/s (%d)", -+"tot_sects %llu delta_first %lluus rate %llu sects/s (%d)", - bfqd->tot_sectors_dispatched, bfqd->delta_from_first>>10, - ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), - rate > 20<<BFQ_RATE_SHIFT); -@@ -2618,14 +2617,14 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) - rate <= bfqd->peak_rate) || - rate > 20<<BFQ_RATE_SHIFT) { - bfq_log(bfqd, -- "update_rate_reset: goto reset, samples %u/%u rate/peak %llu/%llu", -+ "goto reset, samples %u/%u rate/peak %llu/%llu", - bfqd->peak_rate_samples, bfqd->sequential_samples, - ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), - ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); - goto reset_computation; - } else { - bfq_log(bfqd, -- "update_rate_reset: do update, samples %u/%u rate/peak %llu/%llu", -+ "do update, samples %u/%u rate/peak %llu/%llu", - bfqd->peak_rate_samples, bfqd->sequential_samples, - ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), - ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); -@@ -2681,7 +2680,7 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) - rate /= 
divisor; /* smoothing constant alpha = 1/divisor */ - - bfq_log(bfqd, -- "update_rate_reset: divisor %d tmp_peak_rate %llu tmp_rate %u", -+ "divisor %d tmp_peak_rate %llu tmp_rate %u", - divisor, - ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT), - (u32)((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT)); -@@ -2735,7 +2734,7 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) - - if (bfqd->peak_rate_samples == 0) { /* first dispatch */ - bfq_log(bfqd, -- "update_peak_rate: goto reset, samples %d", -+ "goto reset, samples %d", - bfqd->peak_rate_samples) ; - bfq_reset_rate_computation(bfqd, rq); - goto update_last_values; /* will add one sample */ -@@ -2756,7 +2755,7 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) - if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC && - bfqd->rq_in_driver == 0) { - bfq_log(bfqd, --"update_peak_rate: jumping to updating&resetting delta_last %lluus samples %d", -+"jumping to updating&resetting delta_last %lluus samples %d", - (now_ns - bfqd->last_dispatch)>>10, - bfqd->peak_rate_samples) ; - goto update_rate_and_reset; -@@ -2782,7 +2781,7 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) - bfqd->delta_from_first = now_ns - bfqd->first_dispatch; - - bfq_log(bfqd, -- "update_peak_rate: added samples %u/%u tot_sects %llu delta_first %lluus", -+ "added samples %u/%u tot_sects %llu delta_first %lluus", - bfqd->peak_rate_samples, bfqd->sequential_samples, - bfqd->tot_sectors_dispatched, - bfqd->delta_from_first>>10); -@@ -2798,12 +2797,12 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) - bfqd->last_dispatch = now_ns; - - bfq_log(bfqd, -- "update_peak_rate: delta_first %lluus last_pos %llu peak_rate %llu", -+ "delta_first %lluus last_pos %llu peak_rate %llu", - (now_ns - bfqd->first_dispatch)>>10, - (unsigned long long) bfqd->last_position, - ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); - bfq_log(bfqd, -- 
"update_peak_rate: samples at end %d", bfqd->peak_rate_samples); -+ "samples at end %d", bfqd->peak_rate_samples); - } - - /* -@@ -2900,11 +2899,11 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, - */ - budget = 2 * min_budget; - -- bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d", -+ bfq_log_bfqq(bfqd, bfqq, "last budg %d, budg left %d", - bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); -- bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d", -+ bfq_log_bfqq(bfqd, bfqq, "last max_budg %d, min budg %d", - budget, bfq_min_budget(bfqd)); -- bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", -+ bfq_log_bfqq(bfqd, bfqq, "sync %d, seeky %d", - bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); - - if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) { -@@ -3106,7 +3105,7 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, - else /* charge at least one seek */ - *delta_ms = bfq_slice_idle / NSEC_PER_MSEC; - -- bfq_log(bfqd, "bfq_bfqq_is_slow: too short %u", delta_usecs); -+ bfq_log(bfqd, "too short %u", delta_usecs); - - return slow; - } -@@ -3129,11 +3128,11 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, - * peak rate. 
- */ - slow = bfqq->entity.service < bfqd->bfq_max_budget / 2; -- bfq_log(bfqd, "bfq_bfqq_is_slow: relative rate %d/%d", -+ bfq_log(bfqd, "relative rate %d/%d", - bfqq->entity.service, bfqd->bfq_max_budget); - } - -- bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow); -+ bfq_log_bfqq(bfqd, bfqq, "slow %d", slow); - - return slow; - } -@@ -3235,7 +3234,7 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, - struct bfq_queue *bfqq) - { - bfq_log_bfqq(bfqd, bfqq, --"softrt_next_start: service_blkg %lu soft_rate %u sects/sec interval %u", -+"service_blkg %lu soft_rate %u sects/sec interval %u", - bfqq->service_from_backlogged, - bfqd->bfq_wr_max_softrt_rate, - jiffies_to_msecs(HZ * bfqq->service_from_backlogged / -@@ -3414,7 +3413,7 @@ static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) - static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) - { - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "may_budget_timeout: wait_request %d left %d timeout %d", -+ "wait_request %d left %d timeout %d", - bfq_bfqq_wait_request(bfqq), - bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, - bfq_bfqq_budget_timeout(bfqq)); -@@ -3675,11 +3674,11 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) - * either boosts the throughput (without issues), or is - * necessary to preserve service guarantees. 
- */ -- bfq_log_bfqq(bfqd, bfqq, "may_idle: sync %d idling_boosts_thr %d", -+ bfq_log_bfqq(bfqd, bfqq, "sync %d idling_boosts_thr %d", - bfq_bfqq_sync(bfqq), idling_boosts_thr); - - bfq_log_bfqq(bfqd, bfqq, -- "may_idle: wr_busy %d boosts %d IO-bound %d guar %d", -+ "wr_busy %d boosts %d IO-bound %d guar %d", - bfqd->wr_busy_queues, - idling_boosts_thr_without_issues, - bfq_bfqq_IO_bound(bfqq), -@@ -3719,7 +3718,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - if (!bfqq) - goto new_queue; - -- bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); -+ bfq_log_bfqq(bfqd, bfqq, "already in-service queue"); - - if (bfq_may_expire_for_budg_timeout(bfqq) && - !hrtimer_active(&bfqd->idle_slice_timer) && -@@ -3797,14 +3796,14 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - new_queue: - bfqq = bfq_set_in_service_queue(bfqd); - if (bfqq) { -- bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue"); -+ bfq_log_bfqq(bfqd, bfqq, "checking new queue"); - goto check_queue; - } - keep_queue: - if (bfqq) -- bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue"); -+ bfq_log_bfqq(bfqd, bfqq, "returned this queue"); - else -- bfq_log(bfqd, "select_queue: no queue returned"); -+ bfq_log(bfqd, "no queue returned"); - - return bfqq; - } -@@ -3857,8 +3856,7 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) - /* see comments on max_service_from_wr */ - bfq_bfqq_end_wr(bfqq); - bfq_log_bfqq(bfqd, bfqq, -- "[%s] too much service", -- __func__); -+ "too much service"); - } - } - /* -@@ -3987,7 +3985,7 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) - struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_queue *bfqq; - -- bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); -+ bfq_log(bfqd, "%d busy queues", bfqd->busy_queues); - - if (bfqd->busy_queues == 0) - return 0; -@@ -4021,7 +4019,7 @@ static int bfq_dispatch_requests(struct 
request_queue *q, int force) - if (!bfq_dispatch_request(bfqd, bfqq)) - return 0; - -- bfq_log_bfqq(bfqd, bfqq, "dispatched %s request", -+ bfq_log_bfqq(bfqd, bfqq, "%s request", - bfq_bfqq_sync(bfqq) ? "sync" : "async"); - - BUG_ON(bfqq->next_rq == NULL && -@@ -4044,7 +4042,7 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - - BUG_ON(bfqq->ref <= 0); - -- bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "%p %d", bfqq, bfqq->ref); - bfqq->ref--; - if (bfqq->ref) - return; -@@ -4086,7 +4084,7 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - bfqq->bfqd->burst_size--; - } - -- bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "%p freed", bfqq); - - kmem_cache_free(bfq_pool, bfqq); - #ifdef BFQ_GROUP_IOSCHED_ENABLED -@@ -4120,7 +4118,7 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) - bfq_schedule_dispatch(bfqd); - } - -- bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref); -+ bfq_log_bfqq(bfqd, bfqq, "%p, %d", bfqq, bfqq->ref); - - bfq_put_cooperator(bfqq); - -@@ -4200,7 +4198,7 @@ static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, - bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); - bfqq->entity.prio_changed = 1; - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "set_next_ioprio_data: bic_class %d prio %d class %d", -+ "bic_class %d prio %d class %d", - ioprio_class, bfqq->new_ioprio, bfqq->new_ioprio_class); - } - -@@ -4227,7 +4225,7 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) - bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); - bic_set_bfqq(bic, bfqq, false); - bfq_log_bfqq(bfqd, bfqq, -- "check_ioprio_change: bfqq %p %d", -+ "bfqq %p %d", - bfqq, bfqq->ref); - } - -@@ -4362,14 +4360,14 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, - * guarantee that this queue is not freed - * until its group goes away. 
- */ -- bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", -+ bfq_log_bfqq(bfqd, bfqq, "bfqq not in async: %p, %d", - bfqq, bfqq->ref); - *async_bfqq = bfqq; - } - - out: - bfqq->ref++; /* get a process reference to this queue */ -- bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref); -+ bfq_log_bfqq(bfqd, bfqq, "at end: %p, %d", bfqq, bfqq->ref); - rcu_read_unlock(); - return bfqq; - } -@@ -4428,7 +4426,7 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd, - bic->ttime.ttime_mean > bfqd->bfq_slice_idle)) - has_short_ttime = false; - -- bfq_log_bfqq(bfqd, bfqq, "update_has_short_ttime: has_short_ttime %d", -+ bfq_log_bfqq(bfqd, bfqq, "has_short_ttime %d", - has_short_ttime); - - if (has_short_ttime) -@@ -4454,7 +4452,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bfq_update_io_seektime(bfqd, bfqq, rq); - - bfq_log_bfqq(bfqd, bfqq, -- "rq_enqueued: has_short_ttime=%d (seeky %d)", -+ "has_short_ttime=%d (seeky %d)", - bfq_bfqq_has_short_ttime(bfqq), BFQQ_SEEKY(bfqq)); - - bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); -@@ -4629,7 +4627,7 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) - */ - delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC); - -- bfq_log(bfqd, "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu", -+ bfq_log(bfqd, "delta %uus/%luus max_size %u rate %llu/%llu", - delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size, - delta_us > 0 ? 
- (USEC_PER_SEC* -@@ -4750,7 +4748,7 @@ static void bfq_put_request(struct request *rq) - rq->elv.priv[0] = NULL; - rq->elv.priv[1] = NULL; - -- bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "%p, %d", - bfqq, bfqq->ref); - bfq_put_queue(bfqq); - } -@@ -4816,7 +4814,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, - bic_set_bfqq(bic, bfqq, is_sync); - if (split && is_sync) { - bfq_log_bfqq(bfqd, bfqq, -- "set_request: was_in_list %d " -+ "was_in_list %d " - "was_in_large_burst %d " - "large burst in progress %d", - bic->was_in_burst_list, -@@ -4826,12 +4824,12 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, - if ((bic->was_in_burst_list && bfqd->large_burst) || - bic->saved_in_large_burst) { - bfq_log_bfqq(bfqd, bfqq, -- "set_request: marking in " -+ "marking in " - "large burst"); - bfq_mark_bfqq_in_large_burst(bfqq); - } else { - bfq_log_bfqq(bfqd, bfqq, -- "set_request: clearing in " -+ "clearing in " - "large burst"); - bfq_clear_bfqq_in_large_burst(bfqq); - if (bic->was_in_burst_list) -@@ -4888,7 +4886,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, - - bfqq->allocated[rw]++; - bfqq->ref++; -- bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, bfqq->ref); -+ bfq_log_bfqq(bfqd, bfqq, "bfqq %p, %d", bfqq, bfqq->ref); - - rq->elv.priv[0] = bic; - rq->elv.priv[1] = bfqq; -@@ -4962,7 +4960,7 @@ static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) - * case we just expire a queue too early. 
- */ - if (bfqq) { -- bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); -+ bfq_log_bfqq(bfqd, bfqq, "expired"); - bfq_clear_bfqq_wait_request(bfqq); - - if (bfq_bfqq_budget_timeout(bfqq)) -@@ -5005,10 +5003,10 @@ static void __bfq_put_async_bfqq(struct bfq_data *bfqd, - struct bfq_group *root_group = bfqd->root_group; - struct bfq_queue *bfqq = *bfqq_ptr; - -- bfq_log(bfqd, "put_async_bfqq: %p", bfqq); -+ bfq_log(bfqd, "%p", bfqq); - if (bfqq) { - bfq_bfqq_move(bfqd, bfqq, root_group); -- bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", -+ bfq_log_bfqq(bfqd, bfqq, "putting %p, %d", - bfqq, bfqq->ref); - bfq_put_queue(bfqq); - *bfqq_ptr = NULL; -diff --git a/block/bfq.h b/block/bfq.h -index 0cd7a3f251a7..4d2fe7f77af1 100644 ---- a/block/bfq.h -+++ b/block/bfq.h -@@ -698,37 +698,37 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - \ - assert_spin_locked((bfqd)->queue->queue_lock); \ - blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ -- pr_crit("%s bfq%d%c %s " fmt "\n", \ -+ pr_crit("%s bfq%d%c %s [%s] " fmt "\n", \ - checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ - (bfqq)->pid, \ - bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -- __pbuf, ##args); \ -+ __pbuf, __func__, ##args); \ - } while (0) - - #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ - char __pbuf[128]; \ - \ - blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ -- pr_crit("%s %s " fmt "\n", \ -+ pr_crit("%s %s [%s] " fmt "\n", \ - checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -- __pbuf, ##args); \ -+ __pbuf, __func__, ##args); \ - } while (0) - - #else /* BFQ_GROUP_IOSCHED_ENABLED */ - - #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ -- pr_crit("%s bfq%d%c " fmt "\n", \ -+ pr_crit("%s bfq%d%c [%s] " fmt "\n", \ - checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ - (bfqq)->pid, bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -- ##args) -+ __func__, ##args) - #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) 
do {} while (0) - - #endif /* BFQ_GROUP_IOSCHED_ENABLED */ - - #define bfq_log(bfqd, fmt, args...) \ -- pr_crit("%s bfq " fmt "\n", \ -+ pr_crit("%s bfq [%s] " fmt "\n", \ - checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -- ##args) -+ __func__, ##args) - - #else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ - -@@ -755,31 +755,32 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - \ - assert_spin_locked((bfqd)->queue->queue_lock); \ - blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ -- blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, \ -+ blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s [%s] " fmt, \ - (bfqq)->pid, \ - bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -- __pbuf, ##args); \ -+ __pbuf, __func__, ##args); \ - } while (0) - - #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ - char __pbuf[128]; \ - \ - blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ -- blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \ -+ blk_add_trace_msg((bfqd)->queue, "%s [%s] " fmt, __pbuf, \ -+ __func__, ##args); \ - } while (0) - - #else /* BFQ_GROUP_IOSCHED_ENABLED */ - - #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ -- blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \ -+ blk_add_trace_msg((bfqd)->queue, "bfq%d%c [%s] " fmt, (bfqq)->pid, \ - bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -- ##args) -+ __func__, ##args) - #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) - - #endif /* BFQ_GROUP_IOSCHED_ENABLED */ - - #define bfq_log(bfqd, fmt, args...) 
\ -- blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) -+ blk_add_trace_msg((bfqd)->queue, "bfq [%s] " fmt, __func__, ##args) - - #endif /* CONFIG_BLK_DEV_IO_TRACE */ - #endif /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ -@@ -928,7 +929,7 @@ bfq_entity_service_tree(struct bfq_entity *entity) - - if (bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "entity_service_tree %p %d", -+ "%p %d", - sched_data->service_tree + idx, idx); - #ifdef BFQ_GROUP_IOSCHED_ENABLED - else { -@@ -936,7 +937,7 @@ bfq_entity_service_tree(struct bfq_entity *entity) - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -- "entity_service_tree %p %d", -+ "%p %d", - sched_data->service_tree + idx, idx); - } - #endif - -From 673a457e8a54d1c4b66e61b1a50956ba0b8c6a60 Mon Sep 17 00:00:00 2001 -From: Davide Paganelli <paga.david@gmail.com> -Date: Thu, 8 Feb 2018 11:49:58 +0100 -Subject: [PATCH 19/23] block, bfq-mq, bfq-sq: make bfq_bfqq_expire print - expiration reason - -Improve readability of the log messages related to the expiration -reasons of the function bfq_bfqq_expire. -Change the printing of the number that represents the reason for -expiration with an actual textual description of the reason. 
- -Signed-off-by: Davide Paganelli <paga.david@gmail.com> -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 10 ++++++++-- - block/bfq-sq-iosched.c | 10 ++++++++-- - 2 files changed, 16 insertions(+), 4 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index edc93b6af186..9268dd47a4e5 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -133,6 +133,12 @@ static const int bfq_timeout = (HZ / 8); - */ - static const unsigned long bfq_merge_time_limit = HZ/10; - -+#define MAX_LENGTH_REASON_NAME 25 -+ -+static const char reason_name[][MAX_LENGTH_REASON_NAME] = {"TOO_IDLE", -+"BUDGET_TIMEOUT", "BUDGET_EXHAUSTED", "NO_MORE_REQUESTS", -+"PREEMPTED"}; -+ - static struct kmem_cache *bfq_pool; - - /* Below this threshold (in ns), we consider thinktime immediate. */ -@@ -3553,8 +3559,8 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, - } - - bfq_log_bfqq(bfqd, bfqq, -- "expire (%d, slow %d, num_disp %d, short_ttime %d, weight %d)", -- reason, slow, bfqq->dispatched, -+ "expire (%s, slow %d, num_disp %d, short_ttime %d, weight %d)", -+ reason_name[reason], slow, bfqq->dispatched, - bfq_bfqq_has_short_ttime(bfqq), entity->weight); - - /* -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index e49e8ac882b3..f95deaab49a1 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -127,6 +127,12 @@ static const int bfq_timeout = (HZ / 8); - */ - static const unsigned long bfq_merge_time_limit = HZ/10; - -+#define MAX_LENGTH_REASON_NAME 25 -+ -+static const char reason_name[][MAX_LENGTH_REASON_NAME] = {"TOO_IDLE", -+"BUDGET_TIMEOUT", "BUDGET_EXHAUSTED", "NO_MORE_REQUESTS", -+"PREEMPTED"}; -+ - static struct kmem_cache *bfq_pool; - - /* Below this threshold (in ns), we consider thinktime immediate. 
*/ -@@ -3366,8 +3372,8 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, - } - - bfq_log_bfqq(bfqd, bfqq, -- "expire (%d, slow %d, num_disp %d, short_ttime %d, weight %d)", -- reason, slow, bfqq->dispatched, -+ "expire (%s, slow %d, num_disp %d, short_ttime %d, weight %d)", -+ reason_name[reason], slow, bfqq->dispatched, - bfq_bfqq_has_short_ttime(bfqq), entity->weight); - - /* - -From 62e80623fbb58367c3f667dab22fea0804001f3b Mon Sep 17 00:00:00 2001 -From: Melzani Alessandro <melzani.alessandro@gmail.com> -Date: Mon, 26 Feb 2018 22:21:59 +0100 -Subject: [PATCH 20/23] bfq-mq: port of "block, bfq: remove batches of - confusing ifdefs" - -Commit a33801e8b473 ("block, bfq: move debug blkio stats behind -CONFIG_DEBUG_BLK_CGROUP") introduced two batches of confusing ifdefs: -one reported in [1], plus a similar one in another function. This -commit removes both batches, in the way suggested in [1]. - -[1] https://www.spinics.net/lists/linux-block/msg20043.html - -Fixes: a33801e8b473 ("block, bfq: move debug blkio stats behind CONFIG_DEBUG_BLK_CGROUP") - -Signed-off-by: Alessandro Melzani <melzani.alessandro@gmail.com> ---- - block/bfq-mq-iosched.c | 128 ++++++++++++++++++++++++++++--------------------- - 1 file changed, 73 insertions(+), 55 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 9268dd47a4e5..5a211620f316 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -4256,35 +4256,17 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - return rq; - } - --static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) --{ -- struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; -- struct request *rq; --#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) -- struct bfq_queue *in_serv_queue, *bfqq; -- bool waiting_rq, idle_timer_disabled; --#endif - -- spin_lock_irq(&bfqd->lock); -- --#if defined(BFQ_GROUP_IOSCHED_ENABLED) && 
defined(CONFIG_DEBUG_BLK_CGROUP) -- in_serv_queue = bfqd->in_service_queue; -- waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); -- -- rq = __bfq_dispatch_request(hctx); -- -- idle_timer_disabled = -- waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); -- --#else -- rq = __bfq_dispatch_request(hctx); --#endif -- spin_unlock_irq(&bfqd->lock); -+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) -+static void bfq_update_dispatch_stats(struct request_queue *q, -+ struct request *rq, -+ struct bfq_queue *in_serv_queue, -+ bool idle_timer_disabled) -+{ -+ struct bfq_queue *bfqq = rq ? RQ_BFQQ(rq) : NULL; - --#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) -- bfqq = rq ? RQ_BFQQ(rq) : NULL; - if (!idle_timer_disabled && !bfqq) -- return rq; -+ return; - - /* - * rq and bfqq are guaranteed to exist until this function -@@ -4299,7 +4281,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - * In addition, the following queue lock guarantees that - * bfqq_group(bfqq) exists as well. 
- */ -- spin_lock_irq(hctx->queue->queue_lock); -+ spin_lock_irq(q->queue_lock); - if (idle_timer_disabled) - /* - * Since the idle timer has been disabled, -@@ -4318,8 +4300,35 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - bfqg_stats_set_start_empty_time(bfqg); - bfqg_stats_update_io_remove(bfqg, rq->cmd_flags); - } -- spin_unlock_irq(hctx->queue->queue_lock); -+ spin_unlock_irq(q->queue_lock); -+} -+#else -+static inline void bfq_update_dispatch_stats(struct request_queue *q, -+ struct request *rq, -+ struct bfq_queue *in_serv_queue, -+ bool idle_timer_disabled) {} - #endif -+static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) -+{ -+ struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; -+ struct request *rq; -+ struct bfq_queue *in_serv_queue; -+ bool waiting_rq, idle_timer_disabled; -+ -+ spin_lock_irq(&bfqd->lock); -+ -+ in_serv_queue = bfqd->in_service_queue; -+ waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); -+ -+ rq = __bfq_dispatch_request(hctx); -+ -+ idle_timer_disabled = -+ waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); -+ -+ spin_unlock_irq(&bfqd->lock); -+ -+ bfq_update_dispatch_stats(hctx->queue, rq, in_serv_queue, -+ idle_timer_disabled); - - return rq; - } -@@ -4881,6 +4890,38 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) - return idle_timer_disabled; - } - -+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) -+static void bfq_update_insert_stats(struct request_queue *q, -+ struct bfq_queue *bfqq, -+ bool idle_timer_disabled, -+ unsigned int cmd_flags) -+{ -+ if (!bfqq) -+ return; -+ -+ /* -+ * bfqq still exists, because it can disappear only after -+ * either it is merged with another queue, or the process it -+ * is associated with exits. But both actions must be taken by -+ * the same process currently executing this flow of -+ * instructions. 
-+ * -+ * In addition, the following queue lock guarantees that -+ * bfqq_group(bfqq) exists as well. -+ */ -+ spin_lock_irq(q->queue_lock); -+ bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags); -+ if (idle_timer_disabled) -+ bfqg_stats_update_idle_time(bfqq_group(bfqq)); -+ spin_unlock_irq(q->queue_lock); -+} -+#else -+static inline void bfq_update_insert_stats(struct request_queue *q, -+ struct bfq_queue *bfqq, -+ bool idle_timer_disabled, -+ unsigned int cmd_flags) {} -+#endif -+ - static void bfq_prepare_request(struct request *rq, struct bio *bio); - - static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, -@@ -4889,10 +4930,8 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - struct request_queue *q = hctx->queue; - struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_queue *bfqq = RQ_BFQQ(rq); --#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) - bool idle_timer_disabled = false; - unsigned int cmd_flags; --#endif - - spin_lock_irq(&bfqd->lock); - if (blk_mq_sched_try_insert_merge(q, rq)) { -@@ -4938,7 +4977,6 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - bfqq = RQ_BFQQ(rq); - } - --#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) - idle_timer_disabled = __bfq_insert_request(bfqd, rq); - /* - * Update bfqq, because, if a queue merge has occurred -@@ -4946,9 +4984,6 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - * redirected into a new queue. 
- */ - bfqq = RQ_BFQQ(rq); --#else -- __bfq_insert_request(bfqd, rq); --#endif - - if (rq_mergeable(rq)) { - elv_rqhash_add(q, rq); -@@ -4956,34 +4991,17 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - q->last_merge = rq; - } - } --#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) -+ - /* - * Cache cmd_flags before releasing scheduler lock, because rq - * may disappear afterwards (for example, because of a request - * merge). - */ - cmd_flags = rq->cmd_flags; --#endif -+ - spin_unlock_irq(&bfqd->lock); --#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) -- if (!bfqq) -- return; -- /* -- * bfqq still exists, because it can disappear only after -- * either it is merged with another queue, or the process it -- * is associated with exits. But both actions must be taken by -- * the same process currently executing this flow of -- * instruction. -- * -- * In addition, the following queue lock guarantees that -- * bfqq_group(bfqq) exists as well. -- */ -- spin_lock_irq(q->queue_lock); -- bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags); -- if (idle_timer_disabled) -- bfqg_stats_update_idle_time(bfqq_group(bfqq)); -- spin_unlock_irq(q->queue_lock); --#endif -+ bfq_update_insert_stats(q, bfqq, idle_timer_disabled, -+ cmd_flags); - } - - static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, - -From 0d0d05632872b226f4fae5e56af8736a4c24bf57 Mon Sep 17 00:00:00 2001 -From: Melzani Alessandro <melzani.alessandro@gmail.com> -Date: Mon, 26 Feb 2018 22:43:30 +0100 -Subject: [PATCH 21/23] bfq-sq, bfq-mq: port of "bfq: Use icq_to_bic() - consistently" - -Some code uses icq_to_bic() to convert an io_cq pointer to a -bfq_io_cq pointer while other code uses a direct cast. Convert -the code that uses a direct cast such that it uses icq_to_bic(). 
- -Signed-off-by: Alessandro Melzani <melzani.alessandro@gmail.com> ---- - block/bfq-mq-iosched.c | 2 +- - block/bfq-sq-iosched.c | 2 +- - 2 files changed, 2 insertions(+), 2 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 5a211620f316..7b1269558c47 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -272,7 +272,7 @@ static const unsigned long max_service_from_wr = 120000; - #define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ - { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) - --#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0]) -+#define RQ_BIC(rq) icq_to_bic((rq)->elv.priv[0]) - #define RQ_BFQQ(rq) ((rq)->elv.priv[1]) - - /** -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index f95deaab49a1..c4aff8d55fc4 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -266,7 +266,7 @@ static const unsigned long max_service_from_wr = 120000; - #define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ - { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) - --#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0]) -+#define RQ_BIC(rq) icq_to_bic((rq)->elv.priv[0]) - #define RQ_BFQQ(rq) ((rq)->elv.priv[1]) - - static void bfq_schedule_dispatch(struct bfq_data *bfqd); - -From 4cb5de6add7d6ad0d25d73cb95dc871305db1522 Mon Sep 17 00:00:00 2001 -From: Melzani Alessandro <melzani.alessandro@gmail.com> -Date: Mon, 26 Feb 2018 22:59:30 +0100 -Subject: [PATCH 22/23] bfq-sq, bfq-mq: port of "block, bfq: fix error handle - in bfq_init" - -if elv_register fail, bfq_pool should be free. 
- -Signed-off-by: Alessandro Melzani <melzani.alessandro@gmail.com> ---- - block/bfq-mq-iosched.c | 4 +++- - block/bfq-sq-iosched.c | 4 +++- - 2 files changed, 6 insertions(+), 2 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 7b1269558c47..964e88c2ce59 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -6129,7 +6129,7 @@ static int __init bfq_init(void) - - ret = elv_register(&iosched_bfq_mq); - if (ret) -- goto err_pol_unreg; -+ goto slab_kill; - - #ifdef BFQ_GROUP_IOSCHED_ENABLED - strcat(msg, " (with cgroups support)"); -@@ -6138,6 +6138,8 @@ static int __init bfq_init(void) - - return 0; - -+slab_kill: -+ bfq_slab_kill(); - err_pol_unreg: - #ifdef BFQ_GROUP_IOSCHED_ENABLED - blkcg_policy_unregister(&blkcg_policy_bfq); -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index c4aff8d55fc4..7f0cf1f01ffc 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -5590,7 +5590,7 @@ static int __init bfq_init(void) - - ret = elv_register(&iosched_bfq); - if (ret) -- goto err_pol_unreg; -+ goto slab_kill; - - #ifdef BFQ_GROUP_IOSCHED_ENABLED - strcat(msg, " (with cgroups support)"); -@@ -5599,6 +5599,8 @@ static int __init bfq_init(void) - - return 0; - -+slab_kill: -+ bfq_slab_kill(); - err_pol_unreg: - #ifdef BFQ_GROUP_IOSCHED_ENABLED - blkcg_policy_unregister(&blkcg_policy_bfq); - -From 1f77c173aaa87ffb22c9f062a6449245d14311e4 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Wed, 4 Apr 2018 11:28:16 +0200 -Subject: [PATCH 23/23] block, bfq-sq, bfq-mq: lower-bound the estimated peak - rate to 1 - -If a storage device handled by BFQ happens to be slower than 7.5 KB/s -for a certain amount of time (in the order of a second), then the -estimated peak rate of the device, maintained in BFQ, becomes equal to -0. 
The reason is the limited precision with which the rate is -represented (details on the range of representable values in the -comments introduced by this commit). This leads to a division-by-zero -error where the estimated peak rate is used as divisor. Such a type of -failure has been reported in [1]. - -This commit addresses this issue by: -1. Lower-bounding the estimated peak rate to 1 -2. Adding and improving comments on the range of rates representable - -[1] https://www.spinics.net/lists/kernel/msg2739205.html - -Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru> -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 25 ++++++++++++++++++++++++- - block/bfq-mq.h | 7 ++++++- - block/bfq-sq-iosched.c | 25 ++++++++++++++++++++++++- - block/bfq.h | 7 ++++++- - 4 files changed, 60 insertions(+), 4 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 964e88c2ce59..03efd90c5d20 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -160,7 +160,20 @@ static struct kmem_cache *bfq_pool; - /* Target observation time interval for a peak-rate update (ns) */ - #define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC - --/* Shift used for peak rate fixed precision calculations. */ -+/* -+ * Shift used for peak-rate fixed precision calculations. 
-+ * With -+ * - the current shift: 16 positions -+ * - the current type used to store rate: u32 -+ * - the current unit of measure for rate: [sectors/usec], or, more precisely, -+ * [(sectors/usec) / 2^BFQ_RATE_SHIFT] to take into account the shift, -+ * the range of rates that can be stored is -+ * [1 / 2^BFQ_RATE_SHIFT, 2^(32 - BFQ_RATE_SHIFT)] sectors/usec = -+ * [1 / 2^16, 2^16] sectors/usec = [15e-6, 65536] sectors/usec = -+ * [15, 65G] sectors/sec -+ * Which, assuming a sector size of 512B, corresponds to a range of -+ * [7.5K, 33T] B/sec -+ */ - #define BFQ_RATE_SHIFT 16 - - /* -@@ -2881,6 +2894,16 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) - BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT); - - bfqd->peak_rate += rate; -+ -+ /* -+ * For a very slow device, bfqd->peak_rate can reach 0 (see -+ * the minimum representable values reported in the comments -+ * on BFQ_RATE_SHIFT). Push to 1 if this happens, to avoid -+ * divisions by zero where bfqd->peak_rate is used as a -+ * divisor. -+ */ -+ bfqd->peak_rate = max_t(u32, 1, bfqd->peak_rate); -+ - update_thr_responsiveness_params(bfqd); - BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT); - -diff --git a/block/bfq-mq.h b/block/bfq-mq.h -index e2ae11bf8f76..4a54e5076863 100644 ---- a/block/bfq-mq.h -+++ b/block/bfq-mq.h -@@ -490,7 +490,12 @@ struct bfq_data { - u32 last_rq_max_size; - /* time elapsed from first dispatch in current observ. interval (us) */ - u64 delta_from_first; -- /* current estimate of device peak rate */ -+ /* -+ * Current estimate of the device peak rate, measured in -+ * [(sectors/usec) / 2^BFQ_RATE_SHIFT]. The left-shift by -+ * BFQ_RATE_SHIFT is performed to increase precision in -+ * fixed-point calculations. 
-+ */ - u32 peak_rate; - - /* maximum budget allotted to a bfq_queue before rescheduling */ -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index 7f0cf1f01ffc..e96213865fc2 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -154,7 +154,20 @@ static struct kmem_cache *bfq_pool; - /* Target observation time interval for a peak-rate update (ns) */ - #define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC - --/* Shift used for peak rate fixed precision calculations. */ -+/* -+ * Shift used for peak-rate fixed precision calculations. -+ * With -+ * - the current shift: 16 positions -+ * - the current type used to store rate: u32 -+ * - the current unit of measure for rate: [sectors/usec], or, more precisely, -+ * [(sectors/usec) / 2^BFQ_RATE_SHIFT] to take into account the shift, -+ * the range of rates that can be stored is -+ * [1 / 2^BFQ_RATE_SHIFT, 2^(32 - BFQ_RATE_SHIFT)] sectors/usec = -+ * [1 / 2^16, 2^16] sectors/usec = [15e-6, 65536] sectors/usec = -+ * [15, 65G] sectors/sec -+ * Which, assuming a sector size of 512B, corresponds to a range of -+ * [7.5K, 33T] B/sec -+ */ - #define BFQ_RATE_SHIFT 16 - - /* -@@ -2695,6 +2708,16 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) - BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT); - - bfqd->peak_rate += rate; -+ -+ /* -+ * For a very slow device, bfqd->peak_rate can reach 0 (see -+ * the minimum representable values reported in the comments -+ * on BFQ_RATE_SHIFT). Push to 1 if this happens, to avoid -+ * divisions by zero where bfqd->peak_rate is used as a -+ * divisor. -+ */ -+ bfqd->peak_rate = max_t(u32, 1, bfqd->peak_rate); -+ - update_thr_responsiveness_params(bfqd); - BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT); - -diff --git a/block/bfq.h b/block/bfq.h -index 4d2fe7f77af1..a25e76c906d9 100644 ---- a/block/bfq.h -+++ b/block/bfq.h -@@ -498,7 +498,12 @@ struct bfq_data { - u32 last_rq_max_size; - /* time elapsed from first dispatch in current observ. 
interval (us) */ - u64 delta_from_first; -- /* current estimate of device peak rate */ -+ /* -+ * Current estimate of the device peak rate, measured in -+ * [(sectors/usec) / 2^BFQ_RATE_SHIFT]. The left-shift by -+ * BFQ_RATE_SHIFT is performed to increase precision in -+ * fixed-point calculations. -+ */ - u32 peak_rate; - - /* maximum budget allotted to a bfq_queue before rescheduling */ diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.14-redcore-lts-amd64.config b/sys-kernel/linux-image-redcore-lts/files/4.14-amd64.config index 23e35863..307b0bd9 100644 --- a/sys-kernel/linux-sources-redcore-lts/files/4.14-redcore-lts-amd64.config +++ b/sys-kernel/linux-image-redcore-lts/files/4.14-amd64.config @@ -1,6 +1,6 @@ # # Automatically generated file; DO NOT EDIT. -# Linux/x86 4.14.90-redcore-lts Kernel Configuration +# Linux/x86 4.14.95-redcore-lts-r1 Kernel Configuration # CONFIG_64BIT=y CONFIG_X86_64=y @@ -432,15 +432,10 @@ CONFIG_IOSCHED_NOOP=y CONFIG_IOSCHED_DEADLINE=y CONFIG_IOSCHED_CFQ=y CONFIG_CFQ_GROUP_IOSCHED=y -CONFIG_IOSCHED_BFQ_SQ=y -CONFIG_BFQ_SQ_GROUP_IOSCHED=y # CONFIG_DEFAULT_DEADLINE is not set -# CONFIG_DEFAULT_CFQ is not set -CONFIG_DEFAULT_BFQ_SQ=y +CONFIG_DEFAULT_CFQ=y # CONFIG_DEFAULT_NOOP is not set -CONFIG_DEFAULT_IOSCHED="bfq-sq" -CONFIG_MQ_IOSCHED_BFQ=y -CONFIG_MQ_BFQ_GROUP_IOSCHED=y +CONFIG_DEFAULT_IOSCHED="cfq" CONFIG_MQ_IOSCHED_DEADLINE=y # CONFIG_MQ_IOSCHED_KYBER is not set CONFIG_IOSCHED_BFQ=y diff --git a/sys-kernel/linux-image-redcore-lts/files/4.19-redcore-lts-amd64.config b/sys-kernel/linux-image-redcore-lts/files/4.19-amd64.config index c5bedf65..f0565a81 100644 --- a/sys-kernel/linux-image-redcore-lts/files/4.19-redcore-lts-amd64.config +++ b/sys-kernel/linux-image-redcore-lts/files/4.19-amd64.config @@ -1,6 +1,6 @@ # # Automatically generated file; DO NOT EDIT. 
-# Linux/x86 4.19.20-redcore-lts Kernel Configuration +# Linux/x86 4.19.20-redcore-lts-r1 Kernel Configuration # # @@ -937,15 +937,10 @@ CONFIG_IOSCHED_NOOP=y CONFIG_IOSCHED_DEADLINE=y CONFIG_IOSCHED_CFQ=y CONFIG_CFQ_GROUP_IOSCHED=y -CONFIG_IOSCHED_BFQ_SQ=y -CONFIG_BFQ_SQ_GROUP_IOSCHED=y # CONFIG_DEFAULT_DEADLINE is not set -# CONFIG_DEFAULT_CFQ is not set -CONFIG_DEFAULT_BFQ_SQ=y +CONFIG_DEFAULT_CFQ=y # CONFIG_DEFAULT_NOOP is not set -CONFIG_DEFAULT_IOSCHED="bfq-sq" -CONFIG_MQ_IOSCHED_BFQ=y -CONFIG_MQ_BFQ_GROUP_IOSCHED=y +CONFIG_DEFAULT_IOSCHED="cfq" CONFIG_MQ_IOSCHED_DEADLINE=y # CONFIG_MQ_IOSCHED_KYBER is not set CONFIG_IOSCHED_BFQ=y @@ -7564,7 +7559,7 @@ CONFIG_GREYBUS_SDIO=m CONFIG_GREYBUS_SPI=m CONFIG_GREYBUS_UART=m CONFIG_GREYBUS_USB=m -# CONFIG_DRM_VBOXVIDEO is not set +CONFIG_DRM_VBOXVIDEO=m CONFIG_PI433=m CONFIG_MTK_MMC=m # CONFIG_MTK_AEE_KDUMP is not set diff --git a/sys-kernel/linux-image-redcore-lts/files/4.19-bfq-sq-mq-v9r1-2K190204-rc1.patch b/sys-kernel/linux-image-redcore-lts/files/4.19-bfq-sq-mq-v9r1-2K190204-rc1.patch deleted file mode 100644 index 039c8fcd..00000000 --- a/sys-kernel/linux-image-redcore-lts/files/4.19-bfq-sq-mq-v9r1-2K190204-rc1.patch +++ /dev/null @@ -1,18511 +0,0 @@ -diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt -index 8d8d8f06cab2..41d0200944f1 100644 ---- a/Documentation/block/bfq-iosched.txt -+++ b/Documentation/block/bfq-iosched.txt -@@ -1,3 +1,6 @@ -+[ THIS TREE CONTAINS ALSO THE DEV VERSION OF BFQ. -+ DETAILS AT THE END OF THIS DOCUMENT. ] -+ - BFQ (Budget Fair Queueing) - ========================== - -@@ -11,6 +14,15 @@ controllers), BFQ's main features are: - groups (switching back to time distribution when needed to keep - throughput high). 
- -+If bfq-mq patches have been applied, then the following three -+instances of BFQ are available (otherwise only the first instance): -+- bfq: mainline version of BFQ, for blk-mq -+- bfq-mq: development version of BFQ for blk-mq; this version contains -+ also all latest features and fixes not yet landed in mainline, plus many -+ safety checks -+- bfq-sq: BFQ for legacy blk; also this version contains latest features -+ and fixes, as well as safety checks -+ - In its default configuration, BFQ privileges latency over - throughput. So, when needed for achieving a lower latency, BFQ builds - schedules that may lead to a lower throughput. If your main or only -@@ -22,27 +34,42 @@ latency and throughput, or on how to maximize throughput. - - BFQ has a non-null overhead, which limits the maximum IOPS that a CPU - can process for a device scheduled with BFQ. To give an idea of the --limits on slow or average CPUs, here are, first, the limits of BFQ for --three different CPUs, on, respectively, an average laptop, an old --desktop, and a cheap embedded system, in case full hierarchical --support is enabled (i.e., CONFIG_BFQ_GROUP_IOSCHED is set), but -+limits on slow or average CPUs, here are, first, the limits of bfq-mq -+and bfq for three different CPUs, on, respectively, an average laptop, -+an old desktop, and a cheap embedded system, in case full hierarchical -+support is enabled (i.e., CONFIG_MQ_BFQ_GROUP_IOSCHED is set for -+bfq-mq, or CONFIG_BFQ_GROUP_IOSCHED is set for bfq), but - CONFIG_DEBUG_BLK_CGROUP is not set (Section 4-2): - - Intel i7-4850HQ: 400 KIOPS - - AMD A8-3850: 250 KIOPS - - ARM CortexTM-A53 Octa-core: 80 KIOPS - --If CONFIG_DEBUG_BLK_CGROUP is set (and of course full hierarchical --support is enabled), then the sustainable throughput with BFQ --decreases, because all blkio.bfq* statistics are created and updated --(Section 4-2). 
For BFQ, this leads to the following maximum --sustainable throughputs, on the same systems as above: -+As for bfq-sq, it cannot reach the above IOPS, because of the -+inherent, lower parallelism of legacy blk and of the components within -+it (including bfq-sq itself). In particular, results with -+CONFIG_DEBUG_BLK_CGROUP unset are rather fluctuating. The limits -+reported below for the case CONFIG_DEBUG_BLK_CGROUP set will however -+provide a lower bound to the limits of bfq-sq. -+ -+Turning back to bfq-mq and bfq, If CONFIG_DEBUG_BLK_CGROUP is set (and -+of course full hierarchical support is enabled), then the sustainable -+throughput with bfq-mq and bfq decreases, because all blkio.bfq* -+statistics are created and updated (Section 4-2). For bfq-mq and bfq, -+this leads to the following maximum sustainable throughputs, on the -+same systems as above: - - Intel i7-4850HQ: 310 KIOPS - - AMD A8-3850: 200 KIOPS - - ARM CortexTM-A53 Octa-core: 56 KIOPS - --BFQ works for multi-queue devices too. -+Finally, if CONFIG_DEBUG_BLK_CGROUP is set (and full hierarchical -+support is enabled), then bfq-sq exhibits the following limits: -+- Intel i7-4850HQ: 250 KIOPS -+- AMD A8-3850: 170 KIOPS -+- ARM CortexTM-A53 Octa-core: 45 KIOPS - --The table of contents follow. Impatients can just jump to Section 3. -+BFQ works for multi-queue devices too (bfq and bfq-mq instances). -+ -+The table of contents follows. Impatients can just jump to Section 3. - - CONTENTS - -@@ -509,25 +536,27 @@ To get proportional sharing of bandwidth with BFQ for a given device, - BFQ must of course be the active scheduler for that device. - - Within each group directory, the names of the files associated with --BFQ-specific cgroup parameters and stats begin with the "bfq." --prefix. So, with cgroups-v1 or cgroups-v2, the full prefix for --BFQ-specific files is "blkio.bfq." or "io.bfq." 
For example, the group --parameter to set the weight of a group with BFQ is blkio.bfq.weight -+BFQ-specific cgroup parameters and stats begin with the "bfq.", -+"bfq-sq." or "bfq-mq." prefix, depending on which instance of bfq you -+want to use. So, with cgroups-v1 or cgroups-v2, the full prefix for -+BFQ-specific files is "blkio.bfqX." or "io.bfqX.", where X can be "" -+(i.e., null string), "-sq" or "-mq". For example, the group parameter -+to set the weight of a group with the mainline BFQ is blkio.bfq.weight - or io.bfq.weight. - - As for cgroups-v1 (blkio controller), the exact set of stat files --created, and kept up-to-date by bfq, depends on whether --CONFIG_DEBUG_BLK_CGROUP is set. If it is set, then bfq creates all -+created, and kept up-to-date by bfq*, depends on whether -+CONFIG_DEBUG_BLK_CGROUP is set. If it is set, then bfq* creates all - the stat files documented in - Documentation/cgroup-v1/blkio-controller.txt. If, instead, --CONFIG_DEBUG_BLK_CGROUP is not set, then bfq creates only the files --blkio.bfq.io_service_bytes --blkio.bfq.io_service_bytes_recursive --blkio.bfq.io_serviced --blkio.bfq.io_serviced_recursive -+CONFIG_DEBUG_BLK_CGROUP is not set, then bfq* creates only the files -+blkio.bfq*.io_service_bytes -+blkio.bfq*.io_service_bytes_recursive -+blkio.bfq*.io_serviced -+blkio.bfq*.io_serviced_recursive - - The value of CONFIG_DEBUG_BLK_CGROUP greatly influences the maximum --throughput sustainable with bfq, because updating the blkio.bfq.* -+throughput sustainable with bfq*, because updating the blkio.bfq* - stats is rather costly, especially for some of the stats enabled by - CONFIG_DEBUG_BLK_CGROUP. - -@@ -536,7 +565,7 @@ Parameters to set - - For each group, there is only the following parameter to set. - --weight (namely blkio.bfq.weight or io.bfq-weight): the weight of the -+weight (namely blkio.bfqX.weight or io.bfqX.weight): the weight of the - group inside its parent. Available values: 1..10000 (default 100). 
The - linear mapping between ioprio and weights, described at the beginning - of the tunable section, is still valid, but all weights higher than -@@ -559,3 +588,55 @@ applications. Unset this tunable if you need/want to control weights. - Slightly extended version: - http://algogroup.unimore.it/people/paolo/disk_sched/bfq-v1-suite- - results.pdf -+ -+---------------------------------------------------------------------- -+ -+DETAILS ON THE DEV VERSIONS IN THIS TREE -+ -+The dev version of BFQ is available for both the legacy and the -+multi-queue block layers, as two additional I/O schedulers, named, -+respectively, bfq-sq-iosched and bfq-mq-iosched (the latter is -+available if also the changes introducing bfq-mq-iosched have been -+applied). In particular, this tree contains the dev version of bfq for -+Linux mainline 4.19.0, and has been obtained from the dev version for -+Linux 4.18.0. Rebasing from 4.18 to 4.19 involved two manual -+interventions. -+ -+First, some conflicts had to be resolved, as follows: -+ -+--------------------------------------------------------------- -+ -+diff --cc Makefile -+index 7727c1bf6fa5,69fa5c0310d8..c7cbdf0ad594 -+--- a/Makefile -++++ b/Makefile -+@@@ -1,9 -1,9 +1,9 @@@ -+ # SPDX-License-Identifier: GPL-2.0 -+ VERSION = 4 -+- PATCHLEVEL = 18 -++ PATCHLEVEL = 19 -+ SUBLEVEL = 0 -+ -EXTRAVERSION = -+ +EXTRAVERSION = -bfq-mq -+- NAME = Merciless Moray -++ NAME = "People's Front" -+ -+ # *DOCUMENTATION* -+ # To see a list of typical targets execute "make help" -+diff --cc include/linux/blkdev.h -+index 897c63322bd7,6980014357d4..8c4568ea6884 -+--- a/include/linux/blkdev.h -++++ b/include/linux/blkdev.h -+@@@ -56,7 -54,7 +54,7 @@@ struct blk_stat_callback -+ * Maximum number of blkcg policies allowed to be registered concurrently. -+ * Defined here to simplify include dependency. 
-+ */ -+--#define BLKCG_MAX_POLS 5 -+++#define BLKCG_MAX_POLS 7 -+ -+ typedef void (rq_end_io_fn)(struct request *, blk_status_t); -+ -+--------------------------------------------------------------- -+ -+Second, the following port commit had to be made: -+port commit "block: use ktime_get_ns() instead of sched_clock() for cfq and bfq" -diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig -index e32fc1f274d8..94cb28eb20ba 100644 ---- a/arch/x86/configs/x86_64_defconfig -+++ b/arch/x86/configs/x86_64_defconfig -@@ -12,6 +12,11 @@ CONFIG_NO_HZ=y - CONFIG_HIGH_RES_TIMERS=y - CONFIG_LOG_BUF_SHIFT=18 - CONFIG_CGROUPS=y -+CONFIG_BLK_CGROUP=y -+CONFIG_IOSCHED_BFQ_SQ=y -+CONFIG_BFQ_SQ_GROUP_IOSCHED=y -+CONFIG_MQ_IOSCHED_BFQ=y -+CONFIG_MQ_BFQ_GROUP_IOSCHED=y - CONFIG_CGROUP_FREEZER=y - CONFIG_CPUSETS=y - CONFIG_CGROUP_CPUACCT=y -diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched -index a4a8914bf7a4..299a6861fb90 100644 ---- a/block/Kconfig.iosched -+++ b/block/Kconfig.iosched -@@ -40,6 +40,26 @@ config CFQ_GROUP_IOSCHED - ---help--- - Enable group IO scheduling in CFQ. - -+config IOSCHED_BFQ_SQ -+ tristate "BFQ-SQ I/O scheduler" -+ default n -+ ---help--- -+ The BFQ-SQ I/O scheduler (for legacy blk: SQ stands for -+ SingleQueue) distributes bandwidth among all processes -+ according to their weights, regardless of the device -+ parameters and with any workload. It also guarantees a low -+ latency to interactive and soft real-time applications. -+ Details in Documentation/block/bfq-iosched.txt -+ -+config BFQ_SQ_GROUP_IOSCHED -+ bool "BFQ-SQ hierarchical scheduling support" -+ depends on IOSCHED_BFQ_SQ && BLK_CGROUP -+ default n -+ ---help--- -+ -+ Enable hierarchical scheduling in BFQ-SQ, using the blkio -+ (cgroups-v1) or io (cgroups-v2) controller. 
-+ - choice - - prompt "Default I/O scheduler" -@@ -54,6 +74,16 @@ choice - config DEFAULT_CFQ - bool "CFQ" if IOSCHED_CFQ=y - -+ config DEFAULT_BFQ_SQ -+ bool "BFQ-SQ" if IOSCHED_BFQ_SQ=y -+ help -+ Selects BFQ-SQ as the default I/O scheduler which will be -+ used by default for all block devices. -+ The BFQ-SQ I/O scheduler aims at distributing the bandwidth -+ as desired, independently of the disk parameters and with -+ any workload. It also tries to guarantee low latency to -+ interactive and soft real-time applications. -+ - config DEFAULT_NOOP - bool "No-op" - -@@ -63,8 +93,28 @@ config DEFAULT_IOSCHED - string - default "deadline" if DEFAULT_DEADLINE - default "cfq" if DEFAULT_CFQ -+ default "bfq-sq" if DEFAULT_BFQ_SQ - default "noop" if DEFAULT_NOOP - -+config MQ_IOSCHED_BFQ -+ tristate "BFQ-MQ I/O Scheduler" -+ default y -+ ---help--- -+ BFQ I/O scheduler for BLK-MQ. BFQ-MQ distributes bandwidth -+ among all processes according to their weights, regardless of -+ the device parameters and with any workload. It also -+ guarantees a low latency to interactive and soft real-time -+ applications. Details in Documentation/block/bfq-iosched.txt -+ -+config MQ_BFQ_GROUP_IOSCHED -+ bool "BFQ-MQ hierarchical scheduling support" -+ depends on MQ_IOSCHED_BFQ && BLK_CGROUP -+ default n -+ ---help--- -+ -+ Enable hierarchical scheduling in BFQ-MQ, using the blkio -+ (cgroups-v1) or io (cgroups-v2) controller. 
-+ - config MQ_IOSCHED_DEADLINE - tristate "MQ deadline I/O scheduler" - default y -diff --git a/block/Makefile b/block/Makefile -index 572b33f32c07..1dd6ffdc2fee 100644 ---- a/block/Makefile -+++ b/block/Makefile -@@ -25,6 +25,8 @@ obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o - obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o - bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o - obj-$(CONFIG_IOSCHED_BFQ) += bfq.o -+obj-$(CONFIG_IOSCHED_BFQ_SQ) += bfq-sq-iosched.o -+obj-$(CONFIG_MQ_IOSCHED_BFQ) += bfq-mq-iosched.o - - obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o - obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o -diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c -new file mode 100644 -index 000000000000..15459e50cd6a ---- /dev/null -+++ b/block/bfq-cgroup-included.c -@@ -0,0 +1,1359 @@ -+/* -+ * BFQ: CGROUPS support. -+ * -+ * Based on ideas and code from CFQ: -+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> -+ * -+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> -+ * Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2016 Paolo Valente <paolo.valente@linaro.org> -+ * -+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ -+ * file. 
-+ */ -+ -+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) -+ -+/* bfqg stats flags */ -+enum bfqg_stats_flags { -+ BFQG_stats_waiting = 0, -+ BFQG_stats_idling, -+ BFQG_stats_empty, -+}; -+ -+#define BFQG_FLAG_FNS(name) \ -+static void bfqg_stats_mark_##name(struct bfqg_stats *stats) \ -+{ \ -+ stats->flags |= (1 << BFQG_stats_##name); \ -+} \ -+static void bfqg_stats_clear_##name(struct bfqg_stats *stats) \ -+{ \ -+ stats->flags &= ~(1 << BFQG_stats_##name); \ -+} \ -+static int bfqg_stats_##name(struct bfqg_stats *stats) \ -+{ \ -+ return (stats->flags & (1 << BFQG_stats_##name)) != 0; \ -+} \ -+ -+BFQG_FLAG_FNS(waiting) -+BFQG_FLAG_FNS(idling) -+BFQG_FLAG_FNS(empty) -+#undef BFQG_FLAG_FNS -+ -+#ifdef BFQ_MQ -+/* This should be called with the scheduler lock held. */ -+#else -+/* This should be called with the queue_lock held. */ -+#endif -+static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) -+{ -+ u64 now; -+ -+ if (!bfqg_stats_waiting(stats)) -+ return; -+ -+ now = ktime_get_ns(); -+ if (now > stats->start_group_wait_time) -+ blkg_stat_add(&stats->group_wait_time, -+ now - stats->start_group_wait_time); -+ bfqg_stats_clear_waiting(stats); -+} -+ -+#ifdef BFQ_MQ -+/* This should be called with the scheduler lock held. */ -+#else -+/* This should be called with the queue_lock held. */ -+#endif -+static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, -+ struct bfq_group *curr_bfqg) -+{ -+ struct bfqg_stats *stats = &bfqg->stats; -+ -+ if (bfqg_stats_waiting(stats)) -+ return; -+ if (bfqg == curr_bfqg) -+ return; -+ stats->start_group_wait_time = ktime_get_ns(); -+ bfqg_stats_mark_waiting(stats); -+} -+ -+#ifdef BFQ_MQ -+/* This should be called with the scheduler lock held. */ -+#else -+/* This should be called with the queue_lock held. 
*/ -+#endif -+static void bfqg_stats_end_empty_time(struct bfqg_stats *stats) -+{ -+ u64 now; -+ -+ if (!bfqg_stats_empty(stats)) -+ return; -+ -+ now = ktime_get_ns(); -+ if (now > stats->start_empty_time) -+ blkg_stat_add(&stats->empty_time, -+ now - stats->start_empty_time); -+ bfqg_stats_clear_empty(stats); -+} -+ -+static void bfqg_stats_update_dequeue(struct bfq_group *bfqg) -+{ -+ blkg_stat_add(&bfqg->stats.dequeue, 1); -+} -+ -+static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) -+{ -+ struct bfqg_stats *stats = &bfqg->stats; -+ -+ if (blkg_rwstat_total(&stats->queued)) -+ return; -+ -+ /* -+ * group is already marked empty. This can happen if bfqq got new -+ * request in parent group and moved to this group while being added -+ * to service tree. Just ignore the event and move on. -+ */ -+ if (bfqg_stats_empty(stats)) -+ return; -+ -+ stats->start_empty_time = ktime_get_ns(); -+ bfqg_stats_mark_empty(stats); -+} -+ -+static void bfqg_stats_update_idle_time(struct bfq_group *bfqg) -+{ -+ struct bfqg_stats *stats = &bfqg->stats; -+ -+ if (bfqg_stats_idling(stats)) { -+ u64 now = ktime_get_ns(); -+ -+ if (now > stats->start_idle_time) -+ blkg_stat_add(&stats->idle_time, -+ now - stats->start_idle_time); -+ bfqg_stats_clear_idling(stats); -+ } -+} -+ -+static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) -+{ -+ struct bfqg_stats *stats = &bfqg->stats; -+ -+ stats->start_idle_time = ktime_get_ns(); -+ bfqg_stats_mark_idling(stats); -+} -+ -+static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) -+{ -+ struct bfqg_stats *stats = &bfqg->stats; -+ -+ blkg_stat_add(&stats->avg_queue_size_sum, -+ blkg_rwstat_total(&stats->queued)); -+ blkg_stat_add(&stats->avg_queue_size_samples, 1); -+ bfqg_stats_update_group_wait_time(stats); -+} -+ -+static void bfqg_stats_update_io_add(struct bfq_group *bfqg, -+ struct bfq_queue *bfqq, -+ unsigned int op) -+{ -+ blkg_rwstat_add(&bfqg->stats.queued, op, 1); -+ 
bfqg_stats_end_empty_time(&bfqg->stats); -+ if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue)) -+ bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); -+} -+ -+static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) -+{ -+ blkg_rwstat_add(&bfqg->stats.queued, op, -1); -+} -+ -+static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) -+{ -+ blkg_rwstat_add(&bfqg->stats.merged, op, 1); -+} -+ -+static void bfqg_stats_update_completion(struct bfq_group *bfqg, -+ u64 start_time_ns, -+ u64 io_start_time_ns, -+ unsigned int op) -+{ -+ struct bfqg_stats *stats = &bfqg->stats; -+ u64 now = ktime_get_ns(); -+ -+ if (now > io_start_time_ns) -+ blkg_rwstat_add(&stats->service_time, op, -+ now - io_start_time_ns); -+ if (io_start_time_ns > start_time_ns) -+ blkg_rwstat_add(&stats->wait_time, op, -+ io_start_time_ns - start_time_ns); -+} -+ -+#else /* BFQ_GROUP_IOSCHED_ENABLED && CONFIG_DEBUG_BLK_CGROUP */ -+ -+static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg, -+ struct bfq_queue *bfqq, unsigned int op) { } -+static inline void -+bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { } -+static inline void -+bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { } -+static inline void bfqg_stats_update_completion(struct bfq_group *bfqg, -+ u64 start_time_ns, -+ u64 io_start_time_ns, -+ unsigned int op) { } -+static inline void -+bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, -+ struct bfq_group *curr_bfqg) { } -+static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { } -+static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } -+static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { } -+static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { } -+static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } -+static inline void 
bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { } -+ -+#endif /* BFQ_GROUP_IOSCHED_ENABLED && CONFIG_DEBUG_BLK_CGROUP */ -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+static struct blkcg_policy blkcg_policy_bfq; -+ -+/* -+ * blk-cgroup policy-related handlers -+ * The following functions help in converting between blk-cgroup -+ * internal structures and BFQ-specific structures. -+ */ -+ -+static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd) -+{ -+ return pd ? container_of(pd, struct bfq_group, pd) : NULL; -+} -+ -+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg) -+{ -+ return pd_to_blkg(&bfqg->pd); -+} -+ -+static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg) -+{ -+ struct blkg_policy_data *pd = blkg_to_pd(blkg, &blkcg_policy_bfq); -+ -+ return pd_to_bfqg(pd); -+} -+ -+/* -+ * bfq_group handlers -+ * The following functions help in navigating the bfq_group hierarchy -+ * by allowing to find the parent of a bfq_group or the bfq_group -+ * associated to a bfq_queue. -+ */ -+ -+static struct bfq_group *bfqg_parent(struct bfq_group *bfqg) -+{ -+ struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent; -+ -+ return pblkg ? blkg_to_bfqg(pblkg) : NULL; -+} -+ -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *group_entity = bfqq->entity.parent; -+ -+ return group_entity ? container_of(group_entity, struct bfq_group, -+ entity) : -+ bfqq->bfqd->root_group; -+} -+ -+/* -+ * The following two functions handle get and put of a bfq_group by -+ * wrapping the related blk-cgroup hooks. 
-+ */ -+ -+static void bfqg_get(struct bfq_group *bfqg) -+{ -+#ifdef BFQ_MQ -+ bfqg->ref++; -+#else -+ blkg_get(bfqg_to_blkg(bfqg)); -+#endif -+} -+ -+static void bfqg_put(struct bfq_group *bfqg) -+{ -+#ifdef BFQ_MQ -+ bfqg->ref--; -+ -+ BUG_ON(bfqg->ref < 0); -+ if (bfqg->ref == 0) -+ kfree(bfqg); -+#else -+ blkg_put(bfqg_to_blkg(bfqg)); -+#endif -+} -+ -+#ifdef BFQ_MQ -+static void bfqg_and_blkg_get(struct bfq_group *bfqg) -+{ -+ /* see comments in bfq_bic_update_cgroup for why refcounting bfqg */ -+ bfqg_get(bfqg); -+ -+ blkg_get(bfqg_to_blkg(bfqg)); -+} -+ -+static void bfqg_and_blkg_put(struct bfq_group *bfqg) -+{ -+ blkg_put(bfqg_to_blkg(bfqg)); -+ -+ bfqg_put(bfqg); -+} -+#endif -+ -+/* @stats = 0 */ -+static void bfqg_stats_reset(struct bfqg_stats *stats) -+{ -+#ifdef CONFIG_DEBUG_BLK_CGROUP -+ /* queued stats shouldn't be cleared */ -+ blkg_rwstat_reset(&stats->merged); -+ blkg_rwstat_reset(&stats->service_time); -+ blkg_rwstat_reset(&stats->wait_time); -+ blkg_stat_reset(&stats->time); -+ blkg_stat_reset(&stats->avg_queue_size_sum); -+ blkg_stat_reset(&stats->avg_queue_size_samples); -+ blkg_stat_reset(&stats->dequeue); -+ blkg_stat_reset(&stats->group_wait_time); -+ blkg_stat_reset(&stats->idle_time); -+ blkg_stat_reset(&stats->empty_time); -+#endif -+} -+ -+/* @to += @from */ -+static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from) -+{ -+ if (!to || !from) -+ return; -+ -+#ifdef CONFIG_DEBUG_BLK_CGROUP -+ /* queued stats shouldn't be cleared */ -+ blkg_rwstat_add_aux(&to->merged, &from->merged); -+ blkg_rwstat_add_aux(&to->service_time, &from->service_time); -+ blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); -+ blkg_stat_add_aux(&from->time, &from->time); -+ blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); -+ blkg_stat_add_aux(&to->avg_queue_size_samples, -+ &from->avg_queue_size_samples); -+ blkg_stat_add_aux(&to->dequeue, &from->dequeue); -+ blkg_stat_add_aux(&to->group_wait_time, 
&from->group_wait_time); -+ blkg_stat_add_aux(&to->idle_time, &from->idle_time); -+ blkg_stat_add_aux(&to->empty_time, &from->empty_time); -+#endif -+} -+ -+/* -+ * Transfer @bfqg's stats to its parent's dead_stats so that the ancestors' -+ * recursive stats can still account for the amount used by this bfqg after -+ * it's gone. -+ */ -+static void bfqg_stats_xfer_dead(struct bfq_group *bfqg) -+{ -+ struct bfq_group *parent; -+ -+ if (!bfqg) /* root_group */ -+ return; -+ -+ parent = bfqg_parent(bfqg); -+ -+ lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock); -+ -+ if (unlikely(!parent)) -+ return; -+ -+ bfqg_stats_add_aux(&parent->stats, &bfqg->stats); -+ bfqg_stats_reset(&bfqg->stats); -+} -+ -+static void bfq_init_entity(struct bfq_entity *entity, -+ struct bfq_group *bfqg) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ entity->weight = entity->new_weight; -+ entity->orig_weight = entity->new_weight; -+ if (bfqq) { -+ bfqq->ioprio = bfqq->new_ioprio; -+ bfqq->ioprio_class = bfqq->new_ioprio_class; -+#ifdef BFQ_MQ -+ /* -+ * Make sure that bfqg and its associated blkg do not -+ * disappear before entity. 
-+ */ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "getting bfqg %p and blkg\n", -+ bfqg); -+ -+ bfqg_and_blkg_get(bfqg); -+#else -+ bfqg_get(bfqg); -+#endif -+ } -+ entity->parent = bfqg->my_entity; /* NULL for root group */ -+ entity->sched_data = &bfqg->sched_data; -+} -+ -+static void bfqg_stats_exit(struct bfqg_stats *stats) -+{ -+#ifdef CONFIG_DEBUG_BLK_CGROUP -+ blkg_rwstat_exit(&stats->merged); -+ blkg_rwstat_exit(&stats->service_time); -+ blkg_rwstat_exit(&stats->wait_time); -+ blkg_rwstat_exit(&stats->queued); -+ blkg_stat_exit(&stats->time); -+ blkg_stat_exit(&stats->avg_queue_size_sum); -+ blkg_stat_exit(&stats->avg_queue_size_samples); -+ blkg_stat_exit(&stats->dequeue); -+ blkg_stat_exit(&stats->group_wait_time); -+ blkg_stat_exit(&stats->idle_time); -+ blkg_stat_exit(&stats->empty_time); -+#endif -+} -+ -+static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) -+{ -+#ifdef CONFIG_DEBUG_BLK_CGROUP -+ if (blkg_rwstat_init(&stats->merged, gfp) || -+ blkg_rwstat_init(&stats->service_time, gfp) || -+ blkg_rwstat_init(&stats->wait_time, gfp) || -+ blkg_rwstat_init(&stats->queued, gfp) || -+ blkg_stat_init(&stats->time, gfp) || -+ blkg_stat_init(&stats->avg_queue_size_sum, gfp) || -+ blkg_stat_init(&stats->avg_queue_size_samples, gfp) || -+ blkg_stat_init(&stats->dequeue, gfp) || -+ blkg_stat_init(&stats->group_wait_time, gfp) || -+ blkg_stat_init(&stats->idle_time, gfp) || -+ blkg_stat_init(&stats->empty_time, gfp)) { -+ bfqg_stats_exit(stats); -+ return -ENOMEM; -+ } -+#endif -+ -+ return 0; -+} -+ -+static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd) -+{ -+ return cpd ? 
container_of(cpd, struct bfq_group_data, pd) : NULL; -+} -+ -+static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg) -+{ -+ return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq)); -+} -+ -+static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp) -+{ -+ struct bfq_group_data *bgd; -+ -+ bgd = kzalloc(sizeof(*bgd), gfp); -+ if (!bgd) -+ return NULL; -+ return &bgd->pd; -+} -+ -+static void bfq_cpd_init(struct blkcg_policy_data *cpd) -+{ -+ struct bfq_group_data *d = cpd_to_bfqgd(cpd); -+ -+ d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ? -+ CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL; -+} -+ -+static void bfq_cpd_free(struct blkcg_policy_data *cpd) -+{ -+ kfree(cpd_to_bfqgd(cpd)); -+} -+ -+static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) -+{ -+ struct bfq_group *bfqg; -+ -+ bfqg = kzalloc_node(sizeof(*bfqg), gfp, node); -+ if (!bfqg) -+ return NULL; -+ -+ if (bfqg_stats_init(&bfqg->stats, gfp)) { -+ kfree(bfqg); -+ return NULL; -+ } -+#ifdef BFQ_MQ -+ /* see comments in bfq_bic_update_cgroup for why refcounting */ -+ bfqg_get(bfqg); -+#endif -+ return &bfqg->pd; -+} -+ -+static void bfq_pd_init(struct blkg_policy_data *pd) -+{ -+ struct blkcg_gq *blkg; -+ struct bfq_group *bfqg; -+ struct bfq_data *bfqd; -+ struct bfq_entity *entity; -+ struct bfq_group_data *d; -+ -+ blkg = pd_to_blkg(pd); -+ BUG_ON(!blkg); -+ bfqg = blkg_to_bfqg(blkg); -+ bfqd = blkg->q->elevator->elevator_data; -+ BUG_ON(bfqg == bfqd->root_group); -+ entity = &bfqg->entity; -+ d = blkcg_to_bfqgd(blkg->blkcg); -+ -+ entity->orig_weight = entity->weight = entity->new_weight = d->weight; -+ entity->my_sched_data = &bfqg->sched_data; -+ bfqg->my_entity = entity; /* -+ * the root_group's will be set to NULL -+ * in bfq_init_queue() -+ */ -+ bfqg->bfqd = bfqd; -+ bfqg->active_entities = 0; -+ bfqg->rq_pos_tree = RB_ROOT; -+} -+ -+static void bfq_pd_free(struct blkg_policy_data *pd) -+{ -+ struct bfq_group *bfqg = pd_to_bfqg(pd); -+ -+ bfqg_stats_exit(&bfqg->stats); 
-+#ifdef BFQ_MQ -+ bfqg_put(bfqg); -+#else -+ kfree(bfqg); -+#endif -+} -+ -+static void bfq_pd_reset_stats(struct blkg_policy_data *pd) -+{ -+ struct bfq_group *bfqg = pd_to_bfqg(pd); -+ -+ bfqg_stats_reset(&bfqg->stats); -+} -+ -+static void bfq_group_set_parent(struct bfq_group *bfqg, -+ struct bfq_group *parent) -+{ -+ struct bfq_entity *entity; -+ -+ BUG_ON(!parent); -+ BUG_ON(!bfqg); -+ BUG_ON(bfqg == parent); -+ -+ entity = &bfqg->entity; -+ entity->parent = parent->my_entity; -+ entity->sched_data = &parent->sched_data; -+} -+ -+static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd, -+ struct blkcg *blkcg) -+{ -+ struct blkcg_gq *blkg; -+ -+ blkg = blkg_lookup(blkcg, bfqd->queue); -+ if (likely(blkg)) -+ return blkg_to_bfqg(blkg); -+ return NULL; -+} -+ -+static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, -+ struct blkcg *blkcg) -+{ -+ struct bfq_group *bfqg, *parent; -+ struct bfq_entity *entity; -+ -+ bfqg = bfq_lookup_bfqg(bfqd, blkcg); -+ -+ if (unlikely(!bfqg)) -+ return NULL; -+ -+ /* -+ * Update chain of bfq_groups as we might be handling a leaf group -+ * which, along with some of its relatives, has not been hooked yet -+ * to the private hierarchy of BFQ. -+ */ -+ entity = &bfqg->entity; -+ for_each_entity(entity) { -+ bfqg = container_of(entity, struct bfq_group, entity); -+ BUG_ON(!bfqg); -+ if (bfqg != bfqd->root_group) { -+ parent = bfqg_parent(bfqg); -+ if (!parent) -+ parent = bfqd->root_group; -+ BUG_ON(!parent); -+ bfq_group_set_parent(bfqg, parent); -+ } -+ } -+ -+ return bfqg; -+} -+ -+static void bfq_pos_tree_add_move(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq); -+ -+static void bfq_bfqq_expire(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ bool compensate, -+ enum bfqq_expiration reason); -+ -+/** -+ * bfq_bfqq_move - migrate @bfqq to @bfqg. -+ * @bfqd: queue descriptor. -+ * @bfqq: the queue to move. -+ * @bfqg: the group to move to. 
-+ * -+ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating -+ * it on the new one. Avoid putting the entity on the old group idle tree. -+ * -+#ifdef BFQ_MQ -+ * Must be called under the scheduler lock, to make sure that the blkg -+ * owning @bfqg does not disappear (see comments in -+ * bfq_bic_update_cgroup on guaranteeing the consistency of blkg -+ * objects). -+#else -+ * Must be called under the queue lock; the cgroup owning @bfqg must -+ * not disappear (by now this just means that we are called under -+ * rcu_read_lock()). -+#endif -+ */ -+static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct bfq_group *bfqg) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ BUG_ON(!bfq_bfqq_busy(bfqq) && !RB_EMPTY_ROOT(&bfqq->sort_list)); -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list) && !entity->on_st); -+ BUG_ON(bfq_bfqq_busy(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) -+ && entity->on_st && -+ bfqq != bfqd->in_service_queue); -+ BUG_ON(!bfq_bfqq_busy(bfqq) && bfqq == bfqd->in_service_queue); -+ -+ /* If bfqq is empty, then bfq_bfqq_expire also invokes -+ * bfq_del_bfqq_busy, thereby removing bfqq and its entity -+ * from data structures related to current group. Otherwise we -+ * need to remove bfqq explicitly with bfq_deactivate_bfqq, as -+ * we do below. 
-+ */ -+ if (bfqq == bfqd->in_service_queue) -+ bfq_bfqq_expire(bfqd, bfqd->in_service_queue, -+ false, BFQ_BFQQ_PREEMPTED); -+ -+ BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq) -+ && &bfq_entity_service_tree(entity)->idle != -+ entity->tree); -+ -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq)); -+ -+ if (bfq_bfqq_busy(bfqq)) -+ bfq_deactivate_bfqq(bfqd, bfqq, false, false); -+ else if (entity->on_st) { -+ BUG_ON(&bfq_entity_service_tree(entity)->idle != -+ entity->tree); -+ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); -+ } -+#ifdef BFQ_MQ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "putting blkg and bfqg %p\n", bfqg); -+ -+ bfqg_and_blkg_put(bfqq_group(bfqq)); -+#else -+ bfqg_put(bfqq_group(bfqq)); -+#endif -+ -+ entity->parent = bfqg->my_entity; -+ entity->sched_data = &bfqg->sched_data; -+#ifdef BFQ_MQ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "getting blkg and bfqg %p\n", bfqg); -+ -+ /* pin down bfqg and its associated blkg */ -+ bfqg_and_blkg_get(bfqg); -+#else -+ bfqg_get(bfqg); -+#endif -+ -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq)); -+ if (bfq_bfqq_busy(bfqq)) { -+ bfq_pos_tree_add_move(bfqd, bfqq); -+ bfq_activate_bfqq(bfqd, bfqq); -+ } -+ -+ if (!bfqd->in_service_queue && !bfqd->rq_in_driver) -+ bfq_schedule_dispatch(bfqd); -+ BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq) -+ && &bfq_entity_service_tree(entity)->idle != -+ entity->tree); -+} -+ -+/** -+ * __bfq_bic_change_cgroup - move @bic to @cgroup. -+ * @bfqd: the queue descriptor. -+ * @bic: the bic to move. -+ * @blkcg: the blk-cgroup to move to. -+ * -+#ifdef BFQ_MQ -+ * Move bic to blkcg, assuming that bfqd->lock is held; which makes -+ * sure that the reference to cgroup is valid across the call (see -+ * comments in bfq_bic_update_cgroup on this issue) -+#else -+ * Move bic to blkcg, assuming that bfqd->queue is locked; the caller -+ * has to make sure that the reference to cgroup is valid across the call. 
-+#endif -+ * -+ * NOTE: an alternative approach might have been to store the current -+ * cgroup in bfqq and getting a reference to it, reducing the lookup -+ * time here, at the price of slightly more complex code. -+ */ -+static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, -+ struct bfq_io_cq *bic, -+ struct blkcg *blkcg) -+{ -+ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0); -+ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1); -+ struct bfq_group *bfqg; -+ struct bfq_entity *entity; -+ -+ bfqg = bfq_find_set_group(bfqd, blkcg); -+ -+ if (unlikely(!bfqg)) -+ bfqg = bfqd->root_group; -+ -+ if (async_bfqq) { -+ entity = &async_bfqq->entity; -+ -+ if (entity->sched_data != &bfqg->sched_data) { -+ bic_set_bfqq(bic, NULL, 0); -+ bfq_log_bfqq(bfqd, async_bfqq, -+ "%p %d", -+ async_bfqq, -+ async_bfqq->ref); -+ bfq_put_queue(async_bfqq); -+ } -+ } -+ -+ if (sync_bfqq) { -+ entity = &sync_bfqq->entity; -+ if (entity->sched_data != &bfqg->sched_data) -+ bfq_bfqq_move(bfqd, sync_bfqq, bfqg); -+ } -+ -+ return bfqg; -+} -+ -+static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) -+{ -+ struct bfq_data *bfqd = bic_to_bfqd(bic); -+ struct bfq_group *bfqg = NULL; -+ uint64_t serial_nr; -+ -+ rcu_read_lock(); -+ serial_nr = bio_blkcg(bio)->css.serial_nr; -+ -+ /* -+ * Check whether blkcg has changed. The condition may trigger -+ * spuriously on a newly created cic but there's no harm. -+ */ -+ if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr)) -+ goto out; -+ -+ bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio)); -+#ifdef BFQ_MQ -+ /* -+ * Update blkg_path for bfq_log_* functions. We cache this -+ * path, and update it here, for the following -+ * reasons. Operations on blkg objects in blk-cgroup are -+ * protected with the request_queue lock, and not with the -+ * lock that protects the instances of this scheduler -+ * (bfqd->lock). This exposes BFQ to the following sort of -+ * race. 
-+ * -+ * The blkg_lookup performed in bfq_get_queue, protected -+ * through rcu, may happen to return the address of a copy of -+ * the original blkg. If this is the case, then the -+ * bfqg_and_blkg_get performed in bfq_get_queue, to pin down -+ * the blkg, is useless: it does not prevent blk-cgroup code -+ * from destroying both the original blkg and all objects -+ * directly or indirectly referred by the copy of the -+ * blkg. -+ * -+ * On the bright side, destroy operations on a blkg invoke, as -+ * a first step, hooks of the scheduler associated with the -+ * blkg. And these hooks are executed with bfqd->lock held for -+ * BFQ. As a consequence, for any blkg associated with the -+ * request queue this instance of the scheduler is attached -+ * to, we are guaranteed that such a blkg is not destroyed, and -+ * that all the pointers it contains are consistent, while we -+ * are holding bfqd->lock. A blkg_lookup performed with -+ * bfqd->lock held then returns a fully consistent blkg, which -+ * remains consistent until this lock is held. -+ * -+ * Thanks to the last fact, and to the fact that: (1) bfqg has -+ * been obtained through a blkg_lookup in the above -+ * assignment, and (2) bfqd->lock is being held, here we can -+ * safely use the policy data for the involved blkg (i.e., the -+ * field bfqg->pd) to get to the blkg associated with bfqg, -+ * and then we can safely use any field of blkg. After we -+ * release bfqd->lock, even just getting blkg through this -+ * bfqg may cause dangling references to be traversed, as -+ * bfqg->pd may not exist any more. -+ * -+ * In view of the above facts, here we cache, in the bfqg, any -+ * blkg data we may need for this bic, and for its associated -+ * bfq_queue. As of now, we need to cache only the path of the -+ * blkg, which is used in the bfq_log_* functions. -+ * -+ * Finally, note that bfqg itself needs to be protected from -+ * destruction on the blkg_free of the original blkg (which -+ * invokes bfq_pd_free). 
We use an additional private -+ * refcounter for bfqg, to let it disappear only after no -+ * bfq_queue refers to it any longer. -+ */ -+ blkg_path(bfqg_to_blkg(bfqg), bfqg->blkg_path, sizeof(bfqg->blkg_path)); -+#endif -+ bic->blkcg_serial_nr = serial_nr; -+out: -+ rcu_read_unlock(); -+} -+ -+/** -+ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st. -+ * @st: the service tree being flushed. -+ */ -+static void bfq_flush_idle_tree(struct bfq_service_tree *st) -+{ -+ struct bfq_entity *entity = st->first_idle; -+ -+ for (; entity ; entity = st->first_idle) -+ __bfq_deactivate_entity(entity, false); -+} -+ -+/** -+ * bfq_reparent_leaf_entity - move leaf entity to the root_group. -+ * @bfqd: the device data structure with the root group. -+ * @entity: the entity to move. -+ */ -+static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, -+ struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ BUG_ON(!bfqq); -+ bfq_bfqq_move(bfqd, bfqq, bfqd->root_group); -+} -+ -+/** -+ * bfq_reparent_active_entities - move to the root group all active -+ * entities. -+ * @bfqd: the device data structure with the root group. -+ * @bfqg: the group to move from. -+ * @st: the service tree with the entities. -+ */ -+static void bfq_reparent_active_entities(struct bfq_data *bfqd, -+ struct bfq_group *bfqg, -+ struct bfq_service_tree *st) -+{ -+ struct rb_root *active = &st->active; -+ struct bfq_entity *entity = NULL; -+ -+ if (!RB_EMPTY_ROOT(&st->active)) -+ entity = bfq_entity_of(rb_first(active)); -+ -+ for (; entity ; entity = bfq_entity_of(rb_first(active))) -+ bfq_reparent_leaf_entity(bfqd, entity); -+ -+ if (bfqg->sched_data.in_service_entity) -+ bfq_reparent_leaf_entity(bfqd, -+ bfqg->sched_data.in_service_entity); -+} -+ -+/** -+ * bfq_pd_offline - deactivate the entity associated with @pd, -+ * and reparent its children entities. -+ * @pd: descriptor of the policy going offline. 
-+ * -+ * blkio already grabs the queue_lock for us, so no need to use -+ * RCU-based magic -+ */ -+static void bfq_pd_offline(struct blkg_policy_data *pd) -+{ -+ struct bfq_service_tree *st; -+ struct bfq_group *bfqg; -+ struct bfq_data *bfqd; -+ struct bfq_entity *entity; -+#ifdef BFQ_MQ -+ unsigned long flags; -+#endif -+ int i; -+ -+ BUG_ON(!pd); -+ bfqg = pd_to_bfqg(pd); -+ BUG_ON(!bfqg); -+ bfqd = bfqg->bfqd; -+ BUG_ON(bfqd && !bfqd->root_group); -+ -+ entity = bfqg->my_entity; -+ -+#ifdef BFQ_MQ -+ spin_lock_irqsave(&bfqd->lock, flags); -+#endif -+ -+ if (!entity) /* root group */ -+ goto put_async_queues; -+ -+ /* -+ * Empty all service_trees belonging to this group before -+ * deactivating the group itself. -+ */ -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) { -+ BUG_ON(!bfqg->sched_data.service_tree); -+ st = bfqg->sched_data.service_tree + i; -+ /* -+ * The idle tree may still contain bfq_queues belonging -+ * to exited task because they never migrated to a different -+ * cgroup from the one being destroyed now. -+ */ -+ bfq_flush_idle_tree(st); -+ -+ /* -+ * It may happen that some queues are still active -+ * (busy) upon group destruction (if the corresponding -+ * processes have been forced to terminate). We move -+ * all the leaf entities corresponding to these queues -+ * to the root_group. -+ * Also, it may happen that the group has an entity -+ * in service, which is disconnected from the active -+ * tree: it must be moved, too. -+ * There is no need to put the sync queues, as the -+ * scheduler has taken no reference. 
-+ */ -+ bfq_reparent_active_entities(bfqd, bfqg, st); -+ BUG_ON(!RB_EMPTY_ROOT(&st->active)); -+ BUG_ON(!RB_EMPTY_ROOT(&st->idle)); -+ } -+ BUG_ON(bfqg->sched_data.next_in_service); -+ BUG_ON(bfqg->sched_data.in_service_entity); -+ -+ __bfq_deactivate_entity(entity, false); -+ -+put_async_queues: -+ bfq_put_async_queues(bfqd, bfqg); -+ -+#ifdef BFQ_MQ -+ spin_unlock_irqrestore(&bfqd->lock, flags); -+#endif -+ /* -+ * @blkg is going offline and will be ignored by -+ * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so -+ * that they don't get lost. If IOs complete after this point, the -+ * stats for them will be lost. Oh well... -+ */ -+ bfqg_stats_xfer_dead(bfqg); -+} -+ -+static void bfq_end_wr_async(struct bfq_data *bfqd) -+{ -+ struct blkcg_gq *blkg; -+ -+ list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) { -+ struct bfq_group *bfqg = blkg_to_bfqg(blkg); -+ BUG_ON(!bfqg); -+ -+ bfq_end_wr_async_queues(bfqd, bfqg); -+ } -+ bfq_end_wr_async_queues(bfqd, bfqd->root_group); -+} -+ -+static int bfq_io_show_weight(struct seq_file *sf, void *v) -+{ -+ struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); -+ struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); -+ unsigned int val = 0; -+ -+ if (bfqgd) -+ val = bfqgd->weight; -+ -+ seq_printf(sf, "%u\n", val); -+ -+ return 0; -+} -+ -+static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css, -+ struct cftype *cftype, -+ u64 val) -+{ -+ struct blkcg *blkcg = css_to_blkcg(css); -+ struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); -+ struct blkcg_gq *blkg; -+ int ret = -ERANGE; -+ -+ if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT) -+ return ret; -+ -+ ret = 0; -+ spin_lock_irq(&blkcg->lock); -+ bfqgd->weight = (unsigned short)val; -+ hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { -+ struct bfq_group *bfqg = blkg_to_bfqg(blkg); -+ -+ if (!bfqg) -+ continue; -+ /* -+ * Setting the prio_changed flag of the entity -+ * to 1 with new_weight == weight would re-set -+ * the value of 
the weight to its ioprio mapping. -+ * Set the flag only if necessary. -+ */ -+ if ((unsigned short)val != bfqg->entity.new_weight) { -+ bfqg->entity.new_weight = (unsigned short)val; -+ /* -+ * Make sure that the above new value has been -+ * stored in bfqg->entity.new_weight before -+ * setting the prio_changed flag. In fact, -+ * this flag may be read asynchronously (in -+ * critical sections protected by a different -+ * lock than that held here), and finding this -+ * flag set may cause the execution of the code -+ * for updating parameters whose value may -+ * depend also on bfqg->entity.new_weight (in -+ * __bfq_entity_update_weight_prio). -+ * This barrier makes sure that the new value -+ * of bfqg->entity.new_weight is correctly -+ * seen in that code. -+ */ -+ smp_wmb(); -+ bfqg->entity.prio_changed = 1; -+ } -+ } -+ spin_unlock_irq(&blkcg->lock); -+ -+ return ret; -+} -+ -+static ssize_t bfq_io_set_weight(struct kernfs_open_file *of, -+ char *buf, size_t nbytes, -+ loff_t off) -+{ -+ u64 weight; -+ /* First unsigned long found in the file is used */ -+ int ret = kstrtoull(strim(buf), 0, &weight); -+ -+ if (ret) -+ return ret; -+ -+ ret = bfq_io_set_weight_legacy(of_css(of), NULL, weight); -+ return ret ?: nbytes; -+} -+ -+#ifdef CONFIG_DEBUG_BLK_CGROUP -+static int bfqg_print_stat(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, -+ &blkcg_policy_bfq, seq_cft(sf)->private, false); -+ return 0; -+} -+ -+static int bfqg_print_rwstat(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, -+ &blkcg_policy_bfq, seq_cft(sf)->private, true); -+ return 0; -+} -+ -+static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, -+ struct blkg_policy_data *pd, int off) -+{ -+ u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd), -+ &blkcg_policy_bfq, off); -+ return __blkg_prfill_u64(sf, pd, sum); -+} -+ -+static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, -+ 
struct blkg_policy_data *pd, int off) -+{ -+ struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd), -+ &blkcg_policy_bfq, -+ off); -+ return __blkg_prfill_rwstat(sf, pd, &sum); -+} -+ -+static int bfqg_print_stat_recursive(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), -+ bfqg_prfill_stat_recursive, &blkcg_policy_bfq, -+ seq_cft(sf)->private, false); -+ return 0; -+} -+ -+static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), -+ bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq, -+ seq_cft(sf)->private, true); -+ return 0; -+} -+ -+static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd, -+ int off) -+{ -+ u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes); -+ -+ return __blkg_prfill_u64(sf, pd, sum >> 9); -+} -+ -+static int bfqg_print_stat_sectors(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), -+ bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false); -+ return 0; -+} -+ -+static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf, -+ struct blkg_policy_data *pd, int off) -+{ -+ struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL, -+ offsetof(struct blkcg_gq, stat_bytes)); -+ u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + -+ atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); -+ -+ return __blkg_prfill_u64(sf, pd, sum >> 9); -+} -+ -+static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), -+ bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0, -+ false); -+ return 0; -+} -+ -+ -+static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf, -+ struct blkg_policy_data *pd, int off) -+{ -+ struct bfq_group *bfqg = pd_to_bfqg(pd); -+ u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples); -+ u64 v = 0; -+ -+ if (samples) { -+ v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum); -+ v = 
div64_u64(v, samples); -+ } -+ __blkg_prfill_u64(sf, pd, v); -+ return 0; -+} -+ -+/* print avg_queue_size */ -+static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), -+ bfqg_prfill_avg_queue_size, &blkcg_policy_bfq, -+ 0, false); -+ return 0; -+} -+#endif /* CONFIG_DEBUG_BLK_CGROUP */ -+ -+static struct bfq_group * -+bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) -+{ -+ int ret; -+ -+ ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq); -+ if (ret) -+ return NULL; -+ -+ return blkg_to_bfqg(bfqd->queue->root_blkg); -+} -+ -+#ifdef BFQ_MQ -+#define BFQ_CGROUP_FNAME(param) "bfq-mq."#param -+#else -+#define BFQ_CGROUP_FNAME(param) "bfq-sq."#param -+#endif -+ -+static struct cftype bfq_blkcg_legacy_files[] = { -+ { -+ .name = BFQ_CGROUP_FNAME(weight), -+ .flags = CFTYPE_NOT_ON_ROOT, -+ .seq_show = bfq_io_show_weight, -+ .write_u64 = bfq_io_set_weight_legacy, -+ }, -+ -+ /* statistics, covers only the tasks in the bfqg */ -+ { -+ .name = BFQ_CGROUP_FNAME(io_service_bytes), -+ .private = (unsigned long)&blkcg_policy_bfq, -+ .seq_show = blkg_print_stat_bytes, -+ }, -+ { -+ .name = BFQ_CGROUP_FNAME(io_serviced), -+ .private = (unsigned long)&blkcg_policy_bfq, -+ .seq_show = blkg_print_stat_ios, -+ }, -+#ifdef CONFIG_DEBUG_BLK_CGROUP -+ { -+ .name = BFQ_CGROUP_FNAME(time), -+ .private = offsetof(struct bfq_group, stats.time), -+ .seq_show = bfqg_print_stat, -+ }, -+ { -+ .name = BFQ_CGROUP_FNAME(sectors), -+ .seq_show = bfqg_print_stat_sectors, -+ }, -+ { -+ .name = BFQ_CGROUP_FNAME(io_service_time), -+ .private = offsetof(struct bfq_group, stats.service_time), -+ .seq_show = bfqg_print_rwstat, -+ }, -+ { -+ .name = BFQ_CGROUP_FNAME(io_wait_time), -+ .private = offsetof(struct bfq_group, stats.wait_time), -+ .seq_show = bfqg_print_rwstat, -+ }, -+ { -+ .name = BFQ_CGROUP_FNAME(io_merged), -+ .private = offsetof(struct bfq_group, stats.merged), -+ .seq_show = bfqg_print_rwstat, -+ }, -+ 
{ -+ .name = BFQ_CGROUP_FNAME(io_queued), -+ .private = offsetof(struct bfq_group, stats.queued), -+ .seq_show = bfqg_print_rwstat, -+ }, -+#endif /* CONFIG_DEBUG_BLK_CGROUP */ -+ -+ /* the same statistics which cover the bfqg and its descendants */ -+ { -+ .name = BFQ_CGROUP_FNAME(io_service_bytes_recursive), -+ .private = (unsigned long)&blkcg_policy_bfq, -+ .seq_show = blkg_print_stat_bytes_recursive, -+ }, -+ { -+ .name = BFQ_CGROUP_FNAME(io_serviced_recursive), -+ .private = (unsigned long)&blkcg_policy_bfq, -+ .seq_show = blkg_print_stat_ios_recursive, -+ }, -+#ifdef CONFIG_DEBUG_BLK_CGROUP -+ { -+ .name = BFQ_CGROUP_FNAME(time_recursive), -+ .private = offsetof(struct bfq_group, stats.time), -+ .seq_show = bfqg_print_stat_recursive, -+ }, -+ { -+ .name = BFQ_CGROUP_FNAME(sectors_recursive), -+ .seq_show = bfqg_print_stat_sectors_recursive, -+ }, -+ { -+ .name = BFQ_CGROUP_FNAME(io_service_time_recursive), -+ .private = offsetof(struct bfq_group, stats.service_time), -+ .seq_show = bfqg_print_rwstat_recursive, -+ }, -+ { -+ .name = BFQ_CGROUP_FNAME(io_wait_time_recursive), -+ .private = offsetof(struct bfq_group, stats.wait_time), -+ .seq_show = bfqg_print_rwstat_recursive, -+ }, -+ { -+ .name = BFQ_CGROUP_FNAME(io_merged_recursive), -+ .private = offsetof(struct bfq_group, stats.merged), -+ .seq_show = bfqg_print_rwstat_recursive, -+ }, -+ { -+ .name = BFQ_CGROUP_FNAME(io_queued_recursive), -+ .private = offsetof(struct bfq_group, stats.queued), -+ .seq_show = bfqg_print_rwstat_recursive, -+ }, -+ { -+ .name = BFQ_CGROUP_FNAME(avg_queue_size), -+ .seq_show = bfqg_print_avg_queue_size, -+ }, -+ { -+ .name = BFQ_CGROUP_FNAME(group_wait_time), -+ .private = offsetof(struct bfq_group, stats.group_wait_time), -+ .seq_show = bfqg_print_stat, -+ }, -+ { -+ .name = BFQ_CGROUP_FNAME(idle_time), -+ .private = offsetof(struct bfq_group, stats.idle_time), -+ .seq_show = bfqg_print_stat, -+ }, -+ { -+ .name = BFQ_CGROUP_FNAME(empty_time), -+ .private = offsetof(struct
bfq_group, stats.empty_time), -+ .seq_show = bfqg_print_stat, -+ }, -+ { -+ .name = BFQ_CGROUP_FNAME(dequeue), -+ .private = offsetof(struct bfq_group, stats.dequeue), -+ .seq_show = bfqg_print_stat, -+ }, -+#endif /* CONFIG_DEBUG_BLK_CGROUP */ -+ { } /* terminate */ -+}; -+ -+static struct cftype bfq_blkg_files[] = { -+ { -+ .name = BFQ_CGROUP_FNAME(weight), -+ .flags = CFTYPE_NOT_ON_ROOT, -+ .seq_show = bfq_io_show_weight, -+ .write = bfq_io_set_weight, -+ }, -+ {} /* terminate */ -+}; -+ -+#undef BFQ_CGROUP_FNAME -+ -+#else /* BFQ_GROUP_IOSCHED_ENABLED */ -+ -+static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct bfq_group *bfqg) {} -+ -+static void bfq_init_entity(struct bfq_entity *entity, -+ struct bfq_group *bfqg) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ entity->weight = entity->new_weight; -+ entity->orig_weight = entity->new_weight; -+ if (bfqq) { -+ bfqq->ioprio = bfqq->new_ioprio; -+ bfqq->ioprio_class = bfqq->new_ioprio_class; -+ } -+ entity->sched_data = &bfqg->sched_data; -+} -+ -+static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) {} -+ -+static void bfq_end_wr_async(struct bfq_data *bfqd) -+{ -+ bfq_end_wr_async_queues(bfqd, bfqd->root_group); -+} -+ -+static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, -+ struct blkcg *blkcg) -+{ -+ return bfqd->root_group; -+} -+ -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) -+{ -+ return bfqq->bfqd->root_group; -+} -+ -+static struct bfq_group * -+bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) -+{ -+ struct bfq_group *bfqg; -+ int i; -+ -+ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); -+ if (!bfqg) -+ return NULL; -+ -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) -+ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; -+ -+ return bfqg; -+} -+#endif -diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c -new file mode 100644 -index 000000000000..fb7bb8f08b75 ---- /dev/null -+++ 
b/block/bfq-ioc.c -@@ -0,0 +1,36 @@ -+/* -+ * BFQ: I/O context handling. -+ * -+ * Based on ideas and code from CFQ: -+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> -+ * -+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> -+ * Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it> -+ */ -+ -+/** -+ * icq_to_bic - convert iocontext queue structure to bfq_io_cq. -+ * @icq: the iocontext queue. -+ */ -+static struct bfq_io_cq *icq_to_bic(struct io_cq *icq) -+{ -+ /* bic->icq is the first member, %NULL will convert to %NULL */ -+ return container_of(icq, struct bfq_io_cq, icq); -+} -+ -+/** -+ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd. -+ * @bfqd: the lookup key. -+ * @ioc: the io_context of the process doing I/O. -+ * -+ * Queue lock must be held. -+ */ -+static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, -+ struct io_context *ioc) -+{ -+ if (ioc) -+ return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue)); -+ return NULL; -+} -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -new file mode 100644 -index 000000000000..47a49d9e6512 ---- /dev/null -+++ b/block/bfq-mq-iosched.c -@@ -0,0 +1,6548 @@ -+/* -+ * Budget Fair Queueing (BFQ) I/O scheduler. -+ * -+ * Based on ideas and code from CFQ: -+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> -+ * -+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> -+ * Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org> -+ * -+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ -+ * file. -+ * -+ * BFQ is a proportional-share I/O scheduler, with some extra -+ * low-latency capabilities. BFQ also supports full hierarchical -+ * scheduling through cgroups. Next paragraphs provide an introduction -+ * on BFQ inner workings. 
Details on BFQ benefits and usage can be -+ * found in Documentation/block/bfq-iosched.txt. -+ * -+ * BFQ is a proportional-share storage-I/O scheduling algorithm based -+ * on the slice-by-slice service scheme of CFQ. But BFQ assigns -+ * budgets, measured in number of sectors, to processes instead of -+ * time slices. The device is not granted to the in-service process -+ * for a given time slice, but until it has exhausted its assigned -+ * budget. This change from the time to the service domain enables BFQ -+ * to distribute the device throughput among processes as desired, -+ * without any distortion due to throughput fluctuations, or to device -+ * internal queueing. BFQ uses an ad hoc internal scheduler, called -+ * B-WF2Q+, to schedule processes according to their budgets. More -+ * precisely, BFQ schedules queues associated with processes. Thanks to -+ * the accurate policy of B-WF2Q+, BFQ can afford to assign high -+ * budgets to I/O-bound processes issuing sequential requests (to -+ * boost the throughput), and yet guarantee a low latency to -+ * interactive and soft real-time applications. -+ * -+ * In particular, BFQ schedules I/O so as to achieve the latter goal-- -+ * low latency for interactive and soft real-time applications--if the -+ * low_latency parameter is set (default configuration). To this -+ * purpose, BFQ constantly tries to detect whether the I/O requests in -+ * a bfq_queue come from an interactive or a soft real-time -+ * application. For brevity, in these cases, the queue is said to be -+ * interactive or soft real-time. In both cases, BFQ privileges the -+ * service of the queue, over that of non-interactive and -+ * non-soft-real-time queues. This privileging is performed, mainly, -+ * by raising the weight of the queue. So, for brevity, we call just -+ * weight-raising periods the time periods during which a queue is -+ * privileged, because deemed interactive or soft real-time. 
-+ * -+ * The detection of soft real-time queues/applications is described in -+ * detail in the comments on the function -+ * bfq_bfqq_softrt_next_start. On the other hand, the detection of an -+ * interactive queue works as follows: a queue is deemed interactive -+ * if it is constantly non empty only for a limited time interval, -+ * after which it does become empty. The queue may be deemed -+ * interactive again (for a limited time), if it restarts being -+ * constantly non empty, provided that this happens only after the -+ * queue has remained empty for a given minimum idle time. -+ * -+ * By default, BFQ computes automatically the above maximum time -+ * interval, i.e., the time interval after which a constantly -+ * non-empty queue stops being deemed interactive. Since a queue is -+ * weight-raised while it is deemed interactive, this maximum time -+ * interval happens to coincide with the (maximum) duration of the -+ * weight-raising for interactive queues. -+ * -+ * NOTE: if the main or only goal, with a given device, is to achieve -+ * the maximum-possible throughput at all times, then do switch off -+ * all low-latency heuristics for that device, by setting low_latency -+ * to 0. -+ * -+ * BFQ is described in [1], where also a reference to the initial, -+ * more theoretical paper on BFQ can be found. The interested reader -+ * can find in the latter paper full details on the main algorithm, as -+ * well as formulas of the guarantees and formal proofs of all the -+ * properties. With respect to the version of BFQ presented in these -+ * papers, this implementation adds a few more heuristics, such as the -+ * one that guarantees a low latency to soft real-time applications, -+ * and a hierarchical extension based on H-WF2Q+. -+ * -+ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with -+ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N) -+ * complexity derives from the one introduced with EEVDF in [3]. 
-+ * -+ * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O -+ * Scheduler", Proceedings of the First Workshop on Mobile System -+ * Technologies (MST-2015), May 2015. -+ * http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf -+ * -+ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf -+ * -+ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing -+ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689, -+ * Oct 1997. -+ * -+ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz -+ * -+ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline -+ * First: A Flexible and Accurate Mechanism for Proportional Share -+ * Resource Allocation,'' technical report. -+ * -+ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf -+ */ -+#include <linux/module.h> -+#include <linux/slab.h> -+#include <linux/blkdev.h> -+#include <linux/cgroup.h> -+#include <linux/elevator.h> -+#include <linux/jiffies.h> -+#include <linux/rbtree.h> -+#include <linux/ioprio.h> -+#include <linux/sbitmap.h> -+#include <linux/delay.h> -+ -+#include "blk.h" -+#include "blk-mq.h" -+#include "blk-mq-tag.h" -+#include "blk-mq-sched.h" -+#include "bfq-mq.h" -+#include "blk-wbt.h" -+ -+/* Expiration time of sync (0) and async (1) requests, in ns. */ -+static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; -+ -+/* Maximum backwards seek, in KiB. */ -+static const int bfq_back_max = (16 * 1024); -+ -+/* Penalty of a backwards seek, in number of sectors. */ -+static const int bfq_back_penalty = 2; -+ -+/* Idling period duration, in ns. */ -+static u32 bfq_slice_idle = (NSEC_PER_SEC / 125); -+ -+/* Minimum number of assigned budgets for which stats are safe to compute. */ -+static const int bfq_stats_min_budgets = 194; -+ -+/* Default maximum budget values, in sectors and number of requests. 
*/ -+static const int bfq_default_max_budget = (16 * 1024); -+ -+/* -+ * When a sync request is dispatched, the queue that contains that -+ * request, and all the ancestor entities of that queue, are charged -+ * with the number of sectors of the request. In contrast, if the -+ * request is async, then the queue and its ancestor entities are -+ * charged with the number of sectors of the request, multiplied by -+ * the factor below. This throttles the bandwidth for async I/O, -+ * w.r.t. sync I/O, and it is done to counter the tendency of async -+ * writes to steal I/O throughput from reads. -+ * -+ * The current value of this parameter is the result of a tuning with -+ * several hardware and software configurations. We tried to find the -+ * lowest value for which writes do not cause noticeable problems to -+ * reads. In fact, the lower this parameter, the stabler I/O control, -+ * in the following respect. The lower this parameter is, the less -+ * the bandwidth enjoyed by a group decreases -+ * - when the group does writes, w.r.t. when it does reads; -+ * - when other groups do reads, w.r.t. when they do writes. -+ */ -+static const int bfq_async_charge_factor = 3; -+ -+/* Default timeout values, in jiffies, approximating CFQ defaults. */ -+static const int bfq_timeout = (HZ / 8); -+ -+/* -+ * Time limit for merging (see comments in bfq_setup_cooperator). Set -+ * to the slowest value that, in our tests, proved to be effective in -+ * removing false positives, while not causing true positives to miss -+ * queue merging. -+ * -+ * As can be deduced from the low time limit below, queue merging, if -+ * successful, happens at the very beginning of the I/O of the involved -+ * cooperating processes, as a consequence of the arrival of the very -+ * first requests from each cooperator. After that, there is very -+ * little chance to find cooperators.
-+ */ -+static const unsigned long bfq_merge_time_limit = HZ/10; -+ -+#define MAX_LENGTH_REASON_NAME 25 -+ -+static const char reason_name[][MAX_LENGTH_REASON_NAME] = {"TOO_IDLE", -+"BUDGET_TIMEOUT", "BUDGET_EXHAUSTED", "NO_MORE_REQUESTS", -+"PREEMPTED"}; -+ -+static struct kmem_cache *bfq_pool; -+ -+/* Below this threshold (in ns), we consider thinktime immediate. */ -+#define BFQ_MIN_TT (2 * NSEC_PER_MSEC) -+ -+/* hw_tag detection: parallel requests threshold and min samples needed. */ -+#define BFQ_HW_QUEUE_THRESHOLD 3 -+#define BFQ_HW_QUEUE_SAMPLES 32 -+ -+#define BFQQ_SEEK_THR (sector_t)(8 * 100) -+#define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32) -+#define BFQ_RQ_SEEKY(bfqd, last_pos, rq) \ -+ (get_sdist(last_pos, rq) > \ -+ BFQQ_SEEK_THR && \ -+ (!blk_queue_nonrot(bfqd->queue) || \ -+ blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT)) -+#define BFQQ_CLOSE_THR (sector_t)(8 * 1024) -+#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 19) -+ -+/* Min number of samples required to perform peak-rate update */ -+#define BFQ_RATE_MIN_SAMPLES 32 -+/* Min observation time interval required to perform a peak-rate update (ns) */ -+#define BFQ_RATE_MIN_INTERVAL (300*NSEC_PER_MSEC) -+/* Target observation time interval for a peak-rate update (ns) */ -+#define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC -+ -+/* -+ * Shift used for peak-rate fixed precision calculations. 
-+ * With -+ * - the current shift: 16 positions -+ * - the current type used to store rate: u32 -+ * - the current unit of measure for rate: [sectors/usec], or, more precisely, -+ * [(sectors/usec) / 2^BFQ_RATE_SHIFT] to take into account the shift, -+ * the range of rates that can be stored is -+ * [1 / 2^BFQ_RATE_SHIFT, 2^(32 - BFQ_RATE_SHIFT)] sectors/usec = -+ * [1 / 2^16, 2^16] sectors/usec = [15e-6, 65536] sectors/usec = -+ * [15, 65G] sectors/sec -+ * Which, assuming a sector size of 512B, corresponds to a range of -+ * [7.5K, 33T] B/sec -+ */ -+#define BFQ_RATE_SHIFT 16 -+ -+/* -+ * When configured for computing the duration of the weight-raising -+ * for interactive queues automatically (see the comments at the -+ * beginning of this file), BFQ does it using the following formula: -+ * duration = (ref_rate / r) * ref_wr_duration, -+ * where r is the peak rate of the device, and ref_rate and -+ * ref_wr_duration are two reference parameters. In particular, -+ * ref_rate is the peak rate of the reference storage device (see -+ * below), and ref_wr_duration is about the maximum time needed, with -+ * BFQ and while reading two files in parallel, to load typical large -+ * applications on the reference device (see the comments on -+ * max_service_from_wr below, for more details on how ref_wr_duration -+ * is obtained). In practice, the slower/faster the device at hand -+ * is, the more/less it takes to load applications with respect to the -+ * reference device. Accordingly, the longer/shorter BFQ grants -+ * weight raising to interactive applications. -+ * -+ * BFQ uses two different reference pairs (ref_rate, ref_wr_duration), -+ * depending on whether the device is rotational or non-rotational. -+ * -+ * In the following definitions, ref_rate[0] and ref_wr_duration[0] -+ * are the reference values for a rotational device, whereas -+ * ref_rate[1] and ref_wr_duration[1] are the reference values for a -+ * non-rotational device. 
The reference rates are not the actual peak -+ * rates of the devices used as a reference, but slightly lower -+ * values. The reason for using slightly lower values is that the -+ * peak-rate estimator tends to yield slightly lower values than the -+ * actual peak rate (it can yield the actual peak rate only if there -+ * is only one process doing I/O, and the process does sequential -+ * I/O). -+ * -+ * The reference peak rates are measured in sectors/usec, left-shifted -+ * by BFQ_RATE_SHIFT. -+ */ -+static int ref_rate[2] = {14000, 33000}; -+/* -+ * To improve readability, a conversion function is used to initialize -+ * the following array, which entails that the array can be -+ * initialized only in a function. -+ */ -+static int ref_wr_duration[2]; -+ -+/* -+ * BFQ uses the above-detailed, time-based weight-raising mechanism to -+ * privilege interactive tasks. This mechanism is vulnerable to the -+ * following false positives: I/O-bound applications that will go on -+ * doing I/O for much longer than the duration of weight -+ * raising. These applications have basically no benefit from being -+ * weight-raised at the beginning of their I/O. On the opposite end, -+ * while being weight-raised, these applications -+ * a) unjustly steal throughput to applications that may actually need -+ * low latency; -+ * b) make BFQ uselessly perform device idling; device idling results -+ * in loss of device throughput with most flash-based storage, and may -+ * increase latencies when used purposelessly. -+ * -+ * BFQ tries to reduce these problems, by adopting the following -+ * countermeasure. To introduce this countermeasure, we need first to -+ * finish explaining how the duration of weight-raising for -+ * interactive tasks is computed. 
-+ * -+ * For a bfq_queue deemed as interactive, the duration of weight -+ * raising is dynamically adjusted, as a function of the estimated -+ * peak rate of the device, so as to be equal to the time needed to -+ * execute the 'largest' interactive task we benchmarked so far. By -+ * largest task, we mean the task for which each involved process has -+ * to do more I/O than for any of the other tasks we benchmarked. This -+ * reference interactive task is the start-up of LibreOffice Writer, -+ * and in this task each process/bfq_queue needs to have at most ~110K -+ * sectors transferred. -+ * -+ * This last piece of information enables BFQ to reduce the actual -+ * duration of weight-raising for at least one class of I/O-bound -+ * applications: those doing sequential or quasi-sequential I/O. An -+ * example is file copy. In fact, once started, the main I/O-bound -+ * processes of these applications usually consume the above 110K -+ * sectors in much less time than the processes of an application that -+ * is starting, because these I/O-bound processes will greedily devote -+ * almost all their CPU cycles only to their target, -+ * throughput-friendly I/O operations. This is even more true if BFQ -+ * happens to be underestimating the device peak rate, and thus -+ * overestimating the duration of weight raising. But, according to -+ * our measurements, once they have transferred 110K sectors, these -+ * processes have no right to be weight-raised any longer. -+ * -+ * Based on the last consideration, BFQ ends weight-raising for a -+ * bfq_queue if the latter happens to have received an amount of -+ * service at least equal to the following constant. The constant is -+ * set to slightly more than 110K, to have a minimum safety margin. -+ * -+ * This early ending of weight-raising reduces the amount of time -+ * during which interactive false positives cause the two problems -+ * described at the beginning of these comments.
-+ */ -+static const unsigned long max_service_from_wr = 120000; -+ -+#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ -+ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) -+ -+#define RQ_BIC(rq) icq_to_bic((rq)->elv.priv[0]) -+#define RQ_BFQQ(rq) ((rq)->elv.priv[1]) -+ -+/** -+ * icq_to_bic - convert iocontext queue structure to bfq_io_cq. -+ * @icq: the iocontext queue. -+ */ -+static struct bfq_io_cq *icq_to_bic(struct io_cq *icq) -+{ -+ /* bic->icq is the first member, %NULL will convert to %NULL */ -+ return container_of(icq, struct bfq_io_cq, icq); -+} -+ -+/** -+ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd. -+ * @bfqd: the lookup key. -+ * @ioc: the io_context of the process doing I/O. -+ * @q: the request queue. -+ * -+ * The icq lookup is done with q->queue_lock held (IRQ state saved and -+ * restored). Returns %NULL when @ioc is %NULL. -+ */ -+static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, -+ struct io_context *ioc, -+ struct request_queue *q) -+{ -+ if (ioc) { -+ unsigned long flags; -+ struct bfq_io_cq *icq; -+ -+ spin_lock_irqsave(q->queue_lock, flags); -+ icq = icq_to_bic(ioc_lookup_icq(ioc, q)); -+ spin_unlock_irqrestore(q->queue_lock, flags); -+ -+ return icq; -+ } -+ -+ return NULL; -+} -+ -+/* -+ * Scheduler run of queue, if there are requests pending and no one in the -+ * driver that will restart queueing. -+ */ -+static void bfq_schedule_dispatch(struct bfq_data *bfqd) -+{ -+ if (bfqd->queued != 0) { -+ bfq_log(bfqd, ""); -+ blk_mq_run_hw_queues(bfqd->queue, true); -+ } -+} -+ -+#define BFQ_MQ -+#include "bfq-sched.c" -+#include "bfq-cgroup-included.c" -+ -+#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE) -+#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT) -+ -+#define bfq_sample_valid(samples) ((samples) > 80) -+ -+/* -+ * Lifted from AS - choose which of rq1 and rq2 is best served now. -+ * We choose the request that is closest to the head right now. Distance -+ * behind the head is penalized and only allowed to a certain extent.
-+ */ -+static struct request *bfq_choose_req(struct bfq_data *bfqd, -+ struct request *rq1, -+ struct request *rq2, -+ sector_t last) -+{ -+ sector_t s1, s2, d1 = 0, d2 = 0; -+ unsigned long back_max; -+#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */ -+#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */ -+ unsigned int wrap = 0; /* bit mask: requests behind the disk head? */ -+ -+ if (!rq1 || rq1 == rq2) -+ return rq2; -+ if (!rq2) -+ return rq1; -+ -+ if (rq_is_sync(rq1) && !rq_is_sync(rq2)) -+ return rq1; -+ else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) -+ return rq2; -+ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) -+ return rq1; -+ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META)) -+ return rq2; -+ -+ s1 = blk_rq_pos(rq1); -+ s2 = blk_rq_pos(rq2); -+ -+ /* -+ * By definition, 1KiB is 2 sectors. -+ */ -+ back_max = bfqd->bfq_back_max * 2; -+ -+ /* -+ * Strict one way elevator _except_ in the case where we allow -+ * short backward seeks which are biased as twice the cost of a -+ * similar forward seek. -+ */ -+ if (s1 >= last) -+ d1 = s1 - last; -+ else if (s1 + back_max >= last) -+ d1 = (last - s1) * bfqd->bfq_back_penalty; -+ else -+ wrap |= BFQ_RQ1_WRAP; -+ -+ if (s2 >= last) -+ d2 = s2 - last; -+ else if (s2 + back_max >= last) -+ d2 = (last - s2) * bfqd->bfq_back_penalty; -+ else -+ wrap |= BFQ_RQ2_WRAP; -+ -+ /* Found required data */ -+ -+ /* -+ * By doing switch() on the bit mask "wrap" we avoid having to -+ * check two variables for all permutations: --> faster! 
-+ */ -+ switch (wrap) { -+ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ -+ if (d1 < d2) -+ return rq1; -+ else if (d2 < d1) -+ return rq2; -+ -+ if (s1 >= s2) -+ return rq1; -+ else -+ return rq2; -+ -+ case BFQ_RQ2_WRAP: -+ return rq1; -+ case BFQ_RQ1_WRAP: -+ return rq2; -+ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */ -+ default: -+ /* -+ * Since both rqs are wrapped, -+ * start with the one that's further behind head -+ * (--> only *one* back seek required), -+ * since back seek takes more time than forward. -+ */ -+ if (s1 <= s2) -+ return rq1; -+ else -+ return rq2; -+ } -+} -+ -+/* -+ * Async I/O can easily starve sync I/O (both sync reads and sync -+ * writes), by consuming all tags. Similarly, storms of sync writes, -+ * such as those that sync(2) may trigger, can starve sync reads. -+ * Limit depths of async I/O and sync writes so as to counter both -+ * problems. -+ */ -+static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) -+{ -+ struct bfq_data *bfqd = data->q->elevator->elevator_data; -+ -+ if (op_is_sync(op) && !op_is_write(op)) -+ return; -+ -+ data->shallow_depth = -+ bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)]; -+ -+ bfq_log(bfqd, "wr_busy %d sync %d depth %u", -+ bfqd->wr_busy_queues, op_is_sync(op), -+ data->shallow_depth); -+} -+ -+static struct bfq_queue * -+bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, -+ sector_t sector, struct rb_node **ret_parent, -+ struct rb_node ***rb_link) -+{ -+ struct rb_node **p, *parent; -+ struct bfq_queue *bfqq = NULL; -+ -+ parent = NULL; -+ p = &root->rb_node; -+ while (*p) { -+ struct rb_node **n; -+ -+ parent = *p; -+ bfqq = rb_entry(parent, struct bfq_queue, pos_node); -+ -+ /* -+ * Sort strictly based on sector. Smallest to the left, -+ * largest to the right. 
-+ */ -+ if (sector > blk_rq_pos(bfqq->next_rq)) -+ n = &(*p)->rb_right; -+ else if (sector < blk_rq_pos(bfqq->next_rq)) -+ n = &(*p)->rb_left; -+ else -+ break; -+ p = n; -+ bfqq = NULL; -+ } -+ -+ *ret_parent = parent; -+ if (rb_link) -+ *rb_link = p; -+ -+ bfq_log(bfqd, "%llu: returning %d", -+ (unsigned long long) sector, -+ bfqq ? bfqq->pid : 0); -+ -+ return bfqq; -+} -+ -+static bool bfq_too_late_for_merging(struct bfq_queue *bfqq) -+{ -+ return bfqq->service_from_backlogged > 0 && -+ time_is_before_jiffies(bfqq->first_IO_time + -+ bfq_merge_time_limit); -+} -+ -+static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct rb_node **p, *parent; -+ struct bfq_queue *__bfqq; -+ -+ if (bfqq->pos_root) { -+ rb_erase(&bfqq->pos_node, bfqq->pos_root); -+ bfqq->pos_root = NULL; -+ } -+ -+ /* -+ * bfqq cannot be merged any longer (see comments in -+ * bfq_setup_cooperator): no point in adding bfqq into the -+ * position tree. -+ */ -+ if (bfq_too_late_for_merging(bfqq)) -+ return; -+ -+ if (bfq_class_idle(bfqq)) -+ return; -+ if (!bfqq->next_rq) -+ return; -+ -+ bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, -+ blk_rq_pos(bfqq->next_rq), &parent, &p); -+ if (!__bfqq) { -+ rb_link_node(&bfqq->pos_node, parent, p); -+ rb_insert_color(&bfqq->pos_node, bfqq->pos_root); -+ } else -+ bfqq->pos_root = NULL; -+} -+ -+/* -+ * The following function returns true if every queue must receive the -+ * same share of the throughput (this condition is used when deciding -+ * whether idling may be disabled, see the comments in the function -+ * bfq_better_to_idle()). 
-+ * -+ * Such a scenario occurs when: -+ * 1) all active queues have the same weight, -+ * 2) all active queues belong to the same I/O-priority class, -+ * 3) all active groups at the same level in the groups tree have the same -+ * weight, -+ * 4) all active groups at the same level in the groups tree have the same -+ * number of children. -+ * -+ * Unfortunately, keeping the necessary state for evaluating exactly -+ * the last two symmetry sub-conditions above would be quite complex -+ * and time consuming. Therefore this function evaluates, instead, -+ * only the following stronger three sub-conditions, for which it is -+ * much easier to maintain the needed state: -+ * 1) all active queues have the same weight, -+ * 2) all active queues belong to the same I/O-priority class, -+ * 3) there are no active groups. -+ * In particular, the last condition is always true if hierarchical -+ * support or the cgroups interface are not enabled, thus no state -+ * needs to be maintained in this case. -+ */ -+static bool bfq_symmetric_scenario(struct bfq_data *bfqd) -+{ -+ /* -+ * For queue weights to differ, queue_weights_tree must contain -+ * at least two nodes. 
-+ */ -+ bool varied_queue_weights = !RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && -+ (bfqd->queue_weights_tree.rb_node->rb_left || -+ bfqd->queue_weights_tree.rb_node->rb_right); -+ -+ bool multiple_classes_busy = -+ (bfqd->busy_queues[0] && bfqd->busy_queues[1]) || -+ (bfqd->busy_queues[0] && bfqd->busy_queues[2]) || -+ (bfqd->busy_queues[1] && bfqd->busy_queues[2]); -+ -+ bfq_log(bfqd, "varied_queue_weights %d mul_classes %d", -+ varied_queue_weights, multiple_classes_busy); -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ bfq_log(bfqd, "num_groups_with_pending_reqs %u", -+ bfqd->num_groups_with_pending_reqs); -+#endif -+ -+ return !(varied_queue_weights || multiple_classes_busy -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ || bfqd->num_groups_with_pending_reqs > 0 -+#endif -+ ); -+} -+ -+/* -+ * If the weight-counter tree passed as input contains no counter for -+ * the weight of the input queue, then add that counter; otherwise just -+ * increment the existing counter. -+ * -+ * Note that weight-counter trees contain few nodes in mostly symmetric -+ * scenarios. For example, if all queues have the same weight, then the -+ * weight-counter tree for the queues may contain at most one node. -+ * This holds even if low_latency is on, because weight-raised queues -+ * are not inserted in the tree. -+ * In most scenarios, the rate at which nodes are created/destroyed -+ * should be low too. 
-+ */ -+static void bfq_weights_tree_add(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ struct rb_root *root) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ struct rb_node **new = &(root->rb_node), *parent = NULL; -+ -+ /* -+ * Do not insert if the queue is already associated with a -+ * counter, which happens if: -+ * 1) a request arrival has caused the queue to become both -+ * non-weight-raised, and hence change its weight, and -+ * backlogged; in this respect, each of the two events -+ * causes an invocation of this function, -+ * 2) this is the invocation of this function caused by the -+ * second event. This second invocation is actually useless, -+ * and we handle this fact by exiting immediately. More -+ * efficient or clearer solutions might possibly be adopted. -+ */ -+ if (bfqq->weight_counter) -+ return; -+ -+ while (*new) { -+ struct bfq_weight_counter *__counter = container_of(*new, -+ struct bfq_weight_counter, -+ weights_node); -+ parent = *new; -+ -+ if (entity->weight == __counter->weight) { -+ bfqq->weight_counter = __counter; -+ goto inc_counter; -+ } -+ if (entity->weight < __counter->weight) -+ new = &((*new)->rb_left); -+ else -+ new = &((*new)->rb_right); -+ } -+ -+ bfqq->weight_counter = kzalloc(sizeof(struct bfq_weight_counter), -+ GFP_ATOMIC); -+ -+ /* -+ * In the unlucky event of an allocation failure, we just -+ * exit. This will cause the weight of queue to not be -+ * considered in bfq_symmetric_scenario, which, in its turn, -+ * causes the scenario to be deemed wrongly symmetric in case -+ * bfqq's weight would have been the only weight making the -+ * scenario asymmetric. On the bright side, no unbalance will -+ * however occur when bfqq becomes inactive again (the -+ * invocation of this function is triggered by an activation -+ * of queue). In fact, bfq_weights_tree_remove does nothing -+ * if !bfqq->weight_counter. 
-+ */ -+ if (unlikely(!bfqq->weight_counter)) -+ return; -+ -+ bfqq->weight_counter->weight = entity->weight; -+ rb_link_node(&bfqq->weight_counter->weights_node, parent, new); -+ rb_insert_color(&bfqq->weight_counter->weights_node, root); -+ -+inc_counter: -+ bfqq->weight_counter->num_active++; -+ bfqq->ref++; -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "refs %d weight %d symmetric %d", -+ bfqq->ref, -+ entity->weight, -+ bfq_symmetric_scenario(bfqd)); -+} -+ -+/* -+ * Decrement the weight counter associated with the queue, and, if the -+ * counter reaches 0, remove the counter from the tree. -+ * See the comments to the function bfq_weights_tree_add() for considerations -+ * about overhead. -+ */ -+static void __bfq_weights_tree_remove(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ struct rb_root *root) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ if (!bfqq->weight_counter) -+ return; -+ -+ BUG_ON(RB_EMPTY_ROOT(root)); -+ BUG_ON(bfqq->weight_counter->weight != entity->weight); -+ -+ BUG_ON(!bfqq->weight_counter->num_active); -+ bfqq->weight_counter->num_active--; -+ -+ if (bfqq->weight_counter->num_active > 0) -+ goto reset_entity_pointer; -+ -+ rb_erase(&bfqq->weight_counter->weights_node, root); -+ kfree(bfqq->weight_counter); -+ -+reset_entity_pointer: -+ bfqq->weight_counter = NULL; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "refs %d weight %d symmetric %d", -+ bfqq->ref, -+ entity->weight, -+ bfq_symmetric_scenario(bfqd)); -+ bfq_put_queue(bfqq); -+} -+ -+/* -+ * Invoke __bfq_weights_tree_remove on bfqq and decrement the number -+ * of active groups for each queue's inactive parent entity. -+ */ -+static void bfq_weights_tree_remove(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = bfqq->entity.parent; -+ -+ for_each_entity(entity) { -+ struct bfq_sched_data *sd = entity->my_sched_data; -+ -+ BUG_ON(entity->sched_data == NULL); /* -+ * It would mean -+ * that this is -+ * the root group. 
-+ */ -+ -+ if (sd->next_in_service || sd->in_service_entity) { -+ BUG_ON(!entity->in_groups_with_pending_reqs); -+ /* -+ * entity is still active, because either -+ * next_in_service or in_service_entity is not -+ * NULL (see the comments on the definition of -+ * next_in_service for details on why -+ * in_service_entity must be checked too). -+ * -+ * As a consequence, its parent entities are -+ * active as well, and thus this loop must -+ * stop here. -+ */ -+ break; -+ } -+ -+ BUG_ON(!bfqd->num_groups_with_pending_reqs && -+ entity->in_groups_with_pending_reqs); -+ /* -+ * The decrement of num_groups_with_pending_reqs is -+ * not performed immediately upon the deactivation of -+ * entity, but it is delayed to when it also happens -+ * that the first leaf descendant bfqq of entity gets -+ * all its pending requests completed. The following -+ * instructions perform this delayed decrement, if -+ * needed. See the comments on -+ * num_groups_with_pending_reqs for details. -+ */ -+ if (entity->in_groups_with_pending_reqs) { -+ entity->in_groups_with_pending_reqs = false; -+ bfqd->num_groups_with_pending_reqs--; -+ } -+ bfq_log_bfqq(bfqd, bfqq, "num_groups_with_pending_reqs %u", -+ bfqd->num_groups_with_pending_reqs); -+ } -+ -+ /* -+ * Next function is invoked last, because it causes bfqq to be -+ * freed if the following holds: bfqq is not in service and -+ * has no dispatched request. DO NOT use bfqq after the next -+ * function invocation. -+ */ -+ __bfq_weights_tree_remove(bfqd, bfqq, -+ &bfqd->queue_weights_tree); -+} -+ -+/* -+ * Return expired entry, or NULL to just start from scratch in rbtree. 
-+ */ -+static struct request *bfq_check_fifo(struct bfq_queue *bfqq, -+ struct request *last) -+{ -+ struct request *rq; -+ -+ if (bfq_bfqq_fifo_expire(bfqq)) -+ return NULL; -+ -+ bfq_mark_bfqq_fifo_expire(bfqq); -+ -+ rq = rq_entry_fifo(bfqq->fifo.next); -+ -+ if (rq == last || ktime_get_ns() < rq->fifo_time) -+ return NULL; -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "returned %p", rq); -+ BUG_ON(RB_EMPTY_NODE(&rq->rb_node)); -+ return rq; -+} -+ -+static struct request *bfq_find_next_rq(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ struct request *last) -+{ -+ struct rb_node *rbnext = rb_next(&last->rb_node); -+ struct rb_node *rbprev = rb_prev(&last->rb_node); -+ struct request *next, *prev = NULL; -+ -+ BUG_ON(list_empty(&bfqq->fifo)); -+ -+ /* Follow expired path, else get first next available. */ -+ next = bfq_check_fifo(bfqq, last); -+ if (next) { -+ BUG_ON(next == last); -+ return next; -+ } -+ -+ BUG_ON(RB_EMPTY_NODE(&last->rb_node)); -+ -+ if (rbprev) -+ prev = rb_entry_rq(rbprev); -+ -+ if (rbnext) -+ next = rb_entry_rq(rbnext); -+ else { -+ rbnext = rb_first(&bfqq->sort_list); -+ if (rbnext && rbnext != &last->rb_node) -+ next = rb_entry_rq(rbnext); -+ } -+ -+ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last)); -+} -+ -+/* see the definition of bfq_async_charge_factor for details */ -+static unsigned long bfq_serv_to_charge(struct request *rq, -+ struct bfq_queue *bfqq) -+{ -+ if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1 || -+ !bfq_symmetric_scenario(bfqq->bfqd)) -+ return blk_rq_sectors(rq); -+ -+ return blk_rq_sectors(rq) * bfq_async_charge_factor; -+} -+ -+/** -+ * bfq_updated_next_req - update the queue after a new next_rq selection. -+ * @bfqd: the device data the queue belongs to. -+ * @bfqq: the queue to update. -+ * -+ * If the first request of a queue changes we make sure that the queue -+ * has enough budget to serve at least its first request (if the -+ * request has grown). 
We do this because if the queue has not enough -+ * budget for its first request, it has to go through two dispatch -+ * rounds to actually get it dispatched. -+ */ -+static void bfq_updated_next_req(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity); -+ struct request *next_rq = bfqq->next_rq; -+ unsigned long new_budget; -+ -+ if (!next_rq) -+ return; -+ -+ if (bfqq == bfqd->in_service_queue) -+ /* -+ * In order not to break guarantees, budgets cannot be -+ * changed after an entity has been selected. -+ */ -+ return; -+ -+ BUG_ON(entity->tree != &st->active); -+ BUG_ON(entity == entity->sched_data->in_service_entity); -+ -+ new_budget = max_t(unsigned long, -+ max_t(unsigned long, bfqq->max_budget, -+ bfq_serv_to_charge(next_rq, bfqq)), -+ entity->service); -+ if (entity->budget != new_budget) { -+ entity->budget = new_budget; -+ bfq_log_bfqq(bfqd, bfqq, "new budget %lu", -+ new_budget); -+ bfq_requeue_bfqq(bfqd, bfqq, false); -+ } -+} -+ -+static unsigned int bfq_wr_duration(struct bfq_data *bfqd) -+{ -+ u64 dur; -+ -+ if (bfqd->bfq_wr_max_time > 0) -+ return bfqd->bfq_wr_max_time; -+ -+ dur = bfqd->rate_dur_prod; -+ do_div(dur, bfqd->peak_rate); -+ -+ /* -+ * Limit duration between 3 and 25 seconds. The upper limit -+ * has been conservatively set after the following worst case: -+ * on a QEMU/KVM virtual machine -+ * - running in a slow PC -+ * - with a virtual disk stacked on a slow low-end 5400rpm HDD -+ * - serving a heavy I/O workload, such as the sequential reading -+ * of several files -+ * mplayer took 23 seconds to start, if constantly weight-raised. 
-+ * -+ * As for higher values than that accomodating the above bad -+ * scenario, tests show that higher values would often yield -+ * the opposite of the desired result, i.e., would worsen -+ * responsiveness by allowing non-interactive applications to -+ * preserve weight raising for too long. -+ * -+ * On the other end, lower values than 3 seconds make it -+ * difficult for most interactive tasks to complete their jobs -+ * before weight-raising finishes. -+ */ -+ return clamp_val(dur, msecs_to_jiffies(3000), msecs_to_jiffies(25000)); -+} -+ -+/* switch back from soft real-time to interactive weight raising */ -+static void switch_back_to_interactive_wr(struct bfq_queue *bfqq, -+ struct bfq_data *bfqd) -+{ -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ bfqq->last_wr_start_finish = bfqq->wr_start_at_switch_to_srt; -+} -+ -+static void -+bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, -+ struct bfq_io_cq *bic, bool bfq_already_existing) -+{ -+ unsigned int old_wr_coeff; -+ bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq); -+ -+ if (bic->saved_has_short_ttime) -+ bfq_mark_bfqq_has_short_ttime(bfqq); -+ else -+ bfq_clear_bfqq_has_short_ttime(bfqq); -+ -+ if (bic->saved_IO_bound) -+ bfq_mark_bfqq_IO_bound(bfqq); -+ else -+ bfq_clear_bfqq_IO_bound(bfqq); -+ -+ if (unlikely(busy)) -+ old_wr_coeff = bfqq->wr_coeff; -+ -+ bfqq->ttime = bic->saved_ttime; -+ bfqq->wr_coeff = bic->saved_wr_coeff; -+ bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; -+ BUG_ON(time_is_after_jiffies(bfqq->wr_start_at_switch_to_srt)); -+ bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish; -+ bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time; -+ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "bic %p wr_coeff %d start_finish %lu max_time %lu", -+ bic, bfqq->wr_coeff, bfqq->last_wr_start_finish, -+ bfqq->wr_cur_max_time); -+ -+ if 
(bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || -+ time_is_before_jiffies(bfqq->last_wr_start_finish + -+ bfqq->wr_cur_max_time))) { -+ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && -+ !bfq_bfqq_in_large_burst(bfqq) && -+ time_is_after_eq_jiffies(bfqq->wr_start_at_switch_to_srt + -+ bfq_wr_duration(bfqd))) { -+ switch_back_to_interactive_wr(bfqq, bfqd); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "switching back to interactive"); -+ } else { -+ bfqq->wr_coeff = 1; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "switching off wr (%lu + %lu < %lu)", -+ bfqq->last_wr_start_finish, bfqq->wr_cur_max_time, -+ jiffies); -+ } -+ } -+ -+ /* make sure weight will be updated, however we got here */ -+ bfqq->entity.prio_changed = 1; -+ -+ if (likely(!busy)) -+ return; -+ -+ if (old_wr_coeff == 1 && bfqq->wr_coeff > 1) { -+ bfqd->wr_busy_queues++; -+ BUG_ON(bfqd->wr_busy_queues > bfq_tot_busy_queues(bfqd)); -+ } else if (old_wr_coeff > 1 && bfqq->wr_coeff == 1) { -+ bfqd->wr_busy_queues--; -+ BUG_ON(bfqd->wr_busy_queues < 0); -+ } -+} -+ -+static int bfqq_process_refs(struct bfq_queue *bfqq) -+{ -+ int process_refs, io_refs; -+ -+ lockdep_assert_held(&bfqq->bfqd->lock); -+ -+ io_refs = bfqq->allocated; -+ process_refs = bfqq->ref - io_refs - bfqq->entity.on_st - -+ (bfqq->weight_counter != NULL); -+ BUG_ON(process_refs < 0); -+ return process_refs; -+} -+ -+/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */ -+static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct bfq_queue *item; -+ struct hlist_node *n; -+ -+ hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node) -+ hlist_del_init(&item->burst_list_node); -+ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); -+ bfqd->burst_size = 1; -+ bfqd->burst_parent_entity = bfqq->entity.parent; -+} -+ -+/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */ -+static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue 
*bfqq) -+{ -+ /* Increment burst size to take into account also bfqq */ -+ bfqd->burst_size++; -+ -+ bfq_log_bfqq(bfqd, bfqq, "%d", bfqd->burst_size); -+ -+ BUG_ON(bfqd->burst_size > bfqd->bfq_large_burst_thresh); -+ -+ if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) { -+ struct bfq_queue *pos, *bfqq_item; -+ struct hlist_node *n; -+ -+ /* -+ * Enough queues have been activated shortly after each -+ * other to consider this burst as large. -+ */ -+ bfqd->large_burst = true; -+ bfq_log_bfqq(bfqd, bfqq, "large burst started"); -+ -+ /* -+ * We can now mark all queues in the burst list as -+ * belonging to a large burst. -+ */ -+ hlist_for_each_entry(bfqq_item, &bfqd->burst_list, -+ burst_list_node) { -+ bfq_mark_bfqq_in_large_burst(bfqq_item); -+ bfq_log_bfqq(bfqd, bfqq_item, "marked in large burst"); -+ } -+ bfq_mark_bfqq_in_large_burst(bfqq); -+ bfq_log_bfqq(bfqd, bfqq, "marked in large burst"); -+ -+ /* -+ * From now on, and until the current burst finishes, any -+ * new queue being activated shortly after the last queue -+ * was inserted in the burst can be immediately marked as -+ * belonging to a large burst. So the burst list is not -+ * needed any more. Remove it. -+ */ -+ hlist_for_each_entry_safe(pos, n, &bfqd->burst_list, -+ burst_list_node) -+ hlist_del_init(&pos->burst_list_node); -+ } else /* -+ * Burst not yet large: add bfqq to the burst list. Do -+ * not increment the ref counter for bfqq, because bfqq -+ * is removed from the burst list before freeing bfqq -+ * in put_queue. -+ */ -+ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); -+} -+ -+/* -+ * If many queues belonging to the same group happen to be created -+ * shortly after each other, then the processes associated with these -+ * queues have typically a common goal. In particular, bursts of queue -+ * creations are usually caused by services or applications that spawn -+ * many parallel threads/processes. Examples are systemd during boot, -+ * or git grep. 
To help these processes get their job done as soon as -+ * possible, it is usually better to not grant either weight-raising -+ * or device idling to their queues. -+ * -+ * In this comment we describe, firstly, the reasons why this fact -+ * holds, and, secondly, the next function, which implements the main -+ * steps needed to properly mark these queues so that they can then be -+ * treated in a different way. -+ * -+ * The above services or applications benefit mostly from a high -+ * throughput: the quicker the requests of the activated queues are -+ * cumulatively served, the sooner the target job of these queues gets -+ * completed. As a consequence, weight-raising any of these queues, -+ * which also implies idling the device for it, is almost always -+ * counterproductive. In most cases it just lowers throughput. -+ * -+ * On the other hand, a burst of queue creations may be caused also by -+ * the start of an application that does not consist of a lot of -+ * parallel I/O-bound threads. In fact, with a complex application, -+ * several short processes may need to be executed to start-up the -+ * application. In this respect, to start an application as quickly as -+ * possible, the best thing to do is in any case to privilege the I/O -+ * related to the application with respect to all other -+ * I/O. Therefore, the best strategy to start as quickly as possible -+ * an application that causes a burst of queue creations is to -+ * weight-raise all the queues created during the burst. This is the -+ * exact opposite of the best strategy for the other type of bursts. -+ * -+ * In the end, to take the best action for each of the two cases, the -+ * two types of bursts need to be distinguished. Fortunately, this -+ * seems relatively easy, by looking at the sizes of the bursts. In -+ * particular, we found a threshold such that only bursts with a -+ * larger size than that threshold are apparently caused by -+ * services or commands such as systemd or git grep. 
For brevity, -+ * hereafter we call just 'large' these bursts. BFQ *does not* -+ * weight-raise queues whose creation occurs in a large burst. In -+ * addition, for each of these queues BFQ performs or does not perform -+ * idling depending on which choice boosts the throughput more. The -+ * exact choice depends on the device and request pattern at -+ * hand. -+ * -+ * Unfortunately, false positives may occur while an interactive task -+ * is starting (e.g., an application is being started). The -+ * consequence is that the queues associated with the task do not -+ * enjoy weight raising as expected. Fortunately these false positives -+ * are very rare. They typically occur if some service happens to -+ * start doing I/O exactly when the interactive task starts. -+ * -+ * Turning back to the next function, it implements all the steps -+ * needed to detect the occurrence of a large burst and to properly -+ * mark all the queues belonging to it (so that they can then be -+ * treated in a different way). This goal is achieved by maintaining a -+ * "burst list" that holds, temporarily, the queues that belong to the -+ * burst in progress. The list is then used to mark these queues as -+ * belonging to a large burst if the burst does become large. The main -+ * steps are the following. -+ * -+ * . when the very first queue is created, the queue is inserted into the -+ * list (as it could be the first queue in a possible burst) -+ * -+ * . if the current burst has not yet become large, and a queue Q that does -+ * not yet belong to the burst is activated shortly after the last time -+ * at which a new queue entered the burst list, then the function appends -+ * Q to the burst list -+ * -+ * . if, as a consequence of the previous step, the burst size reaches -+ * the large-burst threshold, then -+ * -+ * . all the queues in the burst list are marked as belonging to a -+ * large burst -+ * -+ * . 
the burst list is deleted; in fact, the burst list already served -+ * its purpose (keeping temporarily track of the queues in a burst, -+ * so as to be able to mark them as belonging to a large burst in the -+ * previous sub-step), and now is not needed any more -+ * -+ * . the device enters a large-burst mode -+ * -+ * . if a queue Q that does not belong to the burst is created while -+ * the device is in large-burst mode and shortly after the last time -+ * at which a queue either entered the burst list or was marked as -+ * belonging to the current large burst, then Q is immediately marked -+ * as belonging to a large burst. -+ * -+ * . if a queue Q that does not belong to the burst is created a while -+ * later, i.e., not shortly after, than the last time at which a queue -+ * either entered the burst list or was marked as belonging to the -+ * current large burst, then the current burst is deemed as finished and: -+ * -+ * . the large-burst mode is reset if set -+ * -+ * . the burst list is emptied -+ * -+ * . Q is inserted in the burst list, as Q may be the first queue -+ * in a possible new burst (then the burst list contains just Q -+ * after this step). -+ */ -+static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ /* -+ * If bfqq is already in the burst list or is part of a large -+ * burst, or finally has just been split, then there is -+ * nothing else to do. -+ */ -+ if (!hlist_unhashed(&bfqq->burst_list_node) || -+ bfq_bfqq_in_large_burst(bfqq) || -+ time_is_after_eq_jiffies(bfqq->split_time + -+ msecs_to_jiffies(10))) -+ return; -+ -+ /* -+ * If bfqq's creation happens late enough, or bfqq belongs to -+ * a different group than the burst group, then the current -+ * burst is finished, and related data structures must be -+ * reset. -+ * -+ * In this respect, consider the special case where bfqq is -+ * the very first queue created after BFQ is selected for this -+ * device. 
In this case, last_ins_in_burst and -+ * burst_parent_entity are not yet significant when we get -+ * here. But it is easy to verify that, whether or not the -+ * following condition is true, bfqq will end up being -+ * inserted into the burst list. In particular the list will -+ * happen to contain only bfqq. And this is exactly what has -+ * to happen, as bfqq may be the first queue of the first -+ * burst. -+ */ -+ if (time_is_before_jiffies(bfqd->last_ins_in_burst + -+ bfqd->bfq_burst_interval) || -+ bfqq->entity.parent != bfqd->burst_parent_entity) { -+ bfqd->large_burst = false; -+ bfq_reset_burst_list(bfqd, bfqq); -+ bfq_log_bfqq(bfqd, bfqq, -+ "late activation or different group"); -+ goto end; -+ } -+ -+ /* -+ * If we get here, then bfqq is being activated shortly after the -+ * last queue. So, if the current burst is also large, we can mark -+ * bfqq as belonging to this large burst immediately. -+ */ -+ if (bfqd->large_burst) { -+ bfq_log_bfqq(bfqd, bfqq, "marked in burst"); -+ bfq_mark_bfqq_in_large_burst(bfqq); -+ goto end; -+ } -+ -+ /* -+ * If we get here, then a large-burst state has not yet been -+ * reached, but bfqq is being activated shortly after the last -+ * queue. Then we add bfqq to the burst. -+ */ -+ bfq_add_to_burst(bfqd, bfqq); -+end: -+ /* -+ * At this point, bfqq either has been added to the current -+ * burst or has caused the current burst to terminate and a -+ * possible new burst to start. In particular, in the second -+ * case, bfqq has become the first queue in the possible new -+ * burst. In both cases last_ins_in_burst needs to be moved -+ * forward. 
-+ */ -+ bfqd->last_ins_in_burst = jiffies; -+ -+} -+ -+static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ if (entity->budget < entity->service) { -+ pr_crit("budget %d service %d\n", -+ entity->budget, entity->service); -+ BUG(); -+ } -+ return entity->budget - entity->service; -+} -+ -+/* -+ * If enough samples have been computed, return the current max budget -+ * stored in bfqd, which is dynamically updated according to the -+ * estimated disk peak rate; otherwise return the default max budget -+ */ -+static int bfq_max_budget(struct bfq_data *bfqd) -+{ -+ if (bfqd->budgets_assigned < bfq_stats_min_budgets) -+ return bfq_default_max_budget; -+ else -+ return bfqd->bfq_max_budget; -+} -+ -+/* -+ * Return min budget, which is a fraction of the current or default -+ * max budget (trying with 1/32) -+ */ -+static int bfq_min_budget(struct bfq_data *bfqd) -+{ -+ if (bfqd->budgets_assigned < bfq_stats_min_budgets) -+ return bfq_default_max_budget / 32; -+ else -+ return bfqd->bfq_max_budget / 32; -+} -+ -+static void bfq_bfqq_expire(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ bool compensate, -+ enum bfqq_expiration reason); -+ -+/* -+ * The next function, invoked after the input queue bfqq switches from -+ * idle to busy, updates the budget of bfqq. The function also tells -+ * whether the in-service queue should be expired, by returning -+ * true. The purpose of expiring the in-service queue is to give bfqq -+ * the chance to possibly preempt the in-service queue, and the reason -+ * for preempting the in-service queue is to achieve one of the two -+ * goals below. -+ * -+ * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has -+ * expired because it has remained idle. 
In particular, bfqq may have -+ * expired for one of the following two reasons: -+ * -+ * - BFQ_BFQQ_NO_MORE_REQUEST bfqq did not enjoy any device idling and -+ * did not make it to issue a new request before its last request -+ * was served; -+ * -+ * - BFQ_BFQQ_TOO_IDLE bfqq did enjoy device idling, but did not issue -+ * a new request before the expiration of the idling-time. -+ * -+ * Even if bfqq has expired for one of the above reasons, the process -+ * associated with the queue may be however issuing requests greedily, -+ * and thus be sensitive to the bandwidth it receives (bfqq may have -+ * remained idle for other reasons: CPU high load, bfqq not enjoying -+ * idling, I/O throttling somewhere in the path from the process to -+ * the I/O scheduler, ...). But if, after every expiration for one of -+ * the above two reasons, bfqq has to wait for the service of at least -+ * one full budget of another queue before being served again, then -+ * bfqq is likely to get a much lower bandwidth or resource time than -+ * its reserved ones. To address this issue, two countermeasures need -+ * to be taken. -+ * -+ * First, the budget and the timestamps of bfqq need to be updated in -+ * a special way on bfqq reactivation: they need to be updated as if -+ * bfqq did not remain idle and did not expire. In fact, if they are -+ * computed as if bfqq expired and remained idle until reactivation, -+ * then the process associated with bfqq is treated as if, instead of -+ * being greedy, it stopped issuing requests when bfqq remained idle, -+ * and restarts issuing requests only on this reactivation. In other -+ * words, the scheduler does not help the process recover the "service -+ * hole" between bfqq expiration and reactivation. As a consequence, -+ * the process receives a lower bandwidth than its reserved one. 
In -+ * contrast, to recover this hole, the budget must be updated as if -+ * bfqq was not expired at all before this reactivation, i.e., it must -+ * be set to the value of the remaining budget when bfqq was -+ * expired. Along the same line, timestamps need to be assigned the -+ * value they had the last time bfqq was selected for service, i.e., -+ * before last expiration. Thus timestamps need to be back-shifted -+ * with respect to their normal computation (see [1] for more details -+ * on this tricky aspect). -+ * -+ * Secondly, to allow the process to recover the hole, the in-service -+ * queue must be expired too, to give bfqq the chance to preempt it -+ * immediately. In fact, if bfqq has to wait for a full budget of the -+ * in-service queue to be completed, then it may become impossible to -+ * let the process recover the hole, even if the back-shifted -+ * timestamps of bfqq are lower than those of the in-service queue. If -+ * this happens for most or all of the holes, then the process may not -+ * receive its reserved bandwidth. In this respect, it is worth noting -+ * that, being the service of outstanding requests unpreemptible, a -+ * little fraction of the holes may however be unrecoverable, thereby -+ * causing a little loss of bandwidth. -+ * -+ * The last important point is detecting whether bfqq does need this -+ * bandwidth recovery. In this respect, the next function deems the -+ * process associated with bfqq greedy, and thus allows it to recover -+ * the hole, if: 1) the process is waiting for the arrival of a new -+ * request (which implies that bfqq expired for one of the above two -+ * reasons), and 2) such a request has arrived soon. The first -+ * condition is controlled through the flag non_blocking_wait_rq, -+ * while the second through the flag arrived_in_time. If both -+ * conditions hold, then the function computes the budget in the -+ * above-described special way, and signals that the in-service queue -+ * should be expired. 
Timestamp back-shifting is done later in -+ * __bfq_activate_entity. -+ * -+ * 2. Reduce latency. Even if timestamps are not backshifted to let -+ * the process associated with bfqq recover a service hole, bfqq may -+ * however happen to have, after being (re)activated, a lower finish -+ * timestamp than the in-service queue. That is, the next budget of -+ * bfqq may have to be completed before the one of the in-service -+ * queue. If this is the case, then preempting the in-service queue -+ * allows this goal to be achieved, apart from the unpreemptible, -+ * outstanding requests mentioned above. -+ * -+ * Unfortunately, regardless of which of the above two goals one wants -+ * to achieve, service trees need first to be updated to know whether -+ * the in-service queue must be preempted. To have service trees -+ * correctly updated, the in-service queue must be expired and -+ * rescheduled, and bfqq must be scheduled too. This is one of the -+ * most costly operations (in future versions, the scheduling -+ * mechanism may be re-designed in such a way to make it possible to -+ * know whether preemption is needed without needing to update service -+ * trees). In addition, queue preemptions almost always cause random -+ * I/O, and thus loss of throughput. Because of these facts, the next -+ * function adopts the following simple scheme to avoid both costly -+ * operations and too frequent preemptions: it requests the expiration -+ * of the in-service queue (unconditionally) only for queues that need -+ * to recover a hole, or that either are weight-raised or deserve to -+ * be weight-raised. 
-+ */ -+static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ bool arrived_in_time, -+ bool wr_or_deserves_wr) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ /* -+ * In the next compound condition, we check also whether there -+ * is some budget left, because otherwise there is no point in -+ * trying to go on serving bfqq with this same budget: bfqq -+ * would be expired immediately after being selected for -+ * service. This would only cause useless overhead. -+ */ -+ if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time && -+ bfq_bfqq_budget_left(bfqq) > 0) { -+ /* -+ * We do not clear the flag non_blocking_wait_rq here, as -+ * the latter is used in bfq_activate_bfqq to signal -+ * that timestamps need to be back-shifted (and is -+ * cleared right after). -+ */ -+ -+ /* -+ * In next assignment we rely on that either -+ * entity->service or entity->budget are not updated -+ * on expiration if bfqq is empty (see -+ * __bfq_bfqq_recalc_budget). Thus both quantities -+ * remain unchanged after such an expiration, and the -+ * following statement therefore assigns to -+ * entity->budget the remaining budget on such an -+ * expiration. -+ */ -+ BUG_ON(bfqq->max_budget < 0); -+ entity->budget = min_t(unsigned long, -+ bfq_bfqq_budget_left(bfqq), -+ bfqq->max_budget); -+ -+ BUG_ON(entity->budget < 0); -+ -+ /* -+ * At this point, we have used entity->service to get -+ * the budget left (needed for updating -+ * entity->budget). Thus we finally can, and have to, -+ * reset entity->service. The latter must be reset -+ * because bfqq would otherwise be charged again for -+ * the service it has received during its previous -+ * service slot(s). -+ */ -+ entity->service = 0; -+ -+ return true; -+ } -+ -+ /* -+ * We can finally complete expiration, by setting service to 0. 
-+ */ -+ entity->service = 0; -+ BUG_ON(bfqq->max_budget < 0); -+ entity->budget = max_t(unsigned long, bfqq->max_budget, -+ bfq_serv_to_charge(bfqq->next_rq, bfqq)); -+ BUG_ON(entity->budget < 0); -+ -+ bfq_clear_bfqq_non_blocking_wait_rq(bfqq); -+ return wr_or_deserves_wr; -+} -+ -+/* -+ * Return the farthest past time instant according to jiffies -+ * macros. -+ */ -+static unsigned long bfq_smallest_from_now(void) -+{ -+ return jiffies - MAX_JIFFY_OFFSET; -+} -+ -+static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ unsigned int old_wr_coeff, -+ bool wr_or_deserves_wr, -+ bool interactive, -+ bool in_burst, -+ bool soft_rt) -+{ -+ if (old_wr_coeff == 1 && wr_or_deserves_wr) { -+ /* start a weight-raising period */ -+ if (interactive) { -+ bfqq->service_from_wr = 0; -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ } else { -+ /* -+ * No interactive weight raising in progress -+ * here: assign minus infinity to -+ * wr_start_at_switch_to_srt, to make sure -+ * that, at the end of the soft-real-time -+ * weight raising periods that is starting -+ * now, no interactive weight-raising period -+ * may be wrongly considered as still in -+ * progress (and thus actually started by -+ * mistake). -+ */ -+ bfqq->wr_start_at_switch_to_srt = -+ bfq_smallest_from_now(); -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff * -+ BFQ_SOFTRT_WEIGHT_FACTOR; -+ bfqq->wr_cur_max_time = -+ bfqd->bfq_wr_rt_max_time; -+ } -+ /* -+ * If needed, further reduce budget to make sure it is -+ * close to bfqq's backlog, so as to reduce the -+ * scheduling-error component due to a too large -+ * budget. Do not care about throughput consequences, -+ * but only about latency. Finally, do not assign a -+ * too small budget either, to avoid increasing -+ * latency by causing too frequent expirations. 
-+ */ -+ bfqq->entity.budget = min_t(unsigned long, -+ bfqq->entity.budget, -+ 2 * bfq_min_budget(bfqd)); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "wrais starting at %lu, rais_max_time %u", -+ jiffies, -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } else if (old_wr_coeff > 1) { -+ if (interactive) { /* update wr coeff and duration */ -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ } else if (in_burst) { -+ bfqq->wr_coeff = 1; -+ bfq_log_bfqq(bfqd, bfqq, -+ "wrais ending at %lu, rais_max_time %u", -+ jiffies, -+ jiffies_to_msecs(bfqq-> -+ wr_cur_max_time)); -+ } else if (soft_rt) { -+ /* -+ * The application is now or still meeting the -+ * requirements for being deemed soft rt. We -+ * can then correctly and safely (re)charge -+ * the weight-raising duration for the -+ * application with the weight-raising -+ * duration for soft rt applications. -+ * -+ * In particular, doing this recharge now, i.e., -+ * before the weight-raising period for the -+ * application finishes, reduces the probability -+ * of the following negative scenario: -+ * 1) the weight of a soft rt application is -+ * raised at startup (as for any newly -+ * created application), -+ * 2) since the application is not interactive, -+ * at a certain time weight-raising is -+ * stopped for the application, -+ * 3) at that time the application happens to -+ * still have pending requests, and hence -+ * is destined to not have a chance to be -+ * deemed soft rt before these requests are -+ * completed (see the comments to the -+ * function bfq_bfqq_softrt_next_start() -+ * for details on soft rt detection), -+ * 4) these pending requests experience a high -+ * latency because the application is not -+ * weight-raised while they are pending. 
-+ */ -+ if (bfqq->wr_cur_max_time != -+ bfqd->bfq_wr_rt_max_time) { -+ bfqq->wr_start_at_switch_to_srt = -+ bfqq->last_wr_start_finish; -+ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); -+ -+ bfqq->wr_cur_max_time = -+ bfqd->bfq_wr_rt_max_time; -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff * -+ BFQ_SOFTRT_WEIGHT_FACTOR; -+ bfq_log_bfqq(bfqd, bfqq, -+ "switching to soft_rt wr"); -+ } else -+ bfq_log_bfqq(bfqd, bfqq, -+ "moving forward soft_rt wr duration"); -+ bfqq->last_wr_start_finish = jiffies; -+ } -+ } -+} -+ -+static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ return bfqq->dispatched == 0 && -+ time_is_before_jiffies( -+ bfqq->budget_timeout + -+ bfqd->bfq_wr_min_idle_time); -+} -+ -+static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ int old_wr_coeff, -+ struct request *rq, -+ bool *interactive) -+{ -+ bool soft_rt, in_burst, wr_or_deserves_wr, -+ bfqq_wants_to_preempt, -+ idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq), -+ /* -+ * See the comments on -+ * bfq_bfqq_update_budg_for_activation for -+ * details on the usage of the next variable. 
-+ */ -+ arrived_in_time = ktime_get_ns() <= -+ bfqq->ttime.last_end_request + -+ bfqd->bfq_slice_idle * 3; -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "bfq_add_request non-busy: " -+ "jiffies %lu, in_time %d, idle_long %d busyw %d " -+ "wr_coeff %u", -+ jiffies, arrived_in_time, -+ idle_for_long_time, -+ bfq_bfqq_non_blocking_wait_rq(bfqq), -+ old_wr_coeff); -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ -+ BUG_ON(bfqq == bfqd->in_service_queue); -+ -+ /* -+ * bfqq deserves to be weight-raised if: -+ * - it is sync, -+ * - it does not belong to a large burst, -+ * - it has been idle for enough time or is soft real-time, -+ * - is linked to a bfq_io_cq (it is not shared in any sense) -+ */ -+ in_burst = bfq_bfqq_in_large_burst(bfqq); -+ soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && -+ !in_burst && -+ time_is_before_jiffies(bfqq->soft_rt_next_start) && -+ bfqq->dispatched == 0; -+ *interactive = -+ !in_burst && -+ idle_for_long_time; -+ wr_or_deserves_wr = bfqd->low_latency && -+ (bfqq->wr_coeff > 1 || -+ (bfq_bfqq_sync(bfqq) && -+ bfqq->bic && (*interactive || soft_rt))); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "bfq_add_request: " -+ "in_burst %d, " -+ "soft_rt %d (next %lu), inter %d, bic %p", -+ bfq_bfqq_in_large_burst(bfqq), soft_rt, -+ bfqq->soft_rt_next_start, -+ *interactive, -+ bfqq->bic); -+ -+ /* -+ * Using the last flag, update budget and check whether bfqq -+ * may want to preempt the in-service queue. -+ */ -+ bfqq_wants_to_preempt = -+ bfq_bfqq_update_budg_for_activation(bfqd, bfqq, -+ arrived_in_time, -+ wr_or_deserves_wr); -+ -+ /* -+ * If bfqq happened to be activated in a burst, but has been -+ * idle for much more than an interactive queue, then we -+ * assume that, in the overall I/O initiated in the burst, the -+ * I/O associated with bfqq is finished. So bfqq does not need -+ * to be treated as a queue belonging to a burst -+ * anymore. 
Accordingly, we reset bfqq's in_large_burst flag -+ * if set, and remove bfqq from the burst list if it's -+ * there. We do not decrement burst_size, because the fact -+ * that bfqq does not need to belong to the burst list any -+ * more does not invalidate the fact that bfqq was created in -+ * a burst. -+ */ -+ if (likely(!bfq_bfqq_just_created(bfqq)) && -+ idle_for_long_time && -+ time_is_before_jiffies( -+ bfqq->budget_timeout + -+ msecs_to_jiffies(10000))) { -+ hlist_del_init(&bfqq->burst_list_node); -+ bfq_clear_bfqq_in_large_burst(bfqq); -+ } -+ -+ bfq_clear_bfqq_just_created(bfqq); -+ -+ if (!bfq_bfqq_IO_bound(bfqq)) { -+ if (arrived_in_time) { -+ bfqq->requests_within_timer++; -+ if (bfqq->requests_within_timer >= -+ bfqd->bfq_requests_within_timer) -+ bfq_mark_bfqq_IO_bound(bfqq); -+ } else -+ bfqq->requests_within_timer = 0; -+ bfq_log_bfqq(bfqd, bfqq, "requests in time %d", -+ bfqq->requests_within_timer); -+ } -+ -+ if (bfqd->low_latency) { -+ if (unlikely(time_is_after_jiffies(bfqq->split_time))) -+ /* wraparound */ -+ bfqq->split_time = -+ jiffies - bfqd->bfq_wr_min_idle_time - 1; -+ -+ if (time_is_before_jiffies(bfqq->split_time + -+ bfqd->bfq_wr_min_idle_time)) { -+ bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq, -+ old_wr_coeff, -+ wr_or_deserves_wr, -+ *interactive, -+ in_burst, -+ soft_rt); -+ -+ if (old_wr_coeff != bfqq->wr_coeff) -+ bfqq->entity.prio_changed = 1; -+ } -+ } -+ -+ bfqq->last_idle_bklogged = jiffies; -+ bfqq->service_from_backlogged = 0; -+ bfq_clear_bfqq_softrt_update(bfqq); -+ -+ bfq_add_bfqq_busy(bfqd, bfqq); -+ -+ /* -+ * Expire in-service queue only if preemption may be needed -+ * for guarantees. In this respect, the function -+ * next_queue_may_preempt just checks a simple, necessary -+ * condition, and not a sufficient condition based on -+ * timestamps. 
In fact, for the latter condition to be -+ * evaluated, timestamps would need first to be updated, and -+ * this operation is quite costly (see the comments on the -+ * function bfq_bfqq_update_budg_for_activation). -+ */ -+ if (bfqd->in_service_queue && bfqq_wants_to_preempt && -+ bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff && -+ next_queue_may_preempt(bfqd)) { -+ struct bfq_queue *in_serv = -+ bfqd->in_service_queue; -+ BUG_ON(in_serv == bfqq); -+ -+ bfq_bfqq_expire(bfqd, bfqd->in_service_queue, -+ false, BFQ_BFQQ_PREEMPTED); -+ } -+} -+ -+static void bfq_add_request(struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ struct bfq_data *bfqd = bfqq->bfqd; -+ struct request *next_rq, *prev; -+ unsigned int old_wr_coeff = bfqq->wr_coeff; -+ bool interactive = false; -+ -+ bfq_log_bfqq(bfqd, bfqq, "size %u %s", -+ blk_rq_sectors(rq), rq_is_sync(rq) ? "S" : "A"); -+ -+ if (bfqq->wr_coeff > 1) /* queue is being weight-raised */ -+ bfq_log_bfqq(bfqd, bfqq, -+ "raising period dur %u/%u msec, old coeff %u, w %d(%d)", -+ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), -+ jiffies_to_msecs(bfqq->wr_cur_max_time), -+ bfqq->wr_coeff, -+ bfqq->entity.weight, bfqq->entity.orig_weight); -+ -+ bfqq->queued[rq_is_sync(rq)]++; -+ bfqd->queued++; -+ -+ BUG_ON(!RQ_BFQQ(rq)); -+ BUG_ON(RQ_BFQQ(rq) != bfqq); -+ WARN_ON(blk_rq_sectors(rq) == 0); -+ -+ elv_rb_add(&bfqq->sort_list, rq); -+ -+ /* -+ * Check if this request is a better next-to-serve candidate. -+ */ -+ prev = bfqq->next_rq; -+ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); -+ BUG_ON(!next_rq); -+ BUG_ON(!RQ_BFQQ(next_rq)); -+ BUG_ON(RQ_BFQQ(next_rq) != bfqq); -+ bfqq->next_rq = next_rq; -+ -+ /* -+ * Adjust priority tree position, if next_rq changes. -+ */ -+ if (prev != bfqq->next_rq) -+ bfq_pos_tree_add_move(bfqd, bfqq); -+ -+ if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... 
*/ -+ bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff, -+ rq, &interactive); -+ else { -+ if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) && -+ time_is_before_jiffies( -+ bfqq->last_wr_start_finish + -+ bfqd->bfq_wr_min_inter_arr_async)) { -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ -+ bfqd->wr_busy_queues++; -+ BUG_ON(bfqd->wr_busy_queues > bfq_tot_busy_queues(bfqd)); -+ bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqd, bfqq, -+ "non-idle wrais starting, " -+ "wr_max_time %u wr_busy %d", -+ jiffies_to_msecs(bfqq->wr_cur_max_time), -+ bfqd->wr_busy_queues); -+ } -+ if (prev != bfqq->next_rq) -+ bfq_updated_next_req(bfqd, bfqq); -+ } -+ -+ /* -+ * Assign jiffies to last_wr_start_finish in the following -+ * cases: -+ * -+ * . if bfqq is not going to be weight-raised, because, for -+ * non weight-raised queues, last_wr_start_finish stores the -+ * arrival time of the last request; as of now, this piece -+ * of information is used only for deciding whether to -+ * weight-raise async queues -+ * -+ * . if bfqq is not weight-raised, because, if bfqq is now -+ * switching to weight-raised, then last_wr_start_finish -+ * stores the time when weight-raising starts -+ * -+ * . if bfqq is interactive, because, regardless of whether -+ * bfqq is currently weight-raised, the weight-raising -+ * period must start or restart (this case is considered -+ * separately because it is not detected by the above -+ * conditions, if bfqq is already weight-raised) -+ * -+ * last_wr_start_finish has to be updated also if bfqq is soft -+ * real-time, because the weight-raising period is constantly -+ * restarted on idle-to-busy transitions for these queues, but -+ * this is already done in bfq_bfqq_handle_idle_busy_switch if -+ * needed. 
-+ */ -+ if (bfqd->low_latency && -+ (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive)) -+ bfqq->last_wr_start_finish = jiffies; -+} -+ -+static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, -+ struct bio *bio, -+ struct request_queue *q) -+{ -+ struct bfq_queue *bfqq = bfqd->bio_bfqq; -+ -+ BUG_ON(!bfqd->bio_bfqq_set); -+ -+ if (bfqq) -+ return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); -+ -+ return NULL; -+} -+ -+static sector_t get_sdist(sector_t last_pos, struct request *rq) -+{ -+ sector_t sdist = 0; -+ -+ if (last_pos) { -+ if (last_pos < blk_rq_pos(rq)) -+ sdist = blk_rq_pos(rq) - last_pos; -+ else -+ sdist = last_pos - blk_rq_pos(rq); -+ } -+ -+ return sdist; -+} -+ -+#if 0 /* Still not clear if we can do without next two functions */ -+static void bfq_activate_request(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ bfqd->rq_in_driver++; -+} -+ -+static void bfq_deactivate_request(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ -+ BUG_ON(bfqd->rq_in_driver == 0); -+ bfqd->rq_in_driver--; -+} -+#endif -+ -+static void bfq_remove_request(struct request_queue *q, -+ struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ struct bfq_data *bfqd = bfqq->bfqd; -+ const int sync = rq_is_sync(rq); -+ -+ BUG_ON(bfqq->entity.service > bfqq->entity.budget); -+ -+ if (bfqq->next_rq == rq) { -+ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); -+ if (bfqq->next_rq && !RQ_BFQQ(bfqq->next_rq)) { -+ pr_crit("no bfqq! for next rq %p bfqq %p\n", -+ bfqq->next_rq, bfqq); -+ } -+ -+ BUG_ON(bfqq->next_rq && !RQ_BFQQ(bfqq->next_rq)); -+ if (bfqq->next_rq && RQ_BFQQ(bfqq->next_rq) != bfqq) { -+ pr_crit( -+ "wrong bfqq! 
for next rq %p, rq_bfqq %p bfqq %p\n", -+ bfqq->next_rq, RQ_BFQQ(bfqq->next_rq), bfqq); -+ } -+ BUG_ON(bfqq->next_rq && RQ_BFQQ(bfqq->next_rq) != bfqq); -+ -+ bfq_updated_next_req(bfqd, bfqq); -+ } -+ -+ if (rq->queuelist.prev != &rq->queuelist) -+ list_del_init(&rq->queuelist); -+ BUG_ON(bfqq->queued[sync] == 0); -+ bfqq->queued[sync]--; -+ bfqd->queued--; -+ elv_rb_del(&bfqq->sort_list, rq); -+ -+ elv_rqhash_del(q, rq); -+ if (q->last_merge == rq) -+ q->last_merge = NULL; -+ -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { -+ bfqq->next_rq = NULL; -+ -+ BUG_ON(bfqq->entity.budget < 0); -+ -+ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) { -+ BUG_ON(bfqq->ref < 2); /* referred by rq and on tree */ -+ bfq_del_bfqq_busy(bfqd, bfqq, false); -+ /* -+ * bfqq emptied. In normal operation, when -+ * bfqq is empty, bfqq->entity.service and -+ * bfqq->entity.budget must contain, -+ * respectively, the service received and the -+ * budget used last time bfqq emptied. These -+ * facts do not hold in this case, as at least -+ * this last removal occurred while bfqq is -+ * not in service. To avoid inconsistencies, -+ * reset both bfqq->entity.service and -+ * bfqq->entity.budget, if bfqq has still a -+ * process that may issue I/O requests to it. -+ */ -+ bfqq->entity.budget = bfqq->entity.service = 0; -+ } -+ -+ /* -+ * Remove queue from request-position tree as it is empty. 
-+ */ -+ if (bfqq->pos_root) { -+ rb_erase(&bfqq->pos_node, bfqq->pos_root); -+ bfqq->pos_root = NULL; -+ } -+ } else { -+ BUG_ON(!bfqq->next_rq); -+ bfq_pos_tree_add_move(bfqd, bfqq); -+ } -+ -+ if (rq->cmd_flags & REQ_META) { -+ BUG_ON(bfqq->meta_pending == 0); -+ bfqq->meta_pending--; -+ } -+} -+ -+static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) -+{ -+ struct request_queue *q = hctx->queue; -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct request *free = NULL; -+ /* -+ * bfq_bic_lookup grabs the queue_lock: invoke it now and -+ * store its return value for later use, to avoid nesting -+ * queue_lock inside the bfqd->lock. We assume that the bic -+ * returned by bfq_bic_lookup does not go away before -+ * bfqd->lock is taken. -+ */ -+ struct bfq_io_cq *bic = bfq_bic_lookup(bfqd, current->io_context, q); -+ bool ret; -+ -+ spin_lock_irq(&bfqd->lock); -+ -+ if (bic) -+ bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); -+ else -+ bfqd->bio_bfqq = NULL; -+ bfqd->bio_bic = bic; -+ /* Set next flag just for testing purposes */ -+ bfqd->bio_bfqq_set = true; -+ -+ ret = blk_mq_sched_try_merge(q, bio, &free); -+ -+ /* -+ * XXX Not yet freeing without lock held, to avoid an -+ * inconsistency with respect to the lock-protected invocation -+ * of blk_mq_sched_try_insert_merge in bfq_bio_merge. Waiting -+ * for clarifications from Jens. 
-+ */ -+ if (free) -+ blk_mq_free_request(free); -+ bfqd->bio_bfqq_set = false; -+ spin_unlock_irq(&bfqd->lock); -+ -+ return ret; -+} -+ -+static int bfq_request_merge(struct request_queue *q, struct request **req, -+ struct bio *bio) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct request *__rq; -+ -+ __rq = bfq_find_rq_fmerge(bfqd, bio, q); -+ if (__rq && elv_bio_merge_ok(__rq, bio)) { -+ *req = __rq; -+ bfq_log(bfqd, "req %p", __rq); -+ -+ return ELEVATOR_FRONT_MERGE; -+ } -+ -+ return ELEVATOR_NO_MERGE; -+} -+ -+static struct bfq_queue *bfq_init_rq(struct request *rq); -+ -+static void bfq_request_merged(struct request_queue *q, struct request *req, -+ enum elv_merge type) -+{ -+ BUG_ON(req->rq_flags & RQF_DISP_LIST); -+ -+ if (type == ELEVATOR_FRONT_MERGE && -+ rb_prev(&req->rb_node) && -+ blk_rq_pos(req) < -+ blk_rq_pos(container_of(rb_prev(&req->rb_node), -+ struct request, rb_node))) { -+ struct bfq_queue *bfqq = bfq_init_rq(req); -+ struct bfq_data *bfqd = bfqq->bfqd; -+ struct request *prev, *next_rq; -+ -+ /* Reposition request in its sort_list */ -+ elv_rb_del(&bfqq->sort_list, req); -+ BUG_ON(!RQ_BFQQ(req)); -+ BUG_ON(RQ_BFQQ(req) != bfqq); -+ elv_rb_add(&bfqq->sort_list, req); -+ -+ /* Choose next request to be served for bfqq */ -+ prev = bfqq->next_rq; -+ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, -+ bfqd->last_position); -+ BUG_ON(!next_rq); -+ -+ bfqq->next_rq = next_rq; -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "req %p prev %p next_rq %p bfqq %p", -+ req, prev, next_rq, bfqq); -+ -+ /* -+ * If next_rq changes, update both the queue's budget to -+ * fit the new request and the queue's position in its -+ * rq_pos_tree. -+ */ -+ if (prev != bfqq->next_rq) { -+ bfq_updated_next_req(bfqd, bfqq); -+ bfq_pos_tree_add_move(bfqd, bfqq); -+ } -+ } -+} -+ -+/* -+ * This function is called to notify the scheduler that the requests -+ * rq and 'next' have been merged, with 'next' going away. 
BFQ -+ * exploits this hook to address the following issue: if 'next' has a -+ * fifo_time lower that rq, then the fifo_time of rq must be set to -+ * the value of 'next', to not forget the greater age of 'next'. -+ * -+ * NOTE: in this function we assume that rq is in a bfq_queue, basing -+ * on that rq is picked from the hash table q->elevator->hash, which, -+ * in its turn, is filled only with I/O requests present in -+ * bfq_queues, while BFQ is in use for the request queue q. In fact, -+ * the function that fills this hash table (elv_rqhash_add) is called -+ * only by bfq_insert_request. -+ */ -+static void bfq_requests_merged(struct request_queue *q, struct request *rq, -+ struct request *next) -+{ -+ struct bfq_queue *bfqq = bfq_init_rq(rq), -+ *next_bfqq = bfq_init_rq(next); -+ -+ BUG_ON(!RQ_BFQQ(rq)); -+ BUG_ON(!RQ_BFQQ(next)); /* this does not imply next is in a bfqq */ -+ BUG_ON(rq->rq_flags & RQF_DISP_LIST); -+ BUG_ON(next->rq_flags & RQF_DISP_LIST); -+ -+ lockdep_assert_held(&bfqq->bfqd->lock); -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "rq %p next %p bfqq %p next_bfqq %p", -+ rq, next, bfqq, next_bfqq); -+ -+ /* -+ * If next and rq belong to the same bfq_queue and next is older -+ * than rq, then reposition rq in the fifo (by substituting next -+ * with rq). Otherwise, if next and rq belong to different -+ * bfq_queues, never reposition rq: in fact, we would have to -+ * reposition it with respect to next's position in its own fifo, -+ * which would most certainly be too expensive with respect to -+ * the benefits. 
-+ */ -+ if (bfqq == next_bfqq && -+ !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && -+ next->fifo_time < rq->fifo_time) { -+ list_del_init(&rq->queuelist); -+ list_replace_init(&next->queuelist, &rq->queuelist); -+ rq->fifo_time = next->fifo_time; -+ } -+ -+ if (bfqq->next_rq == next) -+ bfqq->next_rq = rq; -+ -+ bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); -+} -+ -+/* Must be called with bfqq != NULL */ -+static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) -+{ -+ BUG_ON(!bfqq); -+ -+ if (bfq_bfqq_busy(bfqq)) { -+ bfqq->bfqd->wr_busy_queues--; -+ BUG_ON(bfqq->bfqd->wr_busy_queues < 0); -+ } -+ bfqq->wr_coeff = 1; -+ bfqq->wr_cur_max_time = 0; -+ bfqq->last_wr_start_finish = jiffies; -+ /* -+ * Trigger a weight change on the next invocation of -+ * __bfq_entity_update_weight_prio. -+ */ -+ bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "wrais ending at %lu, rais_max_time %u", -+ bfqq->last_wr_start_finish, -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "wr_busy %d", -+ bfqq->bfqd->wr_busy_queues); -+} -+ -+static void bfq_end_wr_async_queues(struct bfq_data *bfqd, -+ struct bfq_group *bfqg) -+{ -+ int i, j; -+ -+ for (i = 0; i < 2; i++) -+ for (j = 0; j < IOPRIO_BE_NR; j++) -+ if (bfqg->async_bfqq[i][j]) -+ bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); -+ if (bfqg->async_idle_bfqq) -+ bfq_bfqq_end_wr(bfqg->async_idle_bfqq); -+} -+ -+static void bfq_end_wr(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq; -+ -+ spin_lock_irq(&bfqd->lock); -+ -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) -+ bfq_bfqq_end_wr(bfqq); -+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) -+ bfq_bfqq_end_wr(bfqq); -+ bfq_end_wr_async(bfqd); -+ -+ spin_unlock_irq(&bfqd->lock); -+} -+ -+static sector_t bfq_io_struct_pos(void *io_struct, bool request) -+{ -+ if (request) -+ return blk_rq_pos(io_struct); -+ else -+ return ((struct bio *)io_struct)->bi_iter.bi_sector; -+} -+ -+static int 
bfq_rq_close_to_sector(void *io_struct, bool request, -+ sector_t sector) -+{ -+ return abs(bfq_io_struct_pos(io_struct, request) - sector) <= -+ BFQQ_CLOSE_THR; -+} -+ -+static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ sector_t sector) -+{ -+ struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; -+ struct rb_node *parent, *node; -+ struct bfq_queue *__bfqq; -+ -+ if (RB_EMPTY_ROOT(root)) -+ return NULL; -+ -+ /* -+ * First, if we find a request starting at the end of the last -+ * request, choose it. -+ */ -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL); -+ if (__bfqq) -+ return __bfqq; -+ -+ /* -+ * If the exact sector wasn't found, the parent of the NULL leaf -+ * will contain the closest sector (rq_pos_tree sorted by -+ * next_request position). -+ */ -+ __bfqq = rb_entry(parent, struct bfq_queue, pos_node); -+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) -+ return __bfqq; -+ -+ if (blk_rq_pos(__bfqq->next_rq) < sector) -+ node = rb_next(&__bfqq->pos_node); -+ else -+ node = rb_prev(&__bfqq->pos_node); -+ if (!node) -+ return NULL; -+ -+ __bfqq = rb_entry(node, struct bfq_queue, pos_node); -+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) -+ return __bfqq; -+ -+ return NULL; -+} -+ -+static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd, -+ struct bfq_queue *cur_bfqq, -+ sector_t sector) -+{ -+ struct bfq_queue *bfqq; -+ -+ /* -+ * We shall notice if some of the queues are cooperating, -+ * e.g., working closely on the same area of the device. In -+ * that case, we can group them together and: 1) don't waste -+ * time idling, and 2) serve the union of their requests in -+ * the best possible order for throughput. 
-+ */ -+ bfqq = bfqq_find_close(bfqd, cur_bfqq, sector); -+ if (!bfqq || bfqq == cur_bfqq) -+ return NULL; -+ -+ return bfqq; -+} -+ -+static struct bfq_queue * -+bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) -+{ -+ int process_refs, new_process_refs; -+ struct bfq_queue *__bfqq; -+ -+ /* -+ * If there are no process references on the new_bfqq, then it is -+ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain -+ * may have dropped their last reference (not just their last process -+ * reference). -+ */ -+ if (!bfqq_process_refs(new_bfqq)) -+ return NULL; -+ -+ /* Avoid a circular list and skip interim queue merges. */ -+ while ((__bfqq = new_bfqq->new_bfqq)) { -+ if (__bfqq == bfqq) -+ return NULL; -+ new_bfqq = __bfqq; -+ } -+ -+ process_refs = bfqq_process_refs(bfqq); -+ new_process_refs = bfqq_process_refs(new_bfqq); -+ /* -+ * If the process for the bfqq has gone away, there is no -+ * sense in merging the queues. -+ */ -+ if (process_refs == 0 || new_process_refs == 0) -+ return NULL; -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", -+ new_bfqq->pid); -+ -+ /* -+ * Merging is just a redirection: the requests of the process -+ * owning one of the two queues are redirected to the other queue. -+ * The latter queue, in its turn, is set as shared if this is the -+ * first time that the requests of some process are redirected to -+ * it. -+ * -+ * We redirect bfqq to new_bfqq and not the opposite, because -+ * we are in the context of the process owning bfqq, thus we -+ * have the io_cq of this process. So we can immediately -+ * configure this io_cq to redirect the requests of the -+ * process to new_bfqq. In contrast, the io_cq of new_bfqq is -+ * not available any more (new_bfqq->bic == NULL). 
-+ * -+ * Anyway, even in case new_bfqq coincides with the in-service -+ * queue, redirecting requests the in-service queue is the -+ * best option, as we feed the in-service queue with new -+ * requests close to the last request served and, by doing so, -+ * are likely to increase the throughput. -+ */ -+ bfqq->new_bfqq = new_bfqq; -+ new_bfqq->ref += process_refs; -+ return new_bfqq; -+} -+ -+static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, -+ struct bfq_queue *new_bfqq) -+{ -+ if (bfq_too_late_for_merging(new_bfqq)) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "too late for bfq%d to be merged", -+ new_bfqq->pid); -+ return false; -+ } -+ -+ if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) || -+ (bfqq->ioprio_class != new_bfqq->ioprio_class)) -+ return false; -+ -+ /* -+ * If either of the queues has already been detected as seeky, -+ * then merging it with the other queue is unlikely to lead to -+ * sequential I/O. -+ */ -+ if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq)) -+ return false; -+ -+ /* -+ * Interleaved I/O is known to be done by (some) applications -+ * only for reads, so it does not make sense to merge async -+ * queues. -+ */ -+ if (!bfq_bfqq_sync(bfqq) || !bfq_bfqq_sync(new_bfqq)) -+ return false; -+ -+ return true; -+} -+ -+/* -+ * Attempt to schedule a merge of bfqq with the currently in-service -+ * queue or with a close queue among the scheduled queues. Return -+ * NULL if no merge was scheduled, a pointer to the shared bfq_queue -+ * structure otherwise. -+ * -+ * The OOM queue is not allowed to participate to cooperation: in fact, since -+ * the requests temporarily redirected to the OOM queue could be redirected -+ * again to dedicated queues at any time, the state needed to correctly -+ * handle merging with the OOM queue would be quite complex and expensive -+ * to maintain. Besides, in such a critical condition as an out of memory, -+ * the benefits of queue merging may be little relevant, or even negligible. 
-+ * -+ * WARNING: queue merging may impair fairness among non-weight raised -+ * queues, for at least two reasons: 1) the original weight of a -+ * merged queue may change during the merged state, 2) even being the -+ * weight the same, a merged queue may be bloated with many more -+ * requests than the ones produced by its originally-associated -+ * process. -+ */ -+static struct bfq_queue * -+bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ void *io_struct, bool request) -+{ -+ struct bfq_queue *in_service_bfqq, *new_bfqq; -+ -+ /* -+ * Prevent bfqq from being merged if it has been created too -+ * long ago. The idea is that true cooperating processes, and -+ * thus their associated bfq_queues, are supposed to be -+ * created shortly after each other. This is the case, e.g., -+ * for KVM/QEMU and dump I/O threads. Basing on this -+ * assumption, the following filtering greatly reduces the -+ * probability that two non-cooperating processes, which just -+ * happen to do close I/O for some short time interval, have -+ * their queues merged by mistake. -+ */ -+ if (bfq_too_late_for_merging(bfqq)) { -+ bfq_log_bfqq(bfqd, bfqq, -+ "would have looked for coop, but too late"); -+ return NULL; -+ } -+ -+ if (bfqq->new_bfqq) -+ return bfqq->new_bfqq; -+ -+ if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) -+ return NULL; -+ -+ /* If there is only one backlogged queue, don't search. 
*/ -+ if (bfq_tot_busy_queues(bfqd) == 1) -+ return NULL; -+ -+ in_service_bfqq = bfqd->in_service_queue; -+ -+ if (in_service_bfqq && in_service_bfqq != bfqq && -+ likely(in_service_bfqq != &bfqd->oom_bfqq) && -+ bfq_rq_close_to_sector(io_struct, request, bfqd->in_serv_last_pos) && -+ bfqq->entity.parent == in_service_bfqq->entity.parent && -+ bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) { -+ new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); -+ if (new_bfqq) -+ return new_bfqq; -+ } -+ /* -+ * Check whether there is a cooperator among currently scheduled -+ * queues. The only thing we need is that the bio/request is not -+ * NULL, as we need it to establish whether a cooperator exists. -+ */ -+ new_bfqq = bfq_find_close_cooperator(bfqd, bfqq, -+ bfq_io_struct_pos(io_struct, request)); -+ -+ BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent); -+ -+ if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) && -+ bfq_may_be_close_cooperator(bfqq, new_bfqq)) -+ return bfq_setup_merge(bfqq, new_bfqq); -+ -+ return NULL; -+} -+ -+static void bfq_bfqq_save_state(struct bfq_queue *bfqq) -+{ -+ struct bfq_io_cq *bic = bfqq->bic; -+ -+ /* -+ * If !bfqq->bic, the queue is already shared or its requests -+ * have already been redirected to a shared queue; both idle window -+ * and weight raising state have already been saved. Do nothing. 
-+ */ -+ if (!bic) -+ return; -+ -+ bic->saved_ttime = bfqq->ttime; -+ bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq); -+ bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); -+ bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); -+ bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); -+ if (unlikely(bfq_bfqq_just_created(bfqq) && -+ !bfq_bfqq_in_large_burst(bfqq) && -+ bfqq->bfqd->low_latency)) { -+ /* -+ * bfqq being merged ritgh after being created: bfqq -+ * would have deserved interactive weight raising, but -+ * did not make it to be set in a weight-raised state, -+ * because of this early merge. Store directly the -+ * weight-raising state that would have been assigned -+ * to bfqq, so that to avoid that bfqq unjustly fails -+ * to enjoy weight raising if split soon. -+ */ -+ bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff; -+ bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd); -+ bic->saved_last_wr_start_finish = jiffies; -+ } else { -+ bic->saved_wr_coeff = bfqq->wr_coeff; -+ bic->saved_wr_start_at_switch_to_srt = -+ bfqq->wr_start_at_switch_to_srt; -+ bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; -+ bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; -+ } -+ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "bic %p wr_coeff %d start_finish %lu max_time %lu", -+ bic, bfqq->wr_coeff, bfqq->last_wr_start_finish, -+ bfqq->wr_cur_max_time); -+} -+ -+static void -+bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, -+ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) -+{ -+ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", -+ (unsigned long) new_bfqq->pid); -+ BUG_ON(bfqq->bic && bfqq->bic == new_bfqq->bic); -+ /* Save weight raising and idle window of the merged queues */ -+ bfq_bfqq_save_state(bfqq); -+ bfq_bfqq_save_state(new_bfqq); -+ -+ if (bfq_bfqq_IO_bound(bfqq)) -+ bfq_mark_bfqq_IO_bound(new_bfqq); -+ bfq_clear_bfqq_IO_bound(bfqq); -+ -+ /* -+ * If 
bfqq is weight-raised, then let new_bfqq inherit -+ * weight-raising. To reduce false positives, neglect the case -+ * where bfqq has just been created, but has not yet made it -+ * to be weight-raised (which may happen because EQM may merge -+ * bfqq even before bfq_add_request is executed for the first -+ * time for bfqq). Handling this case would however be very -+ * easy, thanks to the flag just_created. -+ */ -+ if (new_bfqq->wr_coeff == 1 && bfqq->wr_coeff > 1) { -+ new_bfqq->wr_coeff = bfqq->wr_coeff; -+ new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time; -+ new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish; -+ new_bfqq->wr_start_at_switch_to_srt = -+ bfqq->wr_start_at_switch_to_srt; -+ if (bfq_bfqq_busy(new_bfqq)) { -+ bfqd->wr_busy_queues++; -+ BUG_ON(bfqd->wr_busy_queues > -+ bfq_tot_busy_queues(bfqd)); -+ } -+ -+ new_bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqd, new_bfqq, -+ "wr start after merge with %d, rais_max_time %u", -+ bfqq->pid, -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } -+ -+ if (bfqq->wr_coeff > 1) { /* bfqq has given its wr to new_bfqq */ -+ bfqq->wr_coeff = 1; -+ bfqq->entity.prio_changed = 1; -+ if (bfq_bfqq_busy(bfqq)) { -+ bfqd->wr_busy_queues--; -+ BUG_ON(bfqd->wr_busy_queues < 0); -+ } -+ -+ } -+ -+ bfq_log_bfqq(bfqd, new_bfqq, "wr_busy %d", -+ bfqd->wr_busy_queues); -+ -+ /* -+ * Merge queues (that is, let bic redirect its requests to new_bfqq) -+ */ -+ bic_set_bfqq(bic, new_bfqq, 1); -+ bfq_mark_bfqq_coop(new_bfqq); -+ /* -+ * new_bfqq now belongs to at least two bics (it is a shared queue): -+ * set new_bfqq->bic to NULL. bfqq either: -+ * - does not belong to any bic any more, and hence bfqq->bic must -+ * be set to NULL, or -+ * - is a queue whose owning bics have already been redirected to a -+ * different queue, hence the queue is destined to not belong to -+ * any bic soon and bfqq->bic is already NULL (therefore the next -+ * assignment causes no harm). 
-+ */ -+ new_bfqq->bic = NULL; -+ bfqq->bic = NULL; -+ /* release process reference to bfqq */ -+ bfq_put_queue(bfqq); -+} -+ -+static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, -+ struct bio *bio) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ bool is_sync = op_is_sync(bio->bi_opf); -+ struct bfq_queue *bfqq = bfqd->bio_bfqq, *new_bfqq; -+ -+ assert_spin_locked(&bfqd->lock); -+ /* -+ * Disallow merge of a sync bio into an async request. -+ */ -+ if (is_sync && !rq_is_sync(rq)) -+ return false; -+ -+ /* -+ * Lookup the bfqq that this bio will be queued with. Allow -+ * merge only if rq is queued there. -+ */ -+ BUG_ON(!bfqd->bio_bfqq_set); -+ if (!bfqq) -+ return false; -+ -+ /* -+ * We take advantage of this function to perform an early merge -+ * of the queues of possible cooperating processes. -+ */ -+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); -+ BUG_ON(new_bfqq == bfqq); -+ if (new_bfqq) { -+ /* -+ * bic still points to bfqq, then it has not yet been -+ * redirected to some other bfq_queue, and a queue -+ * merge beween bfqq and new_bfqq can be safely -+ * fulfillled, i.e., bic can be redirected to new_bfqq -+ * and bfqq can be put. -+ */ -+ bfq_merge_bfqqs(bfqd, bfqd->bio_bic, bfqq, -+ new_bfqq); -+ /* -+ * If we get here, bio will be queued into new_queue, -+ * so use new_bfqq to decide whether bio and rq can be -+ * merged. -+ */ -+ bfqq = new_bfqq; -+ -+ /* -+ * Change also bqfd->bio_bfqq, as -+ * bfqd->bio_bic now points to new_bfqq, and -+ * this function may be invoked again (and then may -+ * use again bqfd->bio_bfqq). -+ */ -+ bfqd->bio_bfqq = bfqq; -+ } -+ return bfqq == RQ_BFQQ(rq); -+} -+ -+/* -+ * Set the maximum time for the in-service queue to consume its -+ * budget. This prevents seeky processes from lowering the throughput. -+ * In practice, a time-slice service scheme is used with seeky -+ * processes. 
-+ */ -+static void bfq_set_budget_timeout(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ unsigned int timeout_coeff; -+ -+ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time) -+ timeout_coeff = 1; -+ else -+ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; -+ -+ bfqd->last_budget_start = ktime_get(); -+ -+ bfqq->budget_timeout = jiffies + -+ bfqd->bfq_timeout * timeout_coeff; -+ -+ bfq_log_bfqq(bfqd, bfqq, "%u", -+ jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff)); -+} -+ -+static void __bfq_set_in_service_queue(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ if (bfqq) { -+ bfq_clear_bfqq_fifo_expire(bfqq); -+ -+ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; -+ -+ BUG_ON(bfqq == bfqd->in_service_queue); -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ -+ if (time_is_before_jiffies(bfqq->last_wr_start_finish) && -+ bfqq->wr_coeff > 1 && -+ bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && -+ time_is_before_jiffies(bfqq->budget_timeout)) { -+ /* -+ * For soft real-time queues, move the start -+ * of the weight-raising period forward by the -+ * time the queue has not received any -+ * service. Otherwise, a relatively long -+ * service delay is likely to cause the -+ * weight-raising period of the queue to end, -+ * because of the short duration of the -+ * weight-raising period of a soft real-time -+ * queue. It is worth noting that this move -+ * is not so dangerous for the other queues, -+ * because soft real-time queues are not -+ * greedy. -+ * -+ * To not add a further variable, we use the -+ * overloaded field budget_timeout to -+ * determine for how long the queue has not -+ * received service, i.e., how much time has -+ * elapsed since the queue expired. However, -+ * this is a little imprecise, because -+ * budget_timeout is set to jiffies if bfqq -+ * not only expires, but also remains with no -+ * request. 
-+ */ -+ if (time_after(bfqq->budget_timeout, -+ bfqq->last_wr_start_finish)) -+ bfqq->last_wr_start_finish += -+ jiffies - bfqq->budget_timeout; -+ else -+ bfqq->last_wr_start_finish = jiffies; -+ -+ if (time_is_after_jiffies(bfqq->last_wr_start_finish)) { -+ pr_crit( -+ "BFQ WARNING:last %lu budget %lu jiffies %lu", -+ bfqq->last_wr_start_finish, -+ bfqq->budget_timeout, -+ jiffies); -+ pr_crit("diff %lu", jiffies - -+ max_t(unsigned long, -+ bfqq->last_wr_start_finish, -+ bfqq->budget_timeout)); -+ bfqq->last_wr_start_finish = jiffies; -+ } -+ } -+ -+ bfq_set_budget_timeout(bfqd, bfqq); -+ bfq_log_bfqq(bfqd, bfqq, -+ "cur-budget = %d prio_class %d", -+ bfqq->entity.budget, bfqq->ioprio_class); -+ } else -+ bfq_log(bfqd, "NULL"); -+ -+ bfqd->in_service_queue = bfqq; -+} -+ -+/* -+ * Get and set a new queue for service. -+ */ -+static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd); -+ -+ __bfq_set_in_service_queue(bfqd, bfqq); -+ return bfqq; -+} -+ -+static void bfq_arm_slice_timer(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq = bfqd->in_service_queue; -+ u32 sl; -+ -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); -+ -+ bfq_mark_bfqq_wait_request(bfqq); -+ -+ /* -+ * We don't want to idle for seeks, but we do want to allow -+ * fair distribution of slice time for a process doing back-to-back -+ * seeks. So allow a little bit of time for him to submit a new rq. -+ * -+ * To prevent processes with (partly) seeky workloads from -+ * being too ill-treated, grant them a small fraction of the -+ * assigned budget before reducing the waiting time to -+ * BFQ_MIN_TT. This happened to help reduce latency. -+ */ -+ sl = bfqd->bfq_slice_idle; -+ /* -+ * Unless the queue is being weight-raised or the scenario is -+ * asymmetric, grant only minimum idle time if the queue -+ * is seeky. 
A long idling is preserved for a weight-raised -+ * queue, or, more in general, in an asymemtric scenario, -+ * because a long idling is needed for guaranteeing to a queue -+ * its reserved share of the throughput (in particular, it is -+ * needed if the queue has a higher weight than some other -+ * queue). -+ */ -+ if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 && -+ bfq_symmetric_scenario(bfqd)) -+ sl = min_t(u32, sl, BFQ_MIN_TT); -+ -+ bfqd->last_idling_start = ktime_get(); -+ hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl), -+ HRTIMER_MODE_REL); -+ bfqg_stats_set_start_idle_time(bfqq_group(bfqq)); -+ bfq_log(bfqd, "arm idle: %ld/%ld ms", -+ sl / NSEC_PER_MSEC, bfqd->bfq_slice_idle / NSEC_PER_MSEC); -+} -+ -+/* -+ * In autotuning mode, max_budget is dynamically recomputed as the -+ * amount of sectors transferred in timeout at the estimated peak -+ * rate. This enables BFQ to utilize a full timeslice with a full -+ * budget, even if the in-service queue is served at peak rate. And -+ * this maximises throughput with sequential workloads. -+ */ -+static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd) -+{ -+ return (u64)bfqd->peak_rate * USEC_PER_MSEC * -+ jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT; -+} -+ -+/* -+ * Update parameters related to throughput and responsiveness, as a -+ * function of the estimated peak rate. See comments on -+ * bfq_calc_max_budget(), and on the ref_wr_duration array. 
-+ */ -+static void update_thr_responsiveness_params(struct bfq_data *bfqd) -+{ -+ if (bfqd->bfq_user_max_budget == 0) { -+ bfqd->bfq_max_budget = -+ bfq_calc_max_budget(bfqd); -+ BUG_ON(bfqd->bfq_max_budget < 0); -+ bfq_log(bfqd, "new max_budget = %d", -+ bfqd->bfq_max_budget); -+ } -+} -+ -+static void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq) -+{ -+ if (rq != NULL) { /* new rq dispatch now, reset accordingly */ -+ bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns() ; -+ bfqd->peak_rate_samples = 1; -+ bfqd->sequential_samples = 0; -+ bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size = -+ blk_rq_sectors(rq); -+ } else /* no new rq dispatched, just reset the number of samples */ -+ bfqd->peak_rate_samples = 0; /* full re-init on next disp. */ -+ -+ bfq_log(bfqd, -+ "at end, sample %u/%u tot_sects %llu", -+ bfqd->peak_rate_samples, bfqd->sequential_samples, -+ bfqd->tot_sectors_dispatched); -+} -+ -+static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) -+{ -+ u32 rate, weight, divisor; -+ -+ /* -+ * For the convergence property to hold (see comments on -+ * bfq_update_peak_rate()) and for the assessment to be -+ * reliable, a minimum number of samples must be present, and -+ * a minimum amount of time must have elapsed. If not so, do -+ * not compute new rate. Just reset parameters, to get ready -+ * for a new evaluation attempt. -+ */ -+ if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES || -+ bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL) { -+ bfq_log(bfqd, -+ "only resetting, delta_first %lluus samples %d", -+ bfqd->delta_from_first>>10, bfqd->peak_rate_samples); -+ goto reset_computation; -+ } -+ -+ /* -+ * If a new request completion has occurred after last -+ * dispatch, then, to approximate the rate at which requests -+ * have been served by the device, it is more precise to -+ * extend the observation interval to the last completion. 
-+ */ -+ bfqd->delta_from_first = -+ max_t(u64, bfqd->delta_from_first, -+ bfqd->last_completion - bfqd->first_dispatch); -+ -+ BUG_ON(bfqd->delta_from_first == 0); -+ /* -+ * Rate computed in sects/usec, and not sects/nsec, for -+ * precision issues. -+ */ -+ rate = div64_ul(bfqd->tot_sectors_dispatched<<BFQ_RATE_SHIFT, -+ div_u64(bfqd->delta_from_first, NSEC_PER_USEC)); -+ -+ bfq_log(bfqd, -+"tot_sects %llu delta_first %lluus rate %llu sects/s (%d)", -+ bfqd->tot_sectors_dispatched, bfqd->delta_from_first>>10, -+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), -+ rate > 20<<BFQ_RATE_SHIFT); -+ -+ /* -+ * Peak rate not updated if: -+ * - the percentage of sequential dispatches is below 3/4 of the -+ * total, and rate is below the current estimated peak rate -+ * - rate is unreasonably high (> 20M sectors/sec) -+ */ -+ if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 && -+ rate <= bfqd->peak_rate) || -+ rate > 20<<BFQ_RATE_SHIFT) { -+ bfq_log(bfqd, -+ "goto reset, samples %u/%u rate/peak %llu/%llu", -+ bfqd->peak_rate_samples, bfqd->sequential_samples, -+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), -+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); -+ goto reset_computation; -+ } else { -+ bfq_log(bfqd, -+ "do update, samples %u/%u rate/peak %llu/%llu", -+ bfqd->peak_rate_samples, bfqd->sequential_samples, -+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), -+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); -+ } -+ -+ /* -+ * We have to update the peak rate, at last! To this purpose, -+ * we use a low-pass filter. We compute the smoothing constant -+ * of the filter as a function of the 'weight' of the new -+ * measured rate. -+ * -+ * As can be seen in next formulas, we define this weight as a -+ * quantity proportional to how sequential the workload is, -+ * and to how long the observation time interval is. -+ * -+ * The weight runs from 0 to 8. 
The maximum value of the -+ * weight, 8, yields the minimum value for the smoothing -+ * constant. At this minimum value for the smoothing constant, -+ * the measured rate contributes for half of the next value of -+ * the estimated peak rate. -+ * -+ * So, the first step is to compute the weight as a function -+ * of how sequential the workload is. Note that the weight -+ * cannot reach 9, because bfqd->sequential_samples cannot -+ * become equal to bfqd->peak_rate_samples, which, in its -+ * turn, holds true because bfqd->sequential_samples is not -+ * incremented for the first sample. -+ */ -+ weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples; -+ -+ /* -+ * Second step: further refine the weight as a function of the -+ * duration of the observation interval. -+ */ -+ weight = min_t(u32, 8, -+ div_u64(weight * bfqd->delta_from_first, -+ BFQ_RATE_REF_INTERVAL)); -+ -+ /* -+ * Divisor ranging from 10, for minimum weight, to 2, for -+ * maximum weight. -+ */ -+ divisor = 10 - weight; -+ BUG_ON(divisor == 0); -+ -+ /* -+ * Finally, update peak rate: -+ * -+ * peak_rate = peak_rate * (divisor-1) / divisor + rate / divisor -+ */ -+ bfqd->peak_rate *= divisor-1; -+ bfqd->peak_rate /= divisor; -+ rate /= divisor; /* smoothing constant alpha = 1/divisor */ -+ -+ bfq_log(bfqd, -+ "divisor %d tmp_peak_rate %llu tmp_rate %u", -+ divisor, -+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT), -+ (u32)((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT)); -+ -+ BUG_ON(bfqd->peak_rate == 0); -+ BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT); -+ -+ bfqd->peak_rate += rate; -+ -+ /* -+ * For a very slow device, bfqd->peak_rate can reach 0 (see -+ * the minimum representable values reported in the comments -+ * on BFQ_RATE_SHIFT). Push to 1 if this happens, to avoid -+ * divisions by zero where bfqd->peak_rate is used as a -+ * divisor. 
-+ */ -+ bfqd->peak_rate = max_t(u32, 1, bfqd->peak_rate); -+ -+ update_thr_responsiveness_params(bfqd); -+ BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT); -+ -+reset_computation: -+ bfq_reset_rate_computation(bfqd, rq); -+} -+ -+/* -+ * Update the read/write peak rate (the main quantity used for -+ * auto-tuning, see update_thr_responsiveness_params()). -+ * -+ * It is not trivial to estimate the peak rate (correctly): because of -+ * the presence of sw and hw queues between the scheduler and the -+ * device components that finally serve I/O requests, it is hard to -+ * say exactly when a given dispatched request is served inside the -+ * device, and for how long. As a consequence, it is hard to know -+ * precisely at what rate a given set of requests is actually served -+ * by the device. -+ * -+ * On the opposite end, the dispatch time of any request is trivially -+ * available, and, from this piece of information, the "dispatch rate" -+ * of requests can be immediately computed. So, the idea in the next -+ * function is to use what is known, namely request dispatch times -+ * (plus, when useful, request completion times), to estimate what is -+ * unknown, namely in-device request service rate. -+ * -+ * The main issue is that, because of the above facts, the rate at -+ * which a certain set of requests is dispatched over a certain time -+ * interval can vary greatly with respect to the rate at which the -+ * same requests are then served. But, since the size of any -+ * intermediate queue is limited, and the service scheme is lossless -+ * (no request is silently dropped), the following obvious convergence -+ * property holds: the number of requests dispatched MUST become -+ * closer and closer to the number of requests completed as the -+ * observation interval grows. This is the key property used in -+ * the next function to estimate the peak service rate as a function -+ * of the observed dispatch rate. 
The function assumes to be invoked -+ * on every request dispatch. -+ */ -+static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) -+{ -+ u64 now_ns = ktime_get_ns(); -+ -+ if (bfqd->peak_rate_samples == 0) { /* first dispatch */ -+ bfq_log(bfqd, -+ "goto reset, samples %d", -+ bfqd->peak_rate_samples) ; -+ bfq_reset_rate_computation(bfqd, rq); -+ goto update_last_values; /* will add one sample */ -+ } -+ -+ /* -+ * Device idle for very long: the observation interval lasting -+ * up to this dispatch cannot be a valid observation interval -+ * for computing a new peak rate (similarly to the late- -+ * completion event in bfq_completed_request()). Go to -+ * update_rate_and_reset to have the following three steps -+ * taken: -+ * - close the observation interval at the last (previous) -+ * request dispatch or completion -+ * - compute rate, if possible, for that observation interval -+ * - start a new observation interval with this dispatch -+ */ -+ if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC && -+ bfqd->rq_in_driver == 0) { -+ bfq_log(bfqd, -+"jumping to updating&resetting delta_last %lluus samples %d", -+ (now_ns - bfqd->last_dispatch)>>10, -+ bfqd->peak_rate_samples) ; -+ goto update_rate_and_reset; -+ } -+ -+ /* Update sampling information */ -+ bfqd->peak_rate_samples++; -+ -+ if ((bfqd->rq_in_driver > 0 || -+ now_ns - bfqd->last_completion < BFQ_MIN_TT) -+ && !BFQ_RQ_SEEKY(bfqd, bfqd->last_position, rq)) -+ bfqd->sequential_samples++; -+ -+ bfqd->tot_sectors_dispatched += blk_rq_sectors(rq); -+ -+ /* Reset max observed rq size every 32 dispatches */ -+ if (likely(bfqd->peak_rate_samples % 32)) -+ bfqd->last_rq_max_size = -+ max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size); -+ else -+ bfqd->last_rq_max_size = blk_rq_sectors(rq); -+ -+ bfqd->delta_from_first = now_ns - bfqd->first_dispatch; -+ -+ bfq_log(bfqd, -+ "added samples %u/%u tot_sects %llu delta_first %lluus", -+ bfqd->peak_rate_samples, bfqd->sequential_samples, -+ 
bfqd->tot_sectors_dispatched, -+ bfqd->delta_from_first>>10); -+ -+ /* Target observation interval not yet reached, go on sampling */ -+ if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL) -+ goto update_last_values; -+ -+update_rate_and_reset: -+ bfq_update_rate_reset(bfqd, rq); -+update_last_values: -+ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); -+ if (RQ_BFQQ(rq) == bfqd->in_service_queue) -+ bfqd->in_serv_last_pos = bfqd->last_position; -+ bfqd->last_dispatch = now_ns; -+ -+ bfq_log(bfqd, -+ "delta_first %lluus last_pos %llu peak_rate %llu", -+ (now_ns - bfqd->first_dispatch)>>10, -+ (unsigned long long) bfqd->last_position, -+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); -+ bfq_log(bfqd, -+ "samples at end %d", bfqd->peak_rate_samples); -+} -+ -+/* -+ * Remove request from internal lists. -+ */ -+static void bfq_dispatch_remove(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ -+ /* -+ * For consistency, the next instruction should have been -+ * executed after removing the request from the queue and -+ * dispatching it. We execute instead this instruction before -+ * bfq_remove_request() (and hence introduce a temporary -+ * inconsistency), for efficiency. In fact, should this -+ * dispatch occur for a non in-service bfqq, this anticipated -+ * increment prevents two counters related to bfqq->dispatched -+ * from risking to be, first, uselessly decremented, and then -+ * incremented again when the (new) value of bfqq->dispatched -+ * happens to be taken into account. -+ */ -+ bfqq->dispatched++; -+ bfq_update_peak_rate(q->elevator->elevator_data, rq); -+ -+ bfq_remove_request(q, rq); -+} -+ -+static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ BUG_ON(bfqq != bfqd->in_service_queue); -+ -+ /* -+ * If this bfqq is shared between multiple processes, check -+ * to make sure that those processes are still issuing I/Os -+ * within the mean seek distance. 
If not, it may be time to -+ * break the queues apart again. -+ */ -+ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq)) -+ bfq_mark_bfqq_split_coop(bfqq); -+ -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { -+ if (bfqq->dispatched == 0) -+ /* -+ * Overloading budget_timeout field to store -+ * the time at which the queue remains with no -+ * backlog and no outstanding request; used by -+ * the weight-raising mechanism. -+ */ -+ bfqq->budget_timeout = jiffies; -+ -+ bfq_del_bfqq_busy(bfqd, bfqq, true); -+ } else { -+ bfq_requeue_bfqq(bfqd, bfqq, true); -+ /* -+ * Resort priority tree of potential close cooperators. -+ */ -+ bfq_pos_tree_add_move(bfqd, bfqq); -+ } -+ -+ /* -+ * All in-service entities must have been properly deactivated -+ * or requeued before executing the next function, which -+ * resets all in-service entites as no more in service. -+ */ -+ __bfq_bfqd_reset_in_service(bfqd); -+} -+ -+/** -+ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior. -+ * @bfqd: device data. -+ * @bfqq: queue to update. -+ * @reason: reason for expiration. -+ * -+ * Handle the feedback on @bfqq budget at queue expiration. -+ * See the body for detailed comments. -+ */ -+static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ enum bfqq_expiration reason) -+{ -+ struct request *next_rq; -+ int budget, min_budget; -+ -+ BUG_ON(bfqq != bfqd->in_service_queue); -+ -+ min_budget = bfq_min_budget(bfqd); -+ -+ if (bfqq->wr_coeff == 1) -+ budget = bfqq->max_budget; -+ else /* -+ * Use a constant, low budget for weight-raised queues, -+ * to help achieve a low latency. Keep it slightly higher -+ * than the minimum possible budget, to cause a little -+ * bit fewer expirations. 
-+ */ -+ budget = 2 * min_budget; -+ -+ bfq_log_bfqq(bfqd, bfqq, "last budg %d, budg left %d", -+ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); -+ bfq_log_bfqq(bfqd, bfqq, "last max_budg %d, min budg %d", -+ budget, bfq_min_budget(bfqd)); -+ bfq_log_bfqq(bfqd, bfqq, "sync %d, seeky %d", -+ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); -+ -+ if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) { -+ switch (reason) { -+ /* -+ * Caveat: in all the following cases we trade latency -+ * for throughput. -+ */ -+ case BFQ_BFQQ_TOO_IDLE: -+ /* -+ * This is the only case where we may reduce -+ * the budget: if there is no request of the -+ * process still waiting for completion, then -+ * we assume (tentatively) that the timer has -+ * expired because the batch of requests of -+ * the process could have been served with a -+ * smaller budget. Hence, betting that -+ * process will behave in the same way when it -+ * becomes backlogged again, we reduce its -+ * next budget. As long as we guess right, -+ * this budget cut reduces the latency -+ * experienced by the process. -+ * -+ * However, if there are still outstanding -+ * requests, then the process may have not yet -+ * issued its next request just because it is -+ * still waiting for the completion of some of -+ * the still outstanding ones. So in this -+ * subcase we do not reduce its budget, on the -+ * contrary we increase it to possibly boost -+ * the throughput, as discussed in the -+ * comments to the BUDGET_TIMEOUT case. -+ */ -+ if (bfqq->dispatched > 0) /* still outstanding reqs */ -+ budget = min(budget * 2, bfqd->bfq_max_budget); -+ else { -+ if (budget > 5 * min_budget) -+ budget -= 4 * min_budget; -+ else -+ budget = min_budget; -+ } -+ break; -+ case BFQ_BFQQ_BUDGET_TIMEOUT: -+ /* -+ * We double the budget here because it gives -+ * the chance to boost the throughput if this -+ * is not a seeky process (and has bumped into -+ * this timeout because of, e.g., ZBR). 
-+ */ -+ budget = min(budget * 2, bfqd->bfq_max_budget); -+ break; -+ case BFQ_BFQQ_BUDGET_EXHAUSTED: -+ /* -+ * The process still has backlog, and did not -+ * let either the budget timeout or the disk -+ * idling timeout expire. Hence it is not -+ * seeky, has a short thinktime and may be -+ * happy with a higher budget too. So -+ * definitely increase the budget of this good -+ * candidate to boost the disk throughput. -+ */ -+ budget = min(budget * 4, bfqd->bfq_max_budget); -+ break; -+ case BFQ_BFQQ_NO_MORE_REQUESTS: -+ /* -+ * For queues that expire for this reason, it -+ * is particularly important to keep the -+ * budget close to the actual service they -+ * need. Doing so reduces the timestamp -+ * misalignment problem described in the -+ * comments in the body of -+ * __bfq_activate_entity. In fact, suppose -+ * that a queue systematically expires for -+ * BFQ_BFQQ_NO_MORE_REQUESTS and presents a -+ * new request in time to enjoy timestamp -+ * back-shifting. The larger the budget of the -+ * queue is with respect to the service the -+ * queue actually requests in each service -+ * slot, the more times the queue can be -+ * reactivated with the same virtual finish -+ * time. It follows that, even if this finish -+ * time is pushed to the system virtual time -+ * to reduce the consequent timestamp -+ * misalignment, the queue unjustly enjoys for -+ * many re-activations a lower finish time -+ * than all newly activated queues. -+ * -+ * The service needed by bfqq is measured -+ * quite precisely by bfqq->entity.service. -+ * Since bfqq does not enjoy device idling, -+ * bfqq->entity.service is equal to the number -+ * of sectors that the process associated with -+ * bfqq requested to read/write before waiting -+ * for request completions, or blocking for -+ * other reasons. 
-+ */ -+ budget = max_t(int, bfqq->entity.service, min_budget); -+ break; -+ default: -+ return; -+ } -+ } else if (!bfq_bfqq_sync(bfqq)) -+ /* -+ * Async queues get always the maximum possible -+ * budget, as for them we do not care about latency -+ * (in addition, their ability to dispatch is limited -+ * by the charging factor). -+ */ -+ budget = bfqd->bfq_max_budget; -+ -+ bfqq->max_budget = budget; -+ -+ if (bfqd->budgets_assigned >= bfq_stats_min_budgets && -+ !bfqd->bfq_user_max_budget) -+ bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget); -+ -+ /* -+ * If there is still backlog, then assign a new budget, making -+ * sure that it is large enough for the next request. Since -+ * the finish time of bfqq must be kept in sync with the -+ * budget, be sure to call __bfq_bfqq_expire() *after* this -+ * update. -+ * -+ * If there is no backlog, then no need to update the budget; -+ * it will be updated on the arrival of a new request. -+ */ -+ next_rq = bfqq->next_rq; -+ if (next_rq) { -+ BUG_ON(reason == BFQ_BFQQ_TOO_IDLE || -+ reason == BFQ_BFQQ_NO_MORE_REQUESTS); -+ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, -+ bfq_serv_to_charge(next_rq, bfqq)); -+ BUG_ON(!bfq_bfqq_busy(bfqq)); -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ } -+ -+ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d", -+ next_rq ? blk_rq_sectors(next_rq) : 0, -+ bfqq->entity.budget); -+} -+ -+/* -+ * Return true if the process associated with bfqq is "slow". The slow -+ * flag is used, in addition to the budget timeout, to reduce the -+ * amount of service provided to seeky processes, and thus reduce -+ * their chances to lower the throughput. More details in the comments -+ * on the function bfq_bfqq_expire(). 
-+ * -+ * An important observation is in order: as discussed in the comments -+ * on the function bfq_update_peak_rate(), with devices with internal -+ * queues, it is hard if ever possible to know when and for how long -+ * an I/O request is processed by the device (apart from the trivial -+ * I/O pattern where a new request is dispatched only after the -+ * previous one has been completed). This makes it hard to evaluate -+ * the real rate at which the I/O requests of each bfq_queue are -+ * served. In fact, for an I/O scheduler like BFQ, serving a -+ * bfq_queue means just dispatching its requests during its service -+ * slot (i.e., until the budget of the queue is exhausted, or the -+ * queue remains idle, or, finally, a timeout fires). But, during the -+ * service slot of a bfq_queue, around 100 ms at most, the device may -+ * be even still processing requests of bfq_queues served in previous -+ * service slots. On the opposite end, the requests of the in-service -+ * bfq_queue may be completed after the service slot of the queue -+ * finishes. -+ * -+ * Anyway, unless more sophisticated solutions are used -+ * (where possible), the sum of the sizes of the requests dispatched -+ * during the service slot of a bfq_queue is probably the only -+ * approximation available for the service received by the bfq_queue -+ * during its service slot. And this sum is the quantity used in this -+ * function to evaluate the I/O speed of a process. 
-+ */ -+static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ bool compensate, enum bfqq_expiration reason, -+ unsigned long *delta_ms) -+{ -+ ktime_t delta_ktime; -+ u32 delta_usecs; -+ bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */ -+ -+ if (!bfq_bfqq_sync(bfqq)) -+ return false; -+ -+ if (compensate) -+ delta_ktime = bfqd->last_idling_start; -+ else -+ delta_ktime = ktime_get(); -+ delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start); -+ delta_usecs = ktime_to_us(delta_ktime); -+ -+ /* don't use too short time intervals */ -+ if (delta_usecs < 1000) { -+ if (blk_queue_nonrot(bfqd->queue)) -+ /* -+ * give same worst-case guarantees as idling -+ * for seeky -+ */ -+ *delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC; -+ else /* charge at least one seek */ -+ *delta_ms = bfq_slice_idle / NSEC_PER_MSEC; -+ -+ bfq_log(bfqd, "too short %u", delta_usecs); -+ -+ return slow; -+ } -+ -+ *delta_ms = delta_usecs / USEC_PER_MSEC; -+ -+ /* -+ * Use only long (> 20ms) intervals to filter out excessive -+ * spikes in service rate estimation. -+ */ -+ if (delta_usecs > 20000) { -+ /* -+ * Caveat for rotational devices: processes doing I/O -+ * in the slower disk zones tend to be slow(er) even -+ * if not seeky. In this respect, the estimated peak -+ * rate is likely to be an average over the disk -+ * surface. Accordingly, to not be too harsh with -+ * unlucky processes, a process is deemed slow only if -+ * its rate has been lower than half of the estimated -+ * peak rate. -+ */ -+ slow = bfqq->entity.service < bfqd->bfq_max_budget / 2; -+ bfq_log(bfqd, "relative rate %d/%d", -+ bfqq->entity.service, bfqd->bfq_max_budget); -+ } -+ -+ bfq_log_bfqq(bfqd, bfqq, "slow %d", slow); -+ -+ return slow; -+} -+ -+/* -+ * To be deemed as soft real-time, an application must meet two -+ * requirements. 
First, the application must not require an average -+ * bandwidth higher than the approximate bandwidth required to playback or -+ * record a compressed high-definition video. -+ * The next function is invoked on the completion of the last request of a -+ * batch, to compute the next-start time instant, soft_rt_next_start, such -+ * that, if the next request of the application does not arrive before -+ * soft_rt_next_start, then the above requirement on the bandwidth is met. -+ * -+ * The second requirement is that the request pattern of the application is -+ * isochronous, i.e., that, after issuing a request or a batch of requests, -+ * the application stops issuing new requests until all its pending requests -+ * have been completed. After that, the application may issue a new batch, -+ * and so on. -+ * For this reason the next function is invoked to compute -+ * soft_rt_next_start only for applications that meet this requirement, -+ * whereas soft_rt_next_start is set to infinity for applications that do -+ * not. -+ * -+ * Unfortunately, even a greedy (i.e., I/O-bound) application may -+ * happen to meet, occasionally or systematically, both the above -+ * bandwidth and isochrony requirements. This may happen at least in -+ * the following circumstances. First, if the CPU load is high. The -+ * application may stop issuing requests while the CPUs are busy -+ * serving other processes, then restart, then stop again for a while, -+ * and so on. The other circumstances are related to the storage -+ * device: the storage device is highly loaded or reaches a low-enough -+ * throughput with the I/O of the application (e.g., because the I/O -+ * is random and/or the device is slow). In all these cases, the -+ * I/O of the application may be simply slowed down enough to meet -+ * the bandwidth and isochrony requirements. 
To reduce the probability -+ * that greedy applications are deemed as soft real-time in these -+ * corner cases, a further rule is used in the computation of -+ * soft_rt_next_start: the return value of this function is forced to -+ * be higher than the maximum between the following two quantities. -+ * -+ * (a) Current time plus: (1) the maximum time for which the arrival -+ * of a request is waited for when a sync queue becomes idle, -+ * namely bfqd->bfq_slice_idle, and (2) a few extra jiffies. We -+ * postpone for a moment the reason for adding a few extra -+ * jiffies; we get back to it after next item (b). Lower-bounding -+ * the return value of this function with the current time plus -+ * bfqd->bfq_slice_idle tends to filter out greedy applications, -+ * because the latter issue their next request as soon as possible -+ * after the last one has been completed. In contrast, a soft -+ * real-time application spends some time processing data, after a -+ * batch of its requests has been completed. -+ * -+ * (b) Current value of bfqq->soft_rt_next_start. As pointed out -+ * above, greedy applications may happen to meet both the -+ * bandwidth and isochrony requirements under heavy CPU or -+ * storage-device load. In more detail, in these scenarios, these -+ * applications happen, only for limited time periods, to do I/O -+ * slowly enough to meet all the requirements described so far, -+ * including the filtering in above item (a). These slow-speed -+ * time intervals are usually interspersed between other time -+ * intervals during which these applications do I/O at a very high -+ * speed. Fortunately, exactly because of the high speed of the -+ * I/O in the high-speed intervals, the values returned by this -+ * function happen to be so high, near the end of any such -+ * high-speed interval, to be likely to fall *after* the end of -+ * the low-speed time interval that follows. 
These high values are -+ * stored in bfqq->soft_rt_next_start after each invocation of -+ * this function. As a consequence, if the last value of -+ * bfqq->soft_rt_next_start is constantly used to lower-bound the -+ * next value that this function may return, then, from the very -+ * beginning of a low-speed interval, bfqq->soft_rt_next_start is -+ * likely to be constantly kept so high that any I/O request -+ * issued during the low-speed interval is considered as arriving -+ * too soon for the application to be deemed as soft -+ * real-time. Then, in the high-speed interval that follows, the -+ * application will not be deemed as soft real-time, just because -+ * it will do I/O at a high speed. And so on. -+ * -+ * Getting back to the filtering in item (a), in the following two -+ * cases this filtering might be easily passed by a greedy -+ * application, if the reference quantity was just -+ * bfqd->bfq_slice_idle: -+ * 1) HZ is so low that the duration of a jiffy is comparable to or -+ * higher than bfqd->bfq_slice_idle. This happens, e.g., on slow -+ * devices with HZ=100. The time granularity may be so coarse -+ * that the approximation, in jiffies, of bfqd->bfq_slice_idle -+ * is rather lower than the exact value. -+ * 2) jiffies, instead of increasing at a constant rate, may stop increasing -+ * for a while, then suddenly 'jump' by several units to recover the lost -+ * increments. This seems to happen, e.g., inside virtual machines. -+ * To address this issue, in the filtering in (a) we do not use as a -+ * reference time interval just bfqd->bfq_slice_idle, but -+ * bfqd->bfq_slice_idle plus a few jiffies. In particular, we add the -+ * minimum number of jiffies for which the filter seems to be quite -+ * precise also in embedded systems and KVM/QEMU virtual machines.
-+ */ -+static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ bfq_log_bfqq(bfqd, bfqq, -+"service_blkg %lu soft_rate %u sects/sec interval %u", -+ bfqq->service_from_backlogged, -+ bfqd->bfq_wr_max_softrt_rate, -+ jiffies_to_msecs(HZ * bfqq->service_from_backlogged / -+ bfqd->bfq_wr_max_softrt_rate)); -+ -+ return max3(bfqq->soft_rt_next_start, -+ bfqq->last_idle_bklogged + -+ HZ * bfqq->service_from_backlogged / -+ bfqd->bfq_wr_max_softrt_rate, -+ jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); -+} -+ -+static bool bfq_bfqq_injectable(struct bfq_queue *bfqq) -+{ -+ return BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 && -+ blk_queue_nonrot(bfqq->bfqd->queue) && -+ bfqq->bfqd->hw_tag; -+} -+ -+/** -+ * bfq_bfqq_expire - expire a queue. -+ * @bfqd: device owning the queue. -+ * @bfqq: the queue to expire. -+ * @compensate: if true, compensate for the time spent idling. -+ * @reason: the reason causing the expiration. -+ * -+ * If the process associated with bfqq does slow I/O (e.g., because it -+ * issues random requests), we charge bfqq with the time it has been -+ * in service instead of the service it has received (see -+ * bfq_bfqq_charge_time for details on how this goal is achieved). As -+ * a consequence, bfqq will typically get higher timestamps upon -+ * reactivation, and hence it will be rescheduled as if it had -+ * received more service than what it has actually received. In the -+ * end, bfqq receives less service in proportion to how slowly its -+ * associated process consumes its budgets (and hence how seriously it -+ * tends to lower the throughput). In addition, this time-charging -+ * strategy guarantees time fairness among slow processes. In -+ * contrast, if the process associated with bfqq is not slow, we -+ * charge bfqq exactly with the service it has received. 
-+ * -+ * Charging time to the first type of queues and the exact service to -+ * the other has the effect of using the WF2Q+ policy to schedule the -+ * former on a timeslice basis, without violating service domain -+ * guarantees among the latter. -+ */ -+static void bfq_bfqq_expire(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ bool compensate, -+ enum bfqq_expiration reason) -+{ -+ bool slow; -+ unsigned long delta = 0; -+ struct bfq_entity *entity = &bfqq->entity; -+ int ref; -+ -+ BUG_ON(bfqq != bfqd->in_service_queue); -+ -+ /* -+ * Check whether the process is slow (see bfq_bfqq_is_slow). -+ */ -+ slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); -+ -+ /* -+ * As above explained, charge slow (typically seeky) and -+ * timed-out queues with the time and not the service -+ * received, to favor sequential workloads. -+ * -+ * Processes doing I/O in the slower disk zones will tend to -+ * be slow(er) even if not seeky. Therefore, since the -+ * estimated peak rate is actually an average over the disk -+ * surface, these processes may timeout just for bad luck. To -+ * avoid punishing them, do not charge time to processes that -+ * succeeded in consuming at least 2/3 of their budget. This -+ * allows BFQ to preserve enough elasticity to still perform -+ * bandwidth, and not time, distribution with little unlucky -+ * or quasi-sequential processes. 
-+ */ -+ if (bfqq->wr_coeff == 1 && -+ (slow || -+ (reason == BFQ_BFQQ_BUDGET_TIMEOUT && -+ bfq_bfqq_budget_left(bfqq) >= entity->budget / 3))) -+ bfq_bfqq_charge_time(bfqd, bfqq, delta); -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ -+ if (reason == BFQ_BFQQ_TOO_IDLE && -+ entity->service <= 2 * entity->budget / 10) -+ bfq_clear_bfqq_IO_bound(bfqq); -+ -+ if (bfqd->low_latency && bfqq->wr_coeff == 1) -+ bfqq->last_wr_start_finish = jiffies; -+ -+ if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 && -+ RB_EMPTY_ROOT(&bfqq->sort_list)) { -+ /* -+ * If we get here, and there are no outstanding -+ * requests, then the request pattern is isochronous -+ * (see the comments on the function -+ * bfq_bfqq_softrt_next_start()). Thus we can compute -+ * soft_rt_next_start. And we do it, unless bfqq is in -+ * interactive weight raising. We do not do it in the -+ * latter subcase, for the following reason. bfqq may -+ * be conveying the I/O needed to load a soft -+ * real-time application. Such an application will -+ * actually exhibit a soft real-time I/O pattern after -+ * it finally starts doing its job. But, if -+ * soft_rt_next_start is computed here for an -+ * interactive bfqq, and bfqq had received a lot of -+ * service before remaining with no outstanding -+ * request (likely to happen on a fast device), then -+ * soft_rt_next_start would be assigned such a high -+ * value that, for a very long time, bfqq would be -+ * prevented from being possibly considered as soft -+ * real time. -+ * -+ * If, instead, the queue still has outstanding -+ * requests, then we have to wait for the completion -+ * of all the outstanding requests to discover whether -+ * the request pattern is actually isochronous. 
-+ */ -+ BUG_ON(bfq_tot_busy_queues(bfqd) < 1); -+ if (bfqq->dispatched == 0 && -+ bfqq->wr_coeff != bfqd->bfq_wr_coeff) { -+ bfqq->soft_rt_next_start = -+ bfq_bfqq_softrt_next_start(bfqd, bfqq); -+ bfq_log_bfqq(bfqd, bfqq, "new soft_rt_next %lu", -+ bfqq->soft_rt_next_start); -+ } else if (bfqq->dispatched > 0) { -+ /* -+ * Schedule an update of soft_rt_next_start to when -+ * the task may be discovered to be isochronous. -+ */ -+ bfq_mark_bfqq_softrt_update(bfqq); -+ } -+ } -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "expire (%s, slow %d, num_disp %d, short %d, weight %d, serv %d/%d)", -+ reason_name[reason], slow, bfqq->dispatched, -+ bfq_bfqq_has_short_ttime(bfqq), entity->weight, -+ entity->service, entity->budget); -+ -+ /* -+ * Increase, decrease or leave budget unchanged according to -+ * reason. -+ */ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); -+ BUG_ON(bfqq->next_rq == NULL && -+ bfqq->entity.budget < bfqq->entity.service); -+ ref = bfqq->ref; -+ __bfq_bfqq_expire(bfqd, bfqq); -+ -+ if (ref == 1) /* bfqq is gone, no more actions on it */ -+ return; -+ -+ BUG_ON(ref > 1 && -+ !bfq_bfqq_busy(bfqq) && reason == BFQ_BFQQ_BUDGET_EXHAUSTED && -+ !bfq_class_idle(bfqq)); -+ -+ bfqq->injected_service = 0; -+ -+ /* mark bfqq as waiting a request only if a bic still points to it */ -+ if (!bfq_bfqq_busy(bfqq) && -+ reason != BFQ_BFQQ_BUDGET_TIMEOUT && -+ reason != BFQ_BFQQ_BUDGET_EXHAUSTED) { -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); -+ BUG_ON(bfqq->next_rq); -+ bfq_mark_bfqq_non_blocking_wait_rq(bfqq); -+ /* -+ * Not setting service to 0, because, if the next rq -+ * arrives in time, the queue will go on receiving -+ * service with this same budget (as if it never expired) -+ */ -+ } else { -+ entity->service = 0; -+ bfq_log_bfqq(bfqd, bfqq, "resetting service"); -+ } -+ -+ /* -+ * Reset the received-service counter for every parent entity. 
-+ * Differently from what happens with bfqq->entity.service, -+ * the resetting of this counter never needs to be postponed -+ * for parent entities. In fact, in case bfqq may have a -+ * chance to go on being served using the last, partially -+ * consumed budget, bfqq->entity.service needs to be kept, -+ * because if bfqq then actually goes on being served using -+ * the same budget, the last value of bfqq->entity.service is -+ * needed to properly decrement bfqq->entity.budget by the -+ * portion already consumed. In contrast, it is not necessary -+ * to keep entity->service for parent entities too, because -+ * the bubble up of the new value of bfqq->entity.budget will -+ * make sure that the budgets of parent entities are correct, -+ * even in case bfqq and thus parent entities go on receiving -+ * service with the same budget. -+ */ -+ entity = entity->parent; -+ for_each_entity(entity) -+ entity->service = 0; -+} -+ -+/* -+ * Budget timeout is not implemented through a dedicated timer, but -+ * just checked on request arrivals and completions, as well as on -+ * idle timer expirations. -+ */ -+static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) -+{ -+ return time_is_before_eq_jiffies(bfqq->budget_timeout); -+} -+ -+/* -+ * If we expire a queue that is actively waiting (i.e., with the -+ * device idled) for the arrival of a new request, then we may incur -+ * the timestamp misalignment problem described in the body of the -+ * function __bfq_activate_entity. Hence we return true only if this -+ * condition does not hold, or if the queue is slow enough to deserve -+ * only to be kicked off for preserving a high throughput. 
-+ */ -+static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) -+{ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "wait_request %d left %d timeout %d", -+ bfq_bfqq_wait_request(bfqq), -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, -+ bfq_bfqq_budget_timeout(bfqq)); -+ -+ return (!bfq_bfqq_wait_request(bfqq) || -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3) -+ && -+ bfq_bfqq_budget_timeout(bfqq); -+} -+ -+static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ bool rot_without_queueing = -+ !blk_queue_nonrot(bfqd->queue) && !bfqd->hw_tag, -+ bfqq_sequential_and_IO_bound, -+ idling_boosts_thr; -+ -+ bfqq_sequential_and_IO_bound = !BFQQ_SEEKY(bfqq) && -+ bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_has_short_ttime(bfqq); -+ /* -+ * The next variable takes into account the cases where idling -+ * boosts the throughput. -+ * -+ * The value of the variable is computed considering, first, that -+ * idling is virtually always beneficial for the throughput if: -+ * (a) the device is not NCQ-capable and rotational, or -+ * (b) regardless of the presence of NCQ, the device is rotational and -+ * the request pattern for bfqq is I/O-bound and sequential, or -+ * (c) regardless of whether it is rotational, the device is -+ * not NCQ-capable and the request pattern for bfqq is -+ * I/O-bound and sequential. -+ * -+ * Secondly, and in contrast to the above item (b), idling an -+ * NCQ-capable flash-based device would not boost the -+ * throughput even with sequential I/O; rather it would lower -+ * the throughput in proportion to how fast the device -+ * is. Accordingly, the next variable is true if any of the -+ * above conditions (a), (b) or (c) is true, and, in -+ * particular, happens to be false if bfqd is an NCQ-capable -+ * flash-based device. 
-+ */ -+ idling_boosts_thr = rot_without_queueing || -+ ((!blk_queue_nonrot(bfqd->queue) || !bfqd->hw_tag) && -+ bfqq_sequential_and_IO_bound); -+ -+ bfq_log_bfqq(bfqd, bfqq, "idling_boosts_thr %d", idling_boosts_thr); -+ -+ /* -+ * The return value of this function is equal to that of -+ * idling_boosts_thr, unless a special case holds. In this -+ * special case, described below, idling may cause problems to -+ * weight-raised queues. -+ * -+ * When the request pool is saturated (e.g., in the presence -+ * of write hogs), if the processes associated with -+ * non-weight-raised queues ask for requests at a lower rate, -+ * then processes associated with weight-raised queues have a -+ * higher probability to get a request from the pool -+ * immediately (or at least soon) when they need one. Thus -+ * they have a higher probability to actually get a fraction -+ * of the device throughput proportional to their high -+ * weight. This is especially true with NCQ-capable drives, -+ * which enqueue several requests in advance, and further -+ * reorder internally-queued requests. -+ * -+ * For this reason, we force to false the return value if -+ * there are weight-raised busy queues. In this case, and if -+ * bfqq is not weight-raised, this guarantees that the device -+ * is not idled for bfqq (if, instead, bfqq is weight-raised, -+ * then idling will be guaranteed by another variable, see -+ * below). Combined with the timestamping rules of BFQ (see -+ * [1] for details), this behavior causes bfqq, and hence any -+ * sync non-weight-raised queue, to get a lower number of -+ * requests served, and thus to ask for a lower number of -+ * requests from the request pool, before the busy -+ * weight-raised queues get served again. This often mitigates -+ * starvation problems in the presence of heavy write -+ * workloads and NCQ, thereby guaranteeing a higher -+ * application and system responsiveness in these hostile -+ * scenarios. 
-+ */ -+ return idling_boosts_thr && -+ bfqd->wr_busy_queues == 0; -+} -+ -+/* -+ * There is a case where idling must be performed not for -+ * throughput concerns, but to preserve service guarantees. -+ * -+ * To introduce this case, we can note that allowing the drive -+ * to enqueue more than one request at a time, and hence -+ * delegating de facto final scheduling decisions to the -+ * drive's internal scheduler, entails loss of control on the -+ * actual request service order. In particular, the critical -+ * situation is when requests from different processes happen -+ * to be present, at the same time, in the internal queue(s) -+ * of the drive. In such a situation, the drive, by deciding -+ * the service order of the internally-queued requests, does -+ * determine also the actual throughput distribution among -+ * these processes. But the drive typically has no notion or -+ * concern about per-process throughput distribution, and -+ * makes its decisions only on a per-request basis. Therefore, -+ * the service distribution enforced by the drive's internal -+ * scheduler is likely to coincide with the desired -+ * device-throughput distribution only in a completely -+ * symmetric scenario where: -+ * (i) each of these processes must get the same throughput as -+ * the others; -+ * (ii) the I/O of each process has the same properties, in -+ * terms of locality (sequential or random), direction -+ * (reads or writes), request sizes, greediness -+ * (from I/O-bound to sporadic), and so on. -+ * In fact, in such a scenario, the drive tends to treat -+ * the requests of each of these processes in about the same -+ * way as the requests of the others, and thus to provide -+ * each of these processes with about the same throughput -+ * (which is exactly the desired throughput distribution). 
In -+ * contrast, in any asymmetric scenario, device idling is -+ * certainly needed to guarantee that bfqq receives its -+ * assigned fraction of the device throughput (see [1] for -+ * details). -+ * The problem is that idling may significantly reduce -+ * throughput with certain combinations of types of I/O and -+ * devices. An important example is sync random I/O, on flash -+ * storage with command queueing. So, unless bfqq falls in the -+ * above cases where idling also boosts throughput, it would -+ * be important to check conditions (i) and (ii) accurately, -+ * so as to avoid idling when not strictly needed for service -+ * guarantees. -+ * -+ * Unfortunately, it is extremely difficult to thoroughly -+ * check condition (ii). And, in case there are active groups, -+ * it becomes very difficult to check condition (i) too. In -+ * fact, if there are active groups, then, for condition (i) -+ * to become false, it is enough that an active group contains -+ * more active processes or sub-groups than some other active -+ * group. More precisely, for condition (i) to become false because of -+ * such a group, it is not even necessary that the group is -+ * (still) active: it is sufficient that, even if the group -+ * has become inactive, some of its descendant processes still -+ * have some request already dispatched but still waiting for -+ * completion. In fact, requests have still to be guaranteed -+ * their share of the throughput even after being -+ * dispatched. In this respect, it is easy to show that, if a -+ * group frequently becomes inactive while still having -+ * in-flight requests, and if, when this happens, the group is -+ * not considered in the calculation of whether the scenario -+ * is asymmetric, then the group may fail to be guaranteed its -+ * fair share of the throughput (basically because idling may -+ * not be performed for the descendant processes of the group, -+ * but it had to be).
We address this issue with the -+ * following bi-modal behavior, implemented in the function -+ * bfq_symmetric_scenario(). -+ * -+ * If there are groups with requests waiting for completion -+ * (as commented above, some of these groups may even be -+ * already inactive), then the scenario is tagged as -+ * asymmetric, conservatively, without checking any of the -+ * conditions (i) and (ii). So the device is idled for bfqq. -+ * This behavior matches also the fact that groups are created -+ * exactly if controlling I/O is a primary concern (to -+ * preserve bandwidth and latency guarantees). -+ * -+ * On the opposite end, if there are no groups with requests -+ * waiting for completion, then only condition (i) is actually -+ * controlled, i.e., provided that condition (i) holds, idling -+ * is not performed, regardless of whether condition (ii) -+ * holds. In other words, only if condition (i) does not hold, -+ * then idling is allowed, and the device tends to be -+ * prevented from queueing many requests, possibly of several -+ * processes. Since there are no groups with requests waiting -+ * for completion, then, to control condition (i) it is enough -+ * to check just whether all the queues with requests waiting -+ * for completion also have the same weight. -+ * -+ * Not checking condition (ii) evidently exposes bfqq to the -+ * risk of getting less throughput than its fair share. -+ * However, for queues with the same weight, a further -+ * mechanism, preemption, mitigates or even eliminates this -+ * problem. And it does so without consequences on overall -+ * throughput. This mechanism and its benefits are explained -+ * in the next three paragraphs. -+ * -+ * Even if a queue, say Q, is expired when it remains idle, Q -+ * can still preempt the new in-service queue if the next -+ * request of Q arrives soon (see the comments on -+ * bfq_bfqq_update_budg_for_activation). 
If all queues and -+ * groups have the same weight, this form of preemption, -+ * combined with the hole-recovery heuristic described in the -+ * comments on function bfq_bfqq_update_budg_for_activation, -+ * is enough to preserve a correct bandwidth distribution in -+ * the mid term, even without idling. In fact, even if not -+ * idling allows the internal queues of the device to contain -+ * many requests, and thus to reorder requests, we can rather -+ * safely assume that the internal scheduler still preserves a -+ * minimum of mid-term fairness. -+ * -+ * More precisely, this preemption-based, idleless approach -+ * provides fairness in terms of IOPS, and not sectors per -+ * second. This can be seen with a simple example. Suppose -+ * that there are two queues with the same weight, but that -+ * the first queue receives requests of 8 sectors, while the -+ * second queue receives requests of 1024 sectors. In -+ * addition, suppose that each of the two queues contains at -+ * most one request at a time, which implies that each queue -+ * always remains idle after it is served. Finally, after -+ * remaining idle, each queue receives very quickly a new -+ * request. It follows that the two queues are served -+ * alternately, preempting each other if needed. This -+ * implies that, although both queues have the same weight, -+ * the queue with large requests receives a service that is -+ * 1024/8 times as high as the service received by the other -+ * queue. -+ * -+ * The motivation for using preemption instead of idling (for -+ * queues with the same weight) is that, by not idling, -+ * service guarantees are preserved (completely or at least in -+ * part) without sacrificing throughput in the least. And, if -+ * there is no active group, then the primary expectation for -+ * this device is probably a high throughput.
-+ * -+ * We are now left only with explaining the additional -+ * compound condition that is checked below for deciding -+ * whether the scenario is asymmetric. To explain this -+ * compound condition, we need to add that the function -+ * bfq_symmetric_scenario checks the weights of only -+ * non-weight-raised queues, for efficiency reasons (see -+ * comments on bfq_weights_tree_add()). Then the fact that -+ * bfqq is weight-raised is checked explicitly here. More -+ * precisely, the compound condition below takes into account -+ * also the fact that, even if bfqq is being weight-raised, -+ * the scenario is still symmetric if all queues with requests -+ * waiting for completion happen to be -+ * weight-raised. Actually, we should be even more precise -+ * here, and differentiate between interactive weight raising -+ * and soft real-time weight raising. -+ * -+ * As a side note, it is worth considering that the above -+ * device-idling countermeasures may however fail in the -+ * following unlucky scenario: if idling is (correctly) -+ * disabled in a time period during which all symmetry -+ * sub-conditions hold, and hence the device is allowed to -+ * enqueue many requests, but at some later point in time some -+ * sub-condition ceases to hold, then it may become impossible -+ * to let requests be served in the desired order until all -+ * the requests already queued in the device have been served.
-+ */ -+static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ bool asymmetric_scenario = (bfqq->wr_coeff > 1 && -+ bfqd->wr_busy_queues < -+ bfq_tot_busy_queues(bfqd)) || -+ !bfq_symmetric_scenario(bfqd); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "wr_coeff %d wr_busy %d busy %d asymmetric %d", -+ bfqq->wr_coeff, -+ bfqd->wr_busy_queues, -+ bfq_tot_busy_queues(bfqd), -+ asymmetric_scenario); -+ -+ return asymmetric_scenario; -+} -+ -+/* -+ * For a queue that becomes empty, device idling is allowed only if -+ * this function returns true for that queue. As a consequence, since -+ * device idling plays a critical role for both throughput boosting -+ * and service guarantees, the return value of this function plays a -+ * critical role as well. -+ * -+ * In a nutshell, this function returns true only if idling is -+ * beneficial for throughput or, even if detrimental for throughput, -+ * idling is however necessary to preserve service guarantees (low -+ * latency, desired throughput distribution, ...). In particular, on -+ * NCQ-capable devices, this function tries to return false, so as to -+ * help keep the drives' internal queues full, whenever this helps the -+ * device boost the throughput without causing any service-guarantee -+ * issue. -+ * -+ * Most of the issues taken into account to get the return value of -+ * this function are not trivial. We discuss these issues in the two -+ * functions providing the main pieces of information needed by this -+ * function. -+ */ -+static bool bfq_better_to_idle(struct bfq_queue *bfqq) -+{ -+ struct bfq_data *bfqd = bfqq->bfqd; -+ bool idling_boosts_thr_with_no_issue, idling_needed_for_service_guar; -+ -+ if (unlikely(bfqd->strict_guarantees)) -+ return true; -+ -+ /* -+ * Idling is performed only if slice_idle > 0. 
In addition, we -+ * do not idle if -+ * (a) bfqq is async -+ * (b) bfqq is in the idle io prio class: in this case we do -+ * not idle because we want to minimize the bandwidth that -+ * queues in this class can steal to higher-priority queues -+ */ -+ if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) || -+ bfq_class_idle(bfqq)) -+ return false; -+ -+ idling_boosts_thr_with_no_issue = -+ idling_boosts_thr_without_issues(bfqd, bfqq); -+ -+ idling_needed_for_service_guar = -+ idling_needed_for_service_guarantees(bfqd, bfqq); -+ -+ /* -+ * We have now the two components we need to compute the -+ * return value of the function, which is true only if idling -+ * either boosts the throughput (without issues), or is -+ * necessary to preserve service guarantees. -+ */ -+ bfq_log_bfqq(bfqd, bfqq, -+ "wr_busy %d boosts %d IO-bound %d guar %d", -+ bfqd->wr_busy_queues, -+ idling_boosts_thr_with_no_issue, -+ bfq_bfqq_IO_bound(bfqq), -+ idling_needed_for_service_guar); -+ -+ return idling_boosts_thr_with_no_issue || -+ idling_needed_for_service_guar; -+} -+ -+/* -+ * If the in-service queue is empty but the function bfq_better_to_idle -+ * returns true, then: -+ * 1) the queue must remain in service and cannot be expired, and -+ * 2) the device must be idled to wait for the possible arrival of a new -+ * request for the queue. -+ * See the comments on the function bfq_better_to_idle for the reasons -+ * why performing device idling is the best choice to boost the throughput -+ * and preserve service guarantees when bfq_better_to_idle itself -+ * returns true. 
-+ */ -+static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) -+{ -+ return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_better_to_idle(bfqq); -+} -+ -+static struct bfq_queue *bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq; -+ -+ /* -+ * A linear search; but, with a high probability, very few -+ * steps are needed to find a candidate queue, i.e., a queue -+ * with enough budget left for its next request. In fact: -+ * - BFQ dynamically updates the budget of every queue so as -+ * to accomodate the expected backlog of the queue; -+ * - if a queue gets all its requests dispatched as injected -+ * service, then the queue is removed from the active list -+ * (and re-added only if it gets new requests, but with -+ * enough budget for its new backlog). -+ */ -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) -+ if (!RB_EMPTY_ROOT(&bfqq->sort_list) && -+ bfq_serv_to_charge(bfqq->next_rq, bfqq) <= -+ bfq_bfqq_budget_left(bfqq)) { -+ bfq_log_bfqq(bfqd, bfqq, "returned this queue"); -+ return bfqq; -+ } -+ -+ bfq_log(bfqd, "no queue found"); -+ return NULL; -+} -+ -+/* -+ * Select a queue for service. If we have a current queue in service, -+ * check whether to continue servicing it, or retrieve and set a new one. -+ */ -+static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq; -+ struct request *next_rq; -+ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT; -+ -+ bfqq = bfqd->in_service_queue; -+ if (!bfqq) -+ goto new_queue; -+ -+ bfq_log_bfqq(bfqd, bfqq, "already in-service queue"); -+ -+ /* -+ * Do not expire bfqq for budget timeout if bfqq may be about -+ * to enjoy device idling. The reason why, in this case, we -+ * prevent bfqq from expiring is the same as in the comments -+ * on the case where bfq_bfqq_must_idle() returns true, in -+ * bfq_completed_request(). 
-+ */ -+ if (bfq_may_expire_for_budg_timeout(bfqq) && -+ !bfq_bfqq_must_idle(bfqq)) -+ goto expire; -+ -+check_queue: -+ /* -+ * This loop is rarely executed more than once. Even when it -+ * happens, it is much more convenient to re-execute this loop -+ * than to return NULL and trigger a new dispatch to get a -+ * request served. -+ */ -+ next_rq = bfqq->next_rq; -+ /* -+ * If bfqq has requests queued and it has enough budget left to -+ * serve them, keep the queue, otherwise expire it. -+ */ -+ if (next_rq) { -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ -+ if (bfq_serv_to_charge(next_rq, bfqq) > -+ bfq_bfqq_budget_left(bfqq)) { -+ /* -+ * Expire the queue for budget exhaustion, -+ * which makes sure that the next budget is -+ * enough to serve the next request, even if -+ * it comes from the fifo expired path. -+ */ -+ reason = BFQ_BFQQ_BUDGET_EXHAUSTED; -+ goto expire; -+ } else { -+ /* -+ * The idle timer may be pending because we may -+ * not disable disk idling even when a new request -+ * arrives. -+ */ -+ if (bfq_bfqq_wait_request(bfqq)) { -+ /* -+ * If we get here: 1) at least a new request -+ * has arrived but we have not disabled the -+ * timer because the request was too small, -+ * 2) then the block layer has unplugged -+ * the device, causing the dispatch to be -+ * invoked. -+ * -+ * Since the device is unplugged, now the -+ * requests are probably large enough to -+ * provide a reasonable throughput. -+ * So we disable idling. -+ */ -+ bfq_clear_bfqq_wait_request(bfqq); -+ hrtimer_try_to_cancel(&bfqd->idle_slice_timer); -+ } -+ goto keep_queue; -+ } -+ } -+ -+ /* -+ * No requests pending. However, if the in-service queue is idling -+ * for a new request, or has requests waiting for a completion and -+ * may idle after their completion, then keep it anyway. -+ * -+ * Yet, to boost throughput, inject service from other queues if -+ * possible. 
-+ */ -+ if (bfq_bfqq_wait_request(bfqq) || -+ (bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) { -+ if (bfq_bfqq_injectable(bfqq) && -+ bfqq->injected_service * bfqq->inject_coeff < -+ bfqq->entity.service * 10) { -+ bfq_log_bfqq(bfqd, bfqq, "looking for queue for injection"); -+ bfqq = bfq_choose_bfqq_for_injection(bfqd); -+ } else { -+ if (BFQQ_SEEKY(bfqq)) -+ bfq_log_bfqq(bfqd, bfqq, -+ "injection saturated %d * %d >= %d * 10", -+ bfqq->injected_service, bfqq->inject_coeff, -+ bfqq->entity.service); -+ bfqq = NULL; -+ } -+ goto keep_queue; -+ } -+ -+ reason = BFQ_BFQQ_NO_MORE_REQUESTS; -+expire: -+ bfq_bfqq_expire(bfqd, bfqq, false, reason); -+new_queue: -+ bfqq = bfq_set_in_service_queue(bfqd); -+ if (bfqq) { -+ bfq_log_bfqq(bfqd, bfqq, "checking new queue"); -+ goto check_queue; -+ } -+keep_queue: -+ if (bfqq) -+ bfq_log_bfqq(bfqd, bfqq, "returned this queue"); -+ else -+ bfq_log(bfqd, "no queue returned"); -+ -+ return bfqq; -+} -+ -+static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */ -+ BUG_ON(bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && -+ time_is_after_jiffies(bfqq->last_wr_start_finish)); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "raising period dur %u/%u msec, old coeff %u, w %d(%d)", -+ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), -+ jiffies_to_msecs(bfqq->wr_cur_max_time), -+ bfqq->wr_coeff, -+ bfqq->entity.weight, bfqq->entity.orig_weight); -+ -+ BUG_ON(bfqq != bfqd->in_service_queue && entity->weight != -+ entity->orig_weight * bfqq->wr_coeff); -+ if (entity->prio_changed) -+ bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change"); -+ -+ /* -+ * If the queue was activated in a burst, or too much -+ * time has elapsed from the beginning of this -+ * weight-raising period, then end weight raising. 
-+ */ -+ if (bfq_bfqq_in_large_burst(bfqq)) -+ bfq_bfqq_end_wr(bfqq); -+ else if (time_is_before_jiffies(bfqq->last_wr_start_finish + -+ bfqq->wr_cur_max_time)) { -+ if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time || -+ time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt + -+ bfq_wr_duration(bfqd))) -+ bfq_bfqq_end_wr(bfqq); -+ else { -+ switch_back_to_interactive_wr(bfqq, bfqd); -+ BUG_ON(time_is_after_jiffies( -+ bfqq->last_wr_start_finish)); -+ bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqd, bfqq, -+ "back to interactive wr"); -+ } -+ } -+ if (bfqq->wr_coeff > 1 && -+ bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time && -+ bfqq->service_from_wr > max_service_from_wr) { -+ /* see comments on max_service_from_wr */ -+ bfq_bfqq_end_wr(bfqq); -+ bfq_log_bfqq(bfqd, bfqq, -+ "too much service"); -+ } -+ } -+ /* -+ * To improve latency (for this or other queues), immediately -+ * update weight both if it must be raised and if it must be -+ * lowered. Since, entity may be on some active tree here, and -+ * might have a pending change of its ioprio class, invoke -+ * next function with the last parameter unset (see the -+ * comments on the function). -+ */ -+ if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1)) -+ __bfq_entity_update_weight_prio(bfq_entity_service_tree(entity), -+ entity, false); -+} -+ -+/* -+ * Dispatch next request from bfqq. 
-+ */ -+static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ struct request *rq = bfqq->next_rq; -+ unsigned long service_to_charge; -+ -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ BUG_ON(!rq); -+ service_to_charge = bfq_serv_to_charge(rq, bfqq); -+ -+ BUG_ON(service_to_charge > bfq_bfqq_budget_left(bfqq)); -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ -+ bfq_bfqq_served(bfqq, service_to_charge); -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ -+ bfq_dispatch_remove(bfqd->queue, rq); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "dispatched %u sec req (%llu), budg left %d, new disp_nr %d", -+ blk_rq_sectors(rq), -+ (unsigned long long) blk_rq_pos(rq), -+ bfq_bfqq_budget_left(bfqq), -+ bfqq->dispatched); -+ -+ if (bfqq != bfqd->in_service_queue) { -+ if (likely(bfqd->in_service_queue)) { -+ bfqd->in_service_queue->injected_service += -+ bfq_serv_to_charge(rq, bfqq); -+ bfq_log_bfqq(bfqd, bfqd->in_service_queue, -+ "injected_service increased to %d", -+ bfqd->in_service_queue->injected_service); -+ } -+ goto return_rq; -+ } -+ -+ /* -+ * If weight raising has to terminate for bfqq, then next -+ * function causes an immediate update of bfqq's weight, -+ * without waiting for next activation. As a consequence, on -+ * expiration, bfqq will be timestamped as if has never been -+ * weight-raised during this service slot, even if it has -+ * received part or even most of the service as a -+ * weight-raised queue. This inflates bfqq's timestamps, which -+ * is beneficial, as bfqq is then more willing to leave the -+ * device immediately to possible other weight-raised queues. -+ */ -+ bfq_update_wr_data(bfqd, bfqq); -+ -+ /* -+ * Expire bfqq, pretending that its budget expired, if bfqq -+ * belongs to CLASS_IDLE and other queues are waiting for -+ * service. 
-+ */ -+ if (!(bfq_tot_busy_queues(bfqd) > 1 && bfq_class_idle(bfqq))) -+ goto return_rq; -+ -+ bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_BUDGET_EXHAUSTED); -+ -+return_rq: -+ return rq; -+} -+ -+static bool bfq_has_work(struct blk_mq_hw_ctx *hctx) -+{ -+ struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; -+ -+ bfq_log(bfqd, "dispatch_non_empty %d busy_queues %d", -+ !list_empty_careful(&bfqd->dispatch), bfq_tot_busy_queues(bfqd) > 0); -+ -+ /* -+ * Avoiding lock: a race on bfqd->busy_queues should cause at -+ * most a call to dispatch for nothing -+ */ -+ return !list_empty_careful(&bfqd->dispatch) || -+ bfq_tot_busy_queues(bfqd) > 0; -+} -+ -+static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) -+{ -+ struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; -+ struct request *rq = NULL; -+ struct bfq_queue *bfqq = NULL; -+ -+ if (!list_empty(&bfqd->dispatch)) { -+ rq = list_first_entry(&bfqd->dispatch, struct request, -+ queuelist); -+ list_del_init(&rq->queuelist); -+ rq->rq_flags &= ~RQF_DISP_LIST; -+ -+ bfq_log(bfqd, -+ "picked %p from dispatch list", rq); -+ bfqq = RQ_BFQQ(rq); -+ -+ if (bfqq) { -+ /* -+ * Increment counters here, because this -+ * dispatch does not follow the standard -+ * dispatch flow (where counters are -+ * incremented) -+ */ -+ bfqq->dispatched++; -+ -+ /* -+ * TESTING: reset DISP_LIST flag, because: 1) -+ * this rq this request has passed through -+ * bfq_prepare_request, 2) then it will have -+ * bfq_finish_requeue_request invoked on it, and 3) in -+ * bfq_finish_requeue_request we use this flag to check -+ * that bfq_finish_requeue_request is not invoked on -+ * requests for which bfq_prepare_request has -+ * been invoked. -+ */ -+ rq->rq_flags &= ~RQF_DISP_LIST; -+ goto inc_in_driver_start_rq; -+ } -+ -+ /* -+ * We exploit the bfq_finish_requeue_request hook to decrement -+ * rq_in_driver, but bfq_finish_requeue_request will not be -+ * invoked on this request. 
So, to avoid unbalance, -+ * just start this request, without incrementing -+ * rq_in_driver. As a negative consequence, -+ * rq_in_driver is deceptively lower than it should be -+ * while this request is in service. This may cause -+ * bfq_schedule_dispatch to be invoked uselessly. -+ * -+ * As for implementing an exact solution, the -+ * bfq_finish_requeue_request hook, if defined, is probably -+ * invoked also on this request. So, by exploiting -+ * this hook, we could 1) increment rq_in_driver here, -+ * and 2) decrement it in bfq_finish_requeue_request. Such a -+ * solution would let the value of the counter be -+ * always accurate, but it would entail using an extra -+ * interface function. This cost seems higher than the -+ * benefit, being the frequency of non-elevator-private -+ * requests very low. -+ */ -+ goto start_rq; -+ } -+ -+ bfq_log(bfqd, "%d busy queues", bfq_tot_busy_queues(bfqd)); -+ -+ if (bfq_tot_busy_queues(bfqd) == 0) -+ goto exit; -+ -+ /* -+ * Force device to serve one request at a time if -+ * strict_guarantees is true. Forcing this service scheme is -+ * currently the ONLY way to guarantee that the request -+ * service order enforced by the scheduler is respected by a -+ * queueing device. Otherwise the device is free even to make -+ * some unlucky request wait for as long as the device -+ * wishes. -+ * -+ * Of course, serving one request at at time may cause loss of -+ * throughput. 
-+ */ -+ if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0) -+ goto exit; -+ -+ bfqq = bfq_select_queue(bfqd); -+ if (!bfqq) -+ goto exit; -+ -+ BUG_ON(bfqq == bfqd->in_service_queue && -+ bfqq->entity.budget < bfqq->entity.service); -+ -+ BUG_ON(bfqq == bfqd->in_service_queue && -+ bfq_bfqq_wait_request(bfqq)); -+ -+ rq = bfq_dispatch_rq_from_bfqq(bfqd, bfqq); -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ -+ if (rq) { -+ inc_in_driver_start_rq: -+ bfqd->rq_in_driver++; -+ start_rq: -+ rq->rq_flags |= RQF_STARTED; -+ if (bfqq) -+ bfq_log_bfqq(bfqd, bfqq, -+ "%s request %p, rq_in_driver %d", -+ bfq_bfqq_sync(bfqq) ? "sync" : "async", -+ rq, -+ bfqd->rq_in_driver); -+ else -+ bfq_log(bfqd, -+ "request %p from dispatch list, rq_in_driver %d", -+ rq, bfqd->rq_in_driver); -+ } else -+ bfq_log(bfqd, -+ "returned NULL request, rq_in_driver %d", -+ bfqd->rq_in_driver); -+ -+exit: -+ return rq; -+} -+ -+ -+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) -+static void bfq_update_dispatch_stats(struct request_queue *q, -+ struct request *rq, -+ struct bfq_queue *in_serv_queue, -+ bool idle_timer_disabled) -+{ -+ struct bfq_queue *bfqq = rq ? RQ_BFQQ(rq) : NULL; -+ -+ if (!idle_timer_disabled && !bfqq) -+ return; -+ -+ /* -+ * rq and bfqq are guaranteed to exist until this function -+ * ends, for the following reasons. First, rq can be -+ * dispatched to the device, and then can be completed and -+ * freed, only after this function ends. Second, rq cannot be -+ * merged (and thus freed because of a merge) any longer, -+ * because it has already started. Thus rq cannot be freed -+ * before this function ends, and, since rq has a reference to -+ * bfqq, the same guarantee holds for bfqq too. -+ * -+ * In addition, the following queue lock guarantees that -+ * bfqq_group(bfqq) exists as well. 
-+ */ -+ spin_lock_irq(q->queue_lock); -+ if (idle_timer_disabled) -+ /* -+ * Since the idle timer has been disabled, -+ * in_serv_queue contained some request when -+ * __bfq_dispatch_request was invoked above, which -+ * implies that rq was picked exactly from -+ * in_serv_queue. Thus in_serv_queue == bfqq, and is -+ * therefore guaranteed to exist because of the above -+ * arguments. -+ */ -+ bfqg_stats_update_idle_time(bfqq_group(in_serv_queue)); -+ if (bfqq) { -+ struct bfq_group *bfqg = bfqq_group(bfqq); -+ -+ bfqg_stats_update_avg_queue_size(bfqg); -+ bfqg_stats_set_start_empty_time(bfqg); -+ bfqg_stats_update_io_remove(bfqg, rq->cmd_flags); -+ } -+ spin_unlock_irq(q->queue_lock); -+} -+#else -+static inline void bfq_update_dispatch_stats(struct request_queue *q, -+ struct request *rq, -+ struct bfq_queue *in_serv_queue, -+ bool idle_timer_disabled) {} -+#endif -+static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) -+{ -+ struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; -+ struct request *rq; -+ struct bfq_queue *in_serv_queue; -+ bool waiting_rq, idle_timer_disabled; -+ -+ spin_lock_irq(&bfqd->lock); -+ -+ in_serv_queue = bfqd->in_service_queue; -+ waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); -+ -+ rq = __bfq_dispatch_request(hctx); -+ -+ idle_timer_disabled = -+ waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); -+ -+ spin_unlock_irq(&bfqd->lock); -+ -+ bfq_update_dispatch_stats(hctx->queue, rq, in_serv_queue, -+ idle_timer_disabled); -+ -+ return rq; -+} -+ -+/* -+ * Task holds one reference to the queue, dropped when task exits. Each rq -+ * in-flight on this queue also holds a reference, dropped when rq is freed. -+ * -+ * Scheduler lock must be held here. Recall not to use bfqq after calling -+ * this function on it. 
-+ */ -+static void bfq_put_queue(struct bfq_queue *bfqq) -+{ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ struct bfq_group *bfqg = bfqq_group(bfqq); -+#endif -+ -+ assert_spin_locked(&bfqq->bfqd->lock); -+ -+ BUG_ON(bfqq->ref <= 0); -+ -+ if (bfqq->bfqd) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "%p %d", bfqq, bfqq->ref); -+ -+ bfqq->ref--; -+ if (bfqq->ref) -+ return; -+ -+ BUG_ON(rb_first(&bfqq->sort_list)); -+ BUG_ON(bfqq->allocated != 0); -+ BUG_ON(bfqq->entity.tree); -+ BUG_ON(bfq_bfqq_busy(bfqq)); -+ -+ if (!hlist_unhashed(&bfqq->burst_list_node)) { -+ hlist_del_init(&bfqq->burst_list_node); -+ /* -+ * Decrement also burst size after the removal, if the -+ * process associated with bfqq is exiting, and thus -+ * does not contribute to the burst any longer. This -+ * decrement helps filter out false positives of large -+ * bursts, when some short-lived process (often due to -+ * the execution of commands by some service) happens -+ * to start and exit while a complex application is -+ * starting, and thus spawning several processes that -+ * do I/O (and that *must not* be treated as a large -+ * burst, see comments on bfq_handle_burst). -+ * -+ * In particular, the decrement is performed only if: -+ * 1) bfqq is not a merged queue, because, if it is, -+ * then this free of bfqq is not triggered by the exit -+ * of the process bfqq is associated with, but exactly -+ * by the fact that bfqq has just been merged. -+ * 2) burst_size is greater than 0, to handle -+ * unbalanced decrements. Unbalanced decrements may -+ * happen in te following case: bfqq is inserted into -+ * the current burst list--without incrementing -+ * bust_size--because of a split, but the current -+ * burst list is not the burst list bfqq belonged to -+ * (see comments on the case of a split in -+ * bfq_set_request). 
-+ */ -+ if (bfqq->bic && bfqq->bfqd->burst_size > 0) -+ bfqq->bfqd->burst_size--; -+ } -+ -+ if (bfqq->bfqd) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "%p freed", bfqq); -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "putting blkg and bfqg %p\n", bfqg); -+ bfqg_and_blkg_put(bfqg); -+#endif -+ kmem_cache_free(bfq_pool, bfqq); -+} -+ -+static void bfq_put_cooperator(struct bfq_queue *bfqq) -+{ -+ struct bfq_queue *__bfqq, *next; -+ -+ /* -+ * If this queue was scheduled to merge with another queue, be -+ * sure to drop the reference taken on that queue (and others in -+ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs. -+ */ -+ __bfqq = bfqq->new_bfqq; -+ while (__bfqq) { -+ if (__bfqq == bfqq) -+ break; -+ next = __bfqq->new_bfqq; -+ bfq_put_queue(__bfqq); -+ __bfqq = next; -+ } -+} -+ -+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ if (bfqq == bfqd->in_service_queue) { -+ __bfq_bfqq_expire(bfqd, bfqq); -+ bfq_schedule_dispatch(bfqd); -+ } -+ -+ bfq_log_bfqq(bfqd, bfqq, "%p, %d", bfqq, bfqq->ref); -+ -+ bfq_put_cooperator(bfqq); -+ -+ bfq_put_queue(bfqq); /* release process reference */ -+} -+ -+static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) -+{ -+ struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync); -+ struct bfq_data *bfqd; -+ -+ if (bfqq) -+ bfqd = bfqq->bfqd; /* NULL if scheduler already exited */ -+ -+ if (bfqq && bfqd) { -+ unsigned long flags; -+ -+ spin_lock_irqsave(&bfqd->lock, flags); -+ bfq_exit_bfqq(bfqd, bfqq); -+ bic_set_bfqq(bic, NULL, is_sync); -+ spin_unlock_irqrestore(&bfqd->lock, flags); -+ } -+} -+ -+static void bfq_exit_icq(struct io_cq *icq) -+{ -+ struct bfq_io_cq *bic = icq_to_bic(icq); -+ -+ BUG_ON(!bic); -+ bfq_exit_icq_bfqq(bic, true); -+ bfq_exit_icq_bfqq(bic, false); -+} -+ -+/* -+ * Update the entity prio values; note that the new values will not -+ * be used until the next (re)activation. 
-+ */ -+static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, -+ struct bfq_io_cq *bic) -+{ -+ struct task_struct *tsk = current; -+ int ioprio_class; -+ struct bfq_data *bfqd = bfqq->bfqd; -+ -+ WARN_ON(!bfqd); -+ if (!bfqd) -+ return; -+ -+ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); -+ switch (ioprio_class) { -+ default: -+ dev_err(bfqq->bfqd->queue->backing_dev_info->dev, -+ "bfq: bad prio class %d\n", ioprio_class); -+ case IOPRIO_CLASS_NONE: -+ /* -+ * No prio set, inherit CPU scheduling settings. -+ */ -+ bfqq->new_ioprio = task_nice_ioprio(tsk); -+ bfqq->new_ioprio_class = task_nice_ioclass(tsk); -+ break; -+ case IOPRIO_CLASS_RT: -+ bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); -+ bfqq->new_ioprio_class = IOPRIO_CLASS_RT; -+ break; -+ case IOPRIO_CLASS_BE: -+ bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); -+ bfqq->new_ioprio_class = IOPRIO_CLASS_BE; -+ break; -+ case IOPRIO_CLASS_IDLE: -+ bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE; -+ bfqq->new_ioprio = 7; -+ break; -+ } -+ -+ if (bfqq->new_ioprio >= IOPRIO_BE_NR) { -+ pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n", -+ bfqq->new_ioprio); -+ BUG(); -+ } -+ -+ bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); -+ bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "bic_class %d prio %d class %d", -+ ioprio_class, bfqq->new_ioprio, bfqq->new_ioprio_class); -+} -+ -+static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) -+{ -+ struct bfq_data *bfqd = bic_to_bfqd(bic); -+ struct bfq_queue *bfqq; -+ unsigned long uninitialized_var(flags); -+ int ioprio = bic->icq.ioc->ioprio; -+ -+ /* -+ * This condition may trigger on a newly created bic, be sure to -+ * drop the lock before returning. 
-+ */ -+ if (unlikely(!bfqd) || likely(bic->ioprio == ioprio)) -+ return; -+ -+ bic->ioprio = ioprio; -+ -+ bfqq = bic_to_bfqq(bic, false); -+ if (bfqq) { -+ /* release process reference on this queue */ -+ bfq_put_queue(bfqq); -+ bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); -+ bic_set_bfqq(bic, bfqq, false); -+ bfq_log_bfqq(bfqd, bfqq, -+ "bfqq %p %d", -+ bfqq, bfqq->ref); -+ } -+ -+ bfqq = bic_to_bfqq(bic, true); -+ if (bfqq) -+ bfq_set_next_ioprio_data(bfqq, bic); -+} -+ -+static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct bfq_io_cq *bic, pid_t pid, int is_sync) -+{ -+ RB_CLEAR_NODE(&bfqq->entity.rb_node); -+ INIT_LIST_HEAD(&bfqq->fifo); -+ INIT_HLIST_NODE(&bfqq->burst_list_node); -+ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); -+ -+ bfqq->ref = 0; -+ bfqq->bfqd = bfqd; -+ -+ if (bic) -+ bfq_set_next_ioprio_data(bfqq, bic); -+ -+ if (is_sync) { -+ /* -+ * No need to mark as has_short_ttime if in -+ * idle_class, because no device idling is performed -+ * for queues in idle class -+ */ -+ if (!bfq_class_idle(bfqq)) -+ /* tentatively mark as has_short_ttime */ -+ bfq_mark_bfqq_has_short_ttime(bfqq); -+ bfq_mark_bfqq_sync(bfqq); -+ bfq_mark_bfqq_just_created(bfqq); -+ /* -+ * Aggressively inject a lot of service: up to 90%. -+ * This coefficient remains constant during bfqq life, -+ * but this behavior might be changed, after enough -+ * testing and tuning. 
-+ */ -+ bfqq->inject_coeff = 1; -+ } else -+ bfq_clear_bfqq_sync(bfqq); -+ -+ bfqq->ttime.last_end_request = ktime_get_ns() - (1ULL<<32); -+ -+ bfq_mark_bfqq_IO_bound(bfqq); -+ -+ /* Tentative initial value to trade off between thr and lat */ -+ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3; -+ bfqq->pid = pid; -+ -+ bfqq->wr_coeff = 1; -+ bfqq->last_wr_start_finish = jiffies; -+ bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now(); -+ bfqq->budget_timeout = bfq_smallest_from_now(); -+ bfqq->split_time = bfq_smallest_from_now(); -+ -+ /* -+ * To not forget the possibly high bandwidth consumed by a -+ * process/queue in the recent past, -+ * bfq_bfqq_softrt_next_start() returns a value at least equal -+ * to the current value of bfqq->soft_rt_next_start (see -+ * comments on bfq_bfqq_softrt_next_start). Set -+ * soft_rt_next_start to now, to mean that bfqq has consumed -+ * no bandwidth so far. -+ */ -+ bfqq->soft_rt_next_start = jiffies; -+ -+ /* first request is almost certainly seeky */ -+ bfqq->seek_history = 1; -+} -+ -+static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, -+ struct bfq_group *bfqg, -+ int ioprio_class, int ioprio) -+{ -+ switch (ioprio_class) { -+ case IOPRIO_CLASS_RT: -+ return &bfqg->async_bfqq[0][ioprio]; -+ case IOPRIO_CLASS_NONE: -+ ioprio = IOPRIO_NORM; -+ /* fall through */ -+ case IOPRIO_CLASS_BE: -+ return &bfqg->async_bfqq[1][ioprio]; -+ case IOPRIO_CLASS_IDLE: -+ return &bfqg->async_idle_bfqq; -+ default: -+ BUG(); -+ } -+} -+ -+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, -+ struct bio *bio, bool is_sync, -+ struct bfq_io_cq *bic) -+{ -+ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); -+ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); -+ struct bfq_queue **async_bfqq = NULL; -+ struct bfq_queue *bfqq; -+ struct bfq_group *bfqg; -+ -+ rcu_read_lock(); -+ -+ bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio)); -+ if (!bfqg) { -+ bfqq = &bfqd->oom_bfqq; -+ goto out; -+ } -+ -+ if 
(!is_sync) { -+ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, -+ ioprio); -+ bfqq = *async_bfqq; -+ if (bfqq) -+ goto out; -+ } -+ -+ bfqq = kmem_cache_alloc_node(bfq_pool, -+ GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN, -+ bfqd->queue->node); -+ -+ if (bfqq) { -+ bfq_init_bfqq(bfqd, bfqq, bic, current->pid, -+ is_sync); -+ bfq_init_entity(&bfqq->entity, bfqg); -+ bfq_log_bfqq(bfqd, bfqq, "allocated"); -+ } else { -+ bfqq = &bfqd->oom_bfqq; -+ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); -+ goto out; -+ } -+ -+ /* -+ * Pin the queue now that it's allocated, scheduler exit will -+ * prune it. -+ */ -+ if (async_bfqq) { -+ bfqq->ref++; /* -+ * Extra group reference, w.r.t. sync -+ * queue. This extra reference is removed -+ * only if bfqq->bfqg disappears, to -+ * guarantee that this queue is not freed -+ * until its group goes away. -+ */ -+ bfq_log_bfqq(bfqd, bfqq, "bfqq not in async: %p, %d", -+ bfqq, bfqq->ref); -+ *async_bfqq = bfqq; -+ } -+ -+out: -+ bfqq->ref++; /* get a process reference to this queue */ -+ bfq_log_bfqq(bfqd, bfqq, "at end: %p, %d", bfqq, bfqq->ref); -+ rcu_read_unlock(); -+ return bfqq; -+} -+ -+static void bfq_update_io_thinktime(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ struct bfq_ttime *ttime = &bfqq->ttime; -+ u64 elapsed = ktime_get_ns() - bfqq->ttime.last_end_request; -+ -+ elapsed = min_t(u64, elapsed, 2 * bfqd->bfq_slice_idle); -+ -+ ttime->ttime_samples = (7*bfqq->ttime.ttime_samples + 256) / 8; -+ ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8); -+ ttime->ttime_mean = div64_ul(ttime->ttime_total + 128, -+ ttime->ttime_samples); -+} -+ -+static void -+bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct request *rq) -+{ -+ bfqq->seek_history <<= 1; -+ bfqq->seek_history |= BFQ_RQ_SEEKY(bfqd, bfqq->last_request_pos, rq); -+} -+ -+static void bfq_update_has_short_ttime(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ struct bfq_io_cq *bic) -+{ -+ bool 
has_short_ttime = true; -+ -+ /* -+ * No need to update has_short_ttime if bfqq is async or in -+ * idle io prio class, or if bfq_slice_idle is zero, because -+ * no device idling is performed for bfqq in this case. -+ */ -+ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq) || -+ bfqd->bfq_slice_idle == 0) -+ return; -+ -+ /* Idle window just restored, statistics are meaningless. */ -+ if (time_is_after_eq_jiffies(bfqq->split_time + -+ bfqd->bfq_wr_min_idle_time)) -+ return; -+ -+ /* Think time is infinite if no process is linked to -+ * bfqq. Otherwise check average think time to -+ * decide whether to mark as has_short_ttime -+ */ -+ if (atomic_read(&bic->icq.ioc->active_ref) == 0 || -+ (bfq_sample_valid(bfqq->ttime.ttime_samples) && -+ bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle)) -+ has_short_ttime = false; -+ -+ bfq_log_bfqq(bfqd, bfqq, "has_short_ttime %d", -+ has_short_ttime); -+ -+ if (has_short_ttime) -+ bfq_mark_bfqq_has_short_ttime(bfqq); -+ else -+ bfq_clear_bfqq_has_short_ttime(bfqq); -+} -+ -+/* -+ * Called when a new fs request (rq) is added to bfqq. Check if there's -+ * something we should do about it. 
-+ */ -+static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct request *rq) -+{ -+ struct bfq_io_cq *bic = RQ_BIC(rq); -+ -+ if (rq->cmd_flags & REQ_META) -+ bfqq->meta_pending++; -+ -+ bfq_update_io_thinktime(bfqd, bfqq); -+ bfq_update_has_short_ttime(bfqd, bfqq, bic); -+ bfq_update_io_seektime(bfqd, bfqq, rq); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "has_short_ttime=%d (seeky %d)", -+ bfq_bfqq_has_short_ttime(bfqq), BFQQ_SEEKY(bfqq)); -+ -+ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); -+ -+ if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) { -+ bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 && -+ blk_rq_sectors(rq) < 32; -+ bool budget_timeout = bfq_bfqq_budget_timeout(bfqq); -+ -+ /* -+ * There is just this request queued: if -+ * - the request is small, and -+ * - we are idling to boost throughput, and -+ * - the queue is not to be expired, -+ * then just exit. -+ * -+ * In this way, if the device is being idled to wait -+ * for a new request from the in-service queue, we -+ * avoid unplugging the device and committing the -+ * device to serve just a small request. In contrast -+ * we wait for the block layer to decide when to -+ * unplug the device: hopefully, new requests will be -+ * merged to this one quickly, then the device will be -+ * unplugged and larger requests will be dispatched. -+ */ -+ if (small_req && idling_boosts_thr_without_issues(bfqd, bfqq) && -+ !budget_timeout) -+ return; -+ -+ /* -+ * A large enough request arrived, or idling is being -+ * performed to preserve service guarantees, or -+ * finally the queue is to be expired: in all these -+ * cases disk idling is to be stopped, so clear -+ * wait_request flag and reset timer. -+ */ -+ bfq_clear_bfqq_wait_request(bfqq); -+ hrtimer_try_to_cancel(&bfqd->idle_slice_timer); -+ -+ /* -+ * The queue is not empty, because a new request just -+ * arrived. 
Hence we can safely expire the queue, in -+ * case of budget timeout, without risking that the -+ * timestamps of the queue are not updated correctly. -+ * See [1] for more details. -+ */ -+ if (budget_timeout) -+ bfq_bfqq_expire(bfqd, bfqq, false, -+ BFQ_BFQQ_BUDGET_TIMEOUT); -+ } -+} -+ -+/* returns true if it causes the idle timer to be disabled */ -+static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; -+ bool waiting, idle_timer_disabled = false; -+ BUG_ON(!bfqq); -+ -+ assert_spin_locked(&bfqd->lock); -+ -+ bfq_log_bfqq(bfqd, bfqq, "rq %p bfqq %p", rq, bfqq); -+ -+ /* -+ * An unplug may trigger a requeue of a request from the device -+ * driver: make sure we are in process context while trying to -+ * merge two bfq_queues. -+ */ -+ if (!in_interrupt()) { -+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true); -+ if (new_bfqq) { -+ BUG_ON(bic_to_bfqq(RQ_BIC(rq), 1) != bfqq); -+ /* -+ * Release the request's reference to the old bfqq -+ * and make sure one is taken to the shared queue. -+ */ -+ new_bfqq->allocated++; -+ bfqq->allocated--; -+ bfq_log_bfqq(bfqd, bfqq, -+ "new allocated %d", bfqq->allocated); -+ bfq_log_bfqq(bfqd, new_bfqq, -+ "new_bfqq new allocated %d", -+ bfqq->allocated); -+ -+ new_bfqq->ref++; -+ /* -+ * If the bic associated with the process -+ * issuing this request still points to bfqq -+ * (and thus has not been already redirected -+ * to new_bfqq or even some other bfq_queue), -+ * then complete the merge and redirect it to -+ * new_bfqq. 
-+ */ -+ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) -+ bfq_merge_bfqqs(bfqd, RQ_BIC(rq), -+ bfqq, new_bfqq); -+ -+ bfq_clear_bfqq_just_created(bfqq); -+ /* -+ * rq is about to be enqueued into new_bfqq, -+ * release rq reference on bfqq -+ */ -+ bfq_put_queue(bfqq); -+ rq->elv.priv[1] = new_bfqq; -+ bfqq = new_bfqq; -+ } -+ } -+ -+ waiting = bfqq && bfq_bfqq_wait_request(bfqq); -+ bfq_add_request(rq); -+ idle_timer_disabled = waiting && !bfq_bfqq_wait_request(bfqq); -+ -+ rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; -+ list_add_tail(&rq->queuelist, &bfqq->fifo); -+ -+ bfq_rq_enqueued(bfqd, bfqq, rq); -+ -+ return idle_timer_disabled; -+} -+ -+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) -+static void bfq_update_insert_stats(struct request_queue *q, -+ struct bfq_queue *bfqq, -+ bool idle_timer_disabled, -+ unsigned int cmd_flags) -+{ -+ if (!bfqq) -+ return; -+ -+ /* -+ * bfqq still exists, because it can disappear only after -+ * either it is merged with another queue, or the process it -+ * is associated with exits. But both actions must be taken by -+ * the same process currently executing this flow of -+ * instructions. -+ * -+ * In addition, the following queue lock guarantees that -+ * bfqq_group(bfqq) exists as well. 
-+ */ -+ spin_lock_irq(q->queue_lock); -+ bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags); -+ if (idle_timer_disabled) -+ bfqg_stats_update_idle_time(bfqq_group(bfqq)); -+ spin_unlock_irq(q->queue_lock); -+} -+#else -+static inline void bfq_update_insert_stats(struct request_queue *q, -+ struct bfq_queue *bfqq, -+ bool idle_timer_disabled, -+ unsigned int cmd_flags) {} -+#endif -+ -+static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, -+ bool at_head) -+{ -+ struct request_queue *q = hctx->queue; -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct bfq_queue *bfqq; -+ bool idle_timer_disabled = false; -+ unsigned int cmd_flags; -+ -+ spin_lock_irq(&bfqd->lock); -+ if (blk_mq_sched_try_insert_merge(q, rq)) { -+ spin_unlock_irq(&bfqd->lock); -+ return; -+ } -+ -+ spin_unlock_irq(&bfqd->lock); -+ -+ blk_mq_sched_request_inserted(rq); -+ -+ spin_lock_irq(&bfqd->lock); -+ -+ bfqq = bfq_init_rq(rq); -+ BUG_ON(!bfqq && !(at_head || blk_rq_is_passthrough(rq))); -+ BUG_ON(bfqq && bic_to_bfqq(RQ_BIC(rq), rq_is_sync(rq)) != bfqq); -+ -+ if (at_head || blk_rq_is_passthrough(rq)) { -+ if (at_head) -+ list_add(&rq->queuelist, &bfqd->dispatch); -+ else -+ list_add_tail(&rq->queuelist, &bfqd->dispatch); -+ -+ rq->rq_flags |= RQF_DISP_LIST; -+ if (bfqq) -+ bfq_log_bfqq(bfqd, bfqq, -+ "%p in disp: at_head %d", -+ rq, at_head); -+ else -+ bfq_log(bfqd, -+ "%p in disp: at_head %d", -+ rq, at_head); -+ } else { /* bfqq is assumed to be non null here */ -+ BUG_ON(!bfqq); -+ BUG_ON(!(rq->rq_flags & RQF_GOT)); -+ rq->rq_flags &= ~RQF_GOT; -+ -+ idle_timer_disabled = __bfq_insert_request(bfqd, rq); -+ /* -+ * Update bfqq, because, if a queue merge has occurred -+ * in __bfq_insert_request, then rq has been -+ * redirected into a new queue. 
-+ */ -+ bfqq = RQ_BFQQ(rq); -+ -+ if (rq_mergeable(rq)) { -+ elv_rqhash_add(q, rq); -+ if (!q->last_merge) -+ q->last_merge = rq; -+ } -+ } -+ -+ /* -+ * Cache cmd_flags before releasing scheduler lock, because rq -+ * may disappear afterwards (for example, because of a request -+ * merge). -+ */ -+ cmd_flags = rq->cmd_flags; -+ -+ spin_unlock_irq(&bfqd->lock); -+ bfq_update_insert_stats(q, bfqq, idle_timer_disabled, -+ cmd_flags); -+} -+ -+static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, -+ struct list_head *list, bool at_head) -+{ -+ while (!list_empty(list)) { -+ struct request *rq; -+ -+ rq = list_first_entry(list, struct request, queuelist); -+ list_del_init(&rq->queuelist); -+ bfq_insert_request(hctx, rq, at_head); -+ } -+} -+ -+static void bfq_update_hw_tag(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq = bfqd->in_service_queue; -+ -+ bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver, -+ bfqd->rq_in_driver); -+ -+ if (bfqd->hw_tag == 1) -+ return; -+ -+ /* -+ * This sample is valid if the number of outstanding requests -+ * is large enough to allow a queueing behavior. Note that the -+ * sum is not exact, as it's not taking into account deactivated -+ * requests. -+ */ -+ if (bfqd->rq_in_driver + bfqd->queued <= BFQ_HW_QUEUE_THRESHOLD) -+ return; -+ -+ /* -+ * If active queue hasn't enough requests and can idle, bfq might not -+ * dispatch sufficient requests to hardware. 
Don't zero hw_tag in this -+ * case -+ */ -+ if (bfqq && bfq_bfqq_has_short_ttime(bfqq) && -+ bfqq->dispatched + bfqq->queued[0] + bfqq->queued[1] < -+ BFQ_HW_QUEUE_THRESHOLD && bfqd->rq_in_driver < BFQ_HW_QUEUE_THRESHOLD) -+ return; -+ -+ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) -+ return; -+ -+ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD; -+ bfqd->max_rq_in_driver = 0; -+ bfqd->hw_tag_samples = 0; -+} -+ -+static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) -+{ -+ u64 now_ns; -+ u32 delta_us; -+ -+ bfq_update_hw_tag(bfqd); -+ -+ BUG_ON(!bfqd->rq_in_driver); -+ BUG_ON(!bfqq->dispatched); -+ bfqd->rq_in_driver--; -+ bfqq->dispatched--; -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "new disp %d, new rq_in_driver %d", -+ bfqq->dispatched, bfqd->rq_in_driver); -+ -+ if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); -+ /* -+ * Set budget_timeout (which we overload to store the -+ * time at which the queue remains with no backlog and -+ * no outstanding request; used by the weight-raising -+ * mechanism). -+ */ -+ bfqq->budget_timeout = jiffies; -+ -+ bfq_weights_tree_remove(bfqd, bfqq); -+ } -+ -+ now_ns = ktime_get_ns(); -+ -+ bfqq->ttime.last_end_request = now_ns; -+ -+ /* -+ * Using us instead of ns, to get a reasonable precision in -+ * computing rate in next check. -+ */ -+ delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "delta %uus/%luus max_size %u rate %llu/%llu", -+ delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size, -+ delta_us > 0 ? 
-+ (USEC_PER_SEC* -+ (u64)((bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us)) -+ >>BFQ_RATE_SHIFT : -+ (USEC_PER_SEC* -+ (u64)(bfqd->last_rq_max_size<<BFQ_RATE_SHIFT))>>BFQ_RATE_SHIFT, -+ (USEC_PER_SEC*(u64)(1UL<<(BFQ_RATE_SHIFT-10)))>>BFQ_RATE_SHIFT); -+ -+ /* -+ * If the request took rather long to complete, and, according -+ * to the maximum request size recorded, this completion latency -+ * implies that the request was certainly served at a very low -+ * rate (less than 1M sectors/sec), then the whole observation -+ * interval that lasts up to this time instant cannot be a -+ * valid time interval for computing a new peak rate. Invoke -+ * bfq_update_rate_reset to have the following three steps -+ * taken: -+ * - close the observation interval at the last (previous) -+ * request dispatch or completion -+ * - compute rate, if possible, for that observation interval -+ * - reset to zero samples, which will trigger a proper -+ * re-initialization of the observation interval on next -+ * dispatch -+ */ -+ if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC && -+ (bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us < -+ 1UL<<(BFQ_RATE_SHIFT - 10)) -+ bfq_update_rate_reset(bfqd, NULL); -+ bfqd->last_completion = now_ns; -+ -+ /* -+ * If we are waiting to discover whether the request pattern -+ * of the task associated with the queue is actually -+ * isochronous, and both requisites for this condition to hold -+ * are now satisfied, then compute soft_rt_next_start (see the -+ * comments on the function bfq_bfqq_softrt_next_start()). We -+ * do not compute soft_rt_next_start if bfqq is in interactive -+ * weight raising (see the comments in bfq_bfqq_expire() for -+ * an explanation). We schedule this delayed update when bfqq -+ * expires, if it still has in-flight requests. 
-+ */ -+ if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 && -+ RB_EMPTY_ROOT(&bfqq->sort_list) && -+ bfqq->wr_coeff != bfqd->bfq_wr_coeff) -+ bfqq->soft_rt_next_start = -+ bfq_bfqq_softrt_next_start(bfqd, bfqq); -+ -+ /* -+ * If this is the in-service queue, check if it needs to be expired, -+ * or if we want to idle in case it has no pending requests. -+ */ -+ if (bfqd->in_service_queue == bfqq) { -+ if (bfq_bfqq_must_idle(bfqq)) { -+ if (bfqq->dispatched == 0) -+ bfq_arm_slice_timer(bfqd); -+ /* -+ * If we get here, we do not expire bfqq, even -+ * if bfqq was in budget timeout or had no -+ * more requests (as controlled in the next -+ * conditional instructions). The reason for -+ * not expiring bfqq is as follows. -+ * -+ * Here bfqq->dispatched > 0 holds, but -+ * bfq_bfqq_must_idle() returned true. This -+ * implies that, even if no request arrives -+ * for bfqq before bfqq->dispatched reaches 0, -+ * bfqq will, however, not be expired on the -+ * completion event that causes bfqq->dispatch -+ * to reach zero. In contrast, on this event, -+ * bfqq will start enjoying device idling -+ * (I/O-dispatch plugging). -+ * -+ * But, if we expired bfqq here, bfqq would -+ * not have the chance to enjoy device idling -+ * when bfqq->dispatched finally reaches -+ * zero. This would expose bfqq to violation -+ * of its reserved service guarantees. -+ */ -+ return; -+ } else if (bfq_may_expire_for_budg_timeout(bfqq)) -+ bfq_bfqq_expire(bfqd, bfqq, false, -+ BFQ_BFQQ_BUDGET_TIMEOUT); -+ else if (RB_EMPTY_ROOT(&bfqq->sort_list) && -+ (bfqq->dispatched == 0 || -+ !bfq_better_to_idle(bfqq))) -+ bfq_bfqq_expire(bfqd, bfqq, false, -+ BFQ_BFQQ_NO_MORE_REQUESTS); -+ } -+} -+ -+static void bfq_finish_requeue_request_body(struct bfq_queue *bfqq) -+{ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "allocated %d", bfqq->allocated); -+ BUG_ON(!bfqq->allocated); -+ bfqq->allocated--; -+ -+ bfq_put_queue(bfqq); -+} -+ -+/* -+ * Handle either a requeue or a finish for rq. 
The things to do are -+ * the same in both cases: all references to rq are to be dropped. In -+ * particular, rq is considered completed from the point of view of -+ * the scheduler. -+ */ -+static void bfq_finish_requeue_request(struct request *rq) -+{ -+ struct bfq_queue *bfqq; -+ struct bfq_data *bfqd; -+ struct bfq_io_cq *bic; -+ -+ BUG_ON(!rq); -+ -+ bfqq = RQ_BFQQ(rq); -+ -+ /* -+ * Requeue and finish hooks are invoked in blk-mq without -+ * checking whether the involved request is actually still -+ * referenced in the scheduler. To handle this fact, the -+ * following two checks make this function exit in case of -+ * spurious invocations, for which there is nothing to do. -+ * -+ * First, check whether rq has nothing to do with an elevator. -+ */ -+ if (unlikely(!(rq->rq_flags & RQF_ELVPRIV))) -+ return; -+ -+ /* -+ * rq either is not associated with any icq, or is an already -+ * requeued request that has not (yet) been re-inserted into -+ * a bfq_queue. -+ */ -+ if (!rq->elv.icq || !bfqq) -+ return; -+ -+ bic = RQ_BIC(rq); -+ BUG_ON(!bic); -+ -+ bfqd = bfqq->bfqd; -+ BUG_ON(!bfqd); -+ -+ if (rq->rq_flags & RQF_DISP_LIST) { -+ pr_crit("putting disp rq %p for %d", rq, bfqq->pid); -+ BUG(); -+ } -+ BUG_ON(rq->rq_flags & RQF_QUEUED); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "putting rq %p with %u sects left, STARTED %d", -+ rq, blk_rq_sectors(rq), -+ rq->rq_flags & RQF_STARTED); -+ -+ if (rq->rq_flags & RQF_STARTED) -+ bfqg_stats_update_completion(bfqq_group(bfqq), -+ rq->start_time_ns, -+ rq->io_start_time_ns, -+ rq->cmd_flags); -+ -+ WARN_ON(blk_rq_sectors(rq) == 0 && !(rq->rq_flags & RQF_STARTED)); -+ -+ if (likely(rq->rq_flags & RQF_STARTED)) { -+ unsigned long flags; -+ -+ spin_lock_irqsave(&bfqd->lock, flags); -+ -+ bfq_completed_request(bfqq, bfqd); -+ bfq_finish_requeue_request_body(bfqq); -+ -+ spin_unlock_irqrestore(&bfqd->lock, flags); -+ } else { -+ /* -+ * Request rq may be still/already in the scheduler, -+ * in which case we need to remove it (this 
should -+ * never happen in case of requeue). And we cannot -+ * defer such a check and removal, to avoid -+ * inconsistencies in the time interval from the end -+ * of this function to the start of the deferred work. -+ * This situation seems to occur only in process -+ * context, as a consequence of a merge. In the -+ * current version of the code, this implies that the -+ * lock is held. -+ */ -+ BUG_ON(in_interrupt()); -+ -+ assert_spin_locked(&bfqd->lock); -+ if (!RB_EMPTY_NODE(&rq->rb_node)) { -+ bfq_remove_request(rq->q, rq); -+ bfqg_stats_update_io_remove(bfqq_group(bfqq), -+ rq->cmd_flags); -+ } -+ bfq_finish_requeue_request_body(bfqq); -+ } -+ -+ /* -+ * Reset private fields. In case of a requeue, this allows -+ * this function to correctly do nothing if it is spuriously -+ * invoked again on this same request (see the check at the -+ * beginning of the function). Probably, a better general -+ * design would be to prevent blk-mq from invoking the requeue -+ * or finish hooks of an elevator, for a request that is not -+ * referred by that elevator. -+ * -+ * Resetting the following fields would break the -+ * request-insertion logic if rq is re-inserted into a bfq -+ * internal queue, without a re-preparation. Here we assume -+ * that re-insertions of requeued requests, without -+ * re-preparation, can happen only for pass_through or at_head -+ * requests (which are not re-inserted into bfq internal -+ * queues). -+ */ -+ rq->elv.priv[0] = NULL; -+ rq->elv.priv[1] = NULL; -+} -+ -+/* -+ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this -+ * was the last process referring to that bfqq. 
-+ */ -+static struct bfq_queue * -+bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) -+{ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); -+ -+ if (bfqq_process_refs(bfqq) == 1) { -+ bfqq->pid = current->pid; -+ bfq_clear_bfqq_coop(bfqq); -+ bfq_clear_bfqq_split_coop(bfqq); -+ return bfqq; -+ } -+ -+ bic_set_bfqq(bic, NULL, 1); -+ -+ bfq_put_cooperator(bfqq); -+ -+ bfq_put_queue(bfqq); -+ return NULL; -+} -+ -+static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, -+ struct bfq_io_cq *bic, -+ struct bio *bio, -+ bool split, bool is_sync, -+ bool *new_queue) -+{ -+ struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync); -+ -+ if (likely(bfqq && bfqq != &bfqd->oom_bfqq)) -+ return bfqq; -+ -+ if (new_queue) -+ *new_queue = true; -+ -+ if (bfqq) -+ bfq_put_queue(bfqq); -+ bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); -+ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); -+ -+ bic_set_bfqq(bic, bfqq, is_sync); -+ if (split && is_sync) { -+ bfq_log_bfqq(bfqd, bfqq, -+ "get_request: was_in_list %d " -+ "was_in_large_burst %d " -+ "large burst in progress %d", -+ bic->was_in_burst_list, -+ bic->saved_in_large_burst, -+ bfqd->large_burst); -+ -+ if ((bic->was_in_burst_list && bfqd->large_burst) || -+ bic->saved_in_large_burst) { -+ bfq_log_bfqq(bfqd, bfqq, -+ "get_request: marking in " -+ "large burst"); -+ bfq_mark_bfqq_in_large_burst(bfqq); -+ } else { -+ bfq_log_bfqq(bfqd, bfqq, -+ "get_request: clearing in " -+ "large burst"); -+ bfq_clear_bfqq_in_large_burst(bfqq); -+ if (bic->was_in_burst_list) -+ /* -+ * If bfqq was in the current -+ * burst list before being -+ * merged, then we have to add -+ * it back. And we do not need -+ * to increase burst_size, as -+ * we did not decrement -+ * burst_size when we removed -+ * bfqq from the burst list as -+ * a consequence of a merge -+ * (see comments in -+ * bfq_put_queue). 
In this -+ * respect, it would be rather -+ * costly to know whether the -+ * current burst list is still -+ * the same burst list from -+ * which bfqq was removed on -+ * the merge. To avoid this -+ * cost, if bfqq was in a -+ * burst list, then we add -+ * bfqq to the current burst -+ * list without any further -+ * check. This can cause -+ * inappropriate insertions, -+ * but rarely enough to not -+ * harm the detection of large -+ * bursts significantly. -+ */ -+ hlist_add_head(&bfqq->burst_list_node, -+ &bfqd->burst_list); -+ } -+ bfqq->split_time = jiffies; -+ } -+ -+ return bfqq; -+} -+ -+/* -+ * Only reset private fields. The actual request preparation will be -+ * performed by bfq_init_rq, when rq is either inserted or merged. See -+ * comments on bfq_init_rq for the reason behind this delayed -+ * preparation. -+*/ -+static void bfq_prepare_request(struct request *rq, struct bio *bio) -+{ -+ /* -+ * Regardless of whether we have an icq attached, we have to -+ * clear the scheduler pointers, as they might point to -+ * previously allocated bic/bfqq structs. -+ */ -+ rq->elv.priv[0] = rq->elv.priv[1] = NULL; -+} -+ -+/* -+ * If needed, init rq, allocate bfq data structures associated with -+ * rq, and increment reference counters in the destination bfq_queue -+ * for rq. Return the destination bfq_queue for rq, or NULL is rq is -+ * not associated with any bfq_queue. -+ * -+ * This function is invoked by the functions that perform rq insertion -+ * or merging. One may have expected the above preparation operations -+ * to be performed in bfq_prepare_request, and not delayed to when rq -+ * is inserted or merged. The rationale behind this delayed -+ * preparation is that, after the prepare_request hook is invoked for -+ * rq, rq may still be transformed into a request with no icq, i.e., a -+ * request not associated with any queue. No bfq hook is invoked to -+ * signal this tranformation. 
As a consequence, should these -+ * preparation operations be performed when the prepare_request hook -+ * is invoked, and should rq be transformed one moment later, bfq -+ * would end up in an inconsistent state, because it would have -+ * incremented some queue counters for an rq destined to -+ * transformation, without any chance to correctly lower these -+ * counters back. In contrast, no transformation can still happen for -+ * rq after rq has been inserted or merged. So, it is safe to execute -+ * these preparation operations when rq is finally inserted or merged. -+ */ -+static struct bfq_queue *bfq_init_rq(struct request *rq) -+{ -+ struct request_queue *q = rq->q; -+ struct bio *bio = rq->bio; -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct bfq_io_cq *bic; -+ const int is_sync = rq_is_sync(rq); -+ struct bfq_queue *bfqq; -+ bool bfqq_already_existing = false, split = false; -+ bool new_queue = false; -+ -+ if (unlikely(!rq->elv.icq)) -+ return NULL; -+ -+ /* -+ * Assuming that elv.priv[1] is set only if everything is set -+ * for this rq. This holds true, because this function is -+ * invoked only for insertion or merging, and, after such -+ * events, a request cannot be manipulated any longer before -+ * being removed from bfq. -+ */ -+ if (rq->elv.priv[1]) { -+ BUG_ON(!(rq->rq_flags & RQF_ELVPRIV)); -+ return rq->elv.priv[1]; -+ } -+ -+ bic = icq_to_bic(rq->elv.icq); -+ -+ bfq_check_ioprio_change(bic, bio); -+ -+ bfq_bic_update_cgroup(bic, bio); -+ -+ bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync, -+ &new_queue); -+ -+ if (likely(!new_queue)) { -+ /* If the queue was seeky for too long, break it apart. 
*/ -+ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { -+ BUG_ON(!is_sync); -+ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); -+ -+ /* Update bic before losing reference to bfqq */ -+ if (bfq_bfqq_in_large_burst(bfqq)) -+ bic->saved_in_large_burst = true; -+ -+ bfqq = bfq_split_bfqq(bic, bfqq); -+ split = true; -+ -+ if (!bfqq) -+ bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, -+ true, is_sync, -+ NULL); -+ else -+ bfqq_already_existing = true; -+ -+ BUG_ON(!bfqq); -+ BUG_ON(bfqq == &bfqd->oom_bfqq); -+ } -+ } -+ -+ bfqq->allocated++; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "new allocated %d", bfqq->allocated); -+ -+ bfqq->ref++; -+ bfq_log_bfqq(bfqd, bfqq, "%p: bfqq %p, %d", rq, bfqq, bfqq->ref); -+ -+ rq->elv.priv[0] = bic; -+ rq->elv.priv[1] = bfqq; -+ rq->rq_flags &= ~RQF_DISP_LIST; -+ -+ /* -+ * If a bfq_queue has only one process reference, it is owned -+ * by only this bic: we can then set bfqq->bic = bic. in -+ * addition, if the queue has also just been split, we have to -+ * resume its state. -+ */ -+ if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { -+ bfqq->bic = bic; -+ if (split) { -+ /* -+ * The queue has just been split from a shared -+ * queue: restore the idle window and the -+ * possible weight raising period. 
-+ */ -+ bfq_bfqq_resume_state(bfqq, bfqd, bic, -+ bfqq_already_existing); -+ } -+ } -+ -+ if (unlikely(bfq_bfqq_just_created(bfqq))) -+ bfq_handle_burst(bfqd, bfqq); -+ -+ rq->rq_flags |= RQF_GOT; -+ -+ return bfqq; -+} -+ -+static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) -+{ -+ struct bfq_data *bfqd = bfqq->bfqd; -+ enum bfqq_expiration reason; -+ unsigned long flags; -+ -+ BUG_ON(!bfqd); -+ spin_lock_irqsave(&bfqd->lock, flags); -+ -+ bfq_log_bfqq(bfqd, bfqq, "handling slice_timer expiration"); -+ bfq_clear_bfqq_wait_request(bfqq); -+ -+ if (bfqq != bfqd->in_service_queue) { -+ spin_unlock_irqrestore(&bfqd->lock, flags); -+ return; -+ } -+ -+ if (bfq_bfqq_budget_timeout(bfqq)) -+ /* -+ * Also here the queue can be safely expired -+ * for budget timeout without wasting -+ * guarantees -+ */ -+ reason = BFQ_BFQQ_BUDGET_TIMEOUT; -+ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) -+ /* -+ * The queue may not be empty upon timer expiration, -+ * because we may not disable the timer when the -+ * first request of the in-service queue arrives -+ * during disk idling. -+ */ -+ reason = BFQ_BFQQ_TOO_IDLE; -+ else -+ goto schedule_dispatch; -+ -+ bfq_bfqq_expire(bfqd, bfqq, true, reason); -+ -+schedule_dispatch: -+ spin_unlock_irqrestore(&bfqd->lock, flags); -+ bfq_schedule_dispatch(bfqd); -+} -+ -+/* -+ * Handler of the expiration of the timer running if the in-service queue -+ * is idling inside its time slice. -+ */ -+static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) -+{ -+ struct bfq_data *bfqd = container_of(timer, struct bfq_data, -+ idle_slice_timer); -+ struct bfq_queue *bfqq = bfqd->in_service_queue; -+ -+ bfq_log(bfqd, "expired"); -+ -+ /* -+ * Theoretical race here: the in-service queue can be NULL or -+ * different from the queue that was idling if a new request -+ * arrives for the current queue and there is a full dispatch -+ * cycle that changes the in-service queue. 
This can hardly -+ * happen, but in the worst case we just expire a queue too -+ * early. -+ */ -+ if (bfqq) -+ bfq_idle_slice_timer_body(bfqq); -+ -+ return HRTIMER_NORESTART; -+} -+ -+static void __bfq_put_async_bfqq(struct bfq_data *bfqd, -+ struct bfq_queue **bfqq_ptr) -+{ -+ struct bfq_group *root_group = bfqd->root_group; -+ struct bfq_queue *bfqq = *bfqq_ptr; -+ -+ bfq_log(bfqd, "%p", bfqq); -+ if (bfqq) { -+ bfq_bfqq_move(bfqd, bfqq, root_group); -+ bfq_log_bfqq(bfqd, bfqq, "putting %p, %d", -+ bfqq, bfqq->ref); -+ bfq_put_queue(bfqq); -+ *bfqq_ptr = NULL; -+ } -+} -+ -+/* -+ * Release all the bfqg references to its async queues. If we are -+ * deallocating the group these queues may still contain requests, so -+ * we reparent them to the root cgroup (i.e., the only one that will -+ * exist for sure until all the requests on a device are gone). -+ */ -+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) -+{ -+ int i, j; -+ -+ for (i = 0; i < 2; i++) -+ for (j = 0; j < IOPRIO_BE_NR; j++) -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); -+ -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); -+} -+ -+/* -+ * See the comments on bfq_limit_depth for the purpose of -+ * the depths set in the function. Return minimum shallow depth we'll use. -+ */ -+static unsigned int bfq_update_depths(struct bfq_data *bfqd, -+ struct sbitmap_queue *bt) -+{ -+ unsigned int i, j, min_shallow = UINT_MAX; -+ -+ /* -+ * In-word depths if no bfq_queue is being weight-raised: -+ * leaving 25% of tags only for sync reads. -+ * -+ * In next formulas, right-shift the value -+ * (1U<<bt->sb.shift), instead of computing directly -+ * (1U<<(bt->sb.shift - something)), to be robust against -+ * any possible value of bt->sb.shift, without having to -+ * limit 'something'. 
-+ */ -+ /* no more than 50% of tags for async I/O */ -+ bfqd->word_depths[0][0] = max((1U<<bt->sb.shift)>>1, 1U); -+ /* -+ * no more than 75% of tags for sync writes (25% extra tags -+ * w.r.t. async I/O, to prevent async I/O from starving sync -+ * writes) -+ */ -+ bfqd->word_depths[0][1] = max(((1U<<bt->sb.shift) * 3)>>2, 1U); -+ -+ /* -+ * In-word depths in case some bfq_queue is being weight- -+ * raised: leaving ~63% of tags for sync reads. This is the -+ * highest percentage for which, in our tests, application -+ * start-up times didn't suffer from any regression due to tag -+ * shortage. -+ */ -+ /* no more than ~18% of tags for async I/O */ -+ bfqd->word_depths[1][0] = max(((1U<<bt->sb.shift) * 3)>>4, 1U); -+ /* no more than ~37% of tags for sync writes (~20% extra tags) */ -+ bfqd->word_depths[1][1] = max(((1U<<bt->sb.shift) * 6)>>4, 1U); -+ -+ for (i = 0; i < 2; i++) -+ for (j = 0; j < 2; j++) -+ min_shallow = min(min_shallow, bfqd->word_depths[i][j]); -+ -+ return min_shallow; -+} -+ -+static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx) -+{ -+ struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; -+ struct blk_mq_tags *tags = hctx->sched_tags; -+ unsigned int min_shallow; -+ -+ min_shallow = bfq_update_depths(bfqd, &tags->bitmap_tags); -+ sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, min_shallow); -+} -+ -+static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index) -+{ -+ bfq_depth_updated(hctx); -+ return 0; -+} -+ -+static void bfq_exit_queue(struct elevator_queue *e) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ struct bfq_queue *bfqq, *n; -+ -+ bfq_log(bfqd, "starting ..."); -+ -+ hrtimer_cancel(&bfqd->idle_slice_timer); -+ -+ BUG_ON(bfqd->in_service_queue); -+ BUG_ON(!list_empty(&bfqd->active_list)); -+ -+ spin_lock_irq(&bfqd->lock); -+ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) -+ bfq_deactivate_bfqq(bfqd, bfqq, false, false); -+ spin_unlock_irq(&bfqd->lock); -+ -+ 
hrtimer_cancel(&bfqd->idle_slice_timer); -+ -+ BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ /* release oom-queue reference to root group */ -+ bfqg_and_blkg_put(bfqd->root_group); -+ -+ blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq); -+#else -+ spin_lock_irq(&bfqd->lock); -+ bfq_put_async_queues(bfqd, bfqd->root_group); -+ kfree(bfqd->root_group); -+ spin_unlock_irq(&bfqd->lock); -+#endif -+ -+ bfq_log(bfqd, "finished ..."); -+ kfree(bfqd); -+} -+ -+static void bfq_init_root_group(struct bfq_group *root_group, -+ struct bfq_data *bfqd) -+{ -+ int i; -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ root_group->entity.parent = NULL; -+ root_group->my_entity = NULL; -+ root_group->bfqd = bfqd; -+#endif -+ root_group->rq_pos_tree = RB_ROOT; -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) -+ root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; -+ root_group->sched_data.bfq_class_idle_last_service = jiffies; -+} -+ -+static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) -+{ -+ struct bfq_data *bfqd; -+ struct elevator_queue *eq; -+ -+ eq = elevator_alloc(q, e); -+ if (!eq) -+ return -ENOMEM; -+ -+ bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node); -+ if (!bfqd) { -+ kobject_put(&eq->kobj); -+ return -ENOMEM; -+ } -+ eq->elevator_data = bfqd; -+ -+ spin_lock_irq(q->queue_lock); -+ q->elevator = eq; -+ spin_unlock_irq(q->queue_lock); -+ -+ /* -+ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. -+ * Grab a permanent reference to it, so that the normal code flow -+ * will not attempt to free it. 
-+ */ -+ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); -+ bfqd->oom_bfqq.ref++; -+ bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; -+ bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE; -+ bfqd->oom_bfqq.entity.new_weight = -+ bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio); -+ -+ /* oom_bfqq does not participate to bursts */ -+ bfq_clear_bfqq_just_created(&bfqd->oom_bfqq); -+ /* -+ * Trigger weight initialization, according to ioprio, at the -+ * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio -+ * class won't be changed any more. -+ */ -+ bfqd->oom_bfqq.entity.prio_changed = 1; -+ -+ bfqd->queue = q; -+ INIT_LIST_HEAD(&bfqd->dispatch); -+ -+ hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC, -+ HRTIMER_MODE_REL); -+ bfqd->idle_slice_timer.function = bfq_idle_slice_timer; -+ -+ bfqd->queue_weights_tree = RB_ROOT; -+ bfqd->num_groups_with_pending_reqs = 0; -+ -+ INIT_LIST_HEAD(&bfqd->active_list); -+ INIT_LIST_HEAD(&bfqd->idle_list); -+ INIT_HLIST_HEAD(&bfqd->burst_list); -+ -+ bfqd->hw_tag = -1; -+ -+ bfqd->bfq_max_budget = bfq_default_max_budget; -+ -+ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0]; -+ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1]; -+ bfqd->bfq_back_max = bfq_back_max; -+ bfqd->bfq_back_penalty = bfq_back_penalty; -+ bfqd->bfq_slice_idle = bfq_slice_idle; -+ bfqd->bfq_timeout = bfq_timeout; -+ -+ bfqd->bfq_requests_within_timer = 120; -+ -+ bfqd->bfq_large_burst_thresh = 8; -+ bfqd->bfq_burst_interval = msecs_to_jiffies(180); -+ -+ bfqd->low_latency = true; -+ -+ /* -+ * Trade-off between responsiveness and fairness. -+ */ -+ bfqd->bfq_wr_coeff = 30; -+ bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300); -+ bfqd->bfq_wr_max_time = 0; -+ bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000); -+ bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500); -+ bfqd->bfq_wr_max_softrt_rate = 7000; /* -+ * Approximate rate required -+ * to playback or record a -+ * high-definition compressed -+ * video. 
-+ */ -+ bfqd->wr_busy_queues = 0; -+ -+ /* -+ * Begin by assuming, optimistically, that the device peak -+ * rate is equal to 2/3 of the highest reference rate. -+ */ -+ bfqd->rate_dur_prod = ref_rate[blk_queue_nonrot(bfqd->queue)] * -+ ref_wr_duration[blk_queue_nonrot(bfqd->queue)]; -+ bfqd->peak_rate = ref_rate[blk_queue_nonrot(bfqd->queue)] * 2 / 3; -+ -+ spin_lock_init(&bfqd->lock); -+ -+ /* -+ * The invocation of the next bfq_create_group_hierarchy -+ * function is the head of a chain of function calls -+ * (bfq_create_group_hierarchy->blkcg_activate_policy-> -+ * blk_mq_freeze_queue) that may lead to the invocation of the -+ * has_work hook function. For this reason, -+ * bfq_create_group_hierarchy is invoked only after all -+ * scheduler data has been initialized, apart from the fields -+ * that can be initialized only after invoking -+ * bfq_create_group_hierarchy. This, in particular, enables -+ * has_work to correctly return false. Of course, to avoid -+ * other inconsistencies, the blk-mq stack must then refrain -+ * from invoking further scheduler hooks before this init -+ * function is finished. 
-+ */ -+ bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node); -+ if (!bfqd->root_group) -+ goto out_free; -+ bfq_init_root_group(bfqd->root_group, bfqd); -+ bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); -+ -+ wbt_disable_default(q); -+ return 0; -+ -+out_free: -+ kfree(bfqd); -+ kobject_put(&eq->kobj); -+ return -ENOMEM; -+} -+ -+static void bfq_slab_kill(void) -+{ -+ kmem_cache_destroy(bfq_pool); -+} -+ -+static int __init bfq_slab_setup(void) -+{ -+ bfq_pool = KMEM_CACHE(bfq_queue, 0); -+ if (!bfq_pool) -+ return -ENOMEM; -+ return 0; -+} -+ -+static ssize_t bfq_var_show(unsigned int var, char *page) -+{ -+ return sprintf(page, "%u\n", var); -+} -+ -+static ssize_t bfq_var_store(unsigned long *var, const char *page, -+ size_t count) -+{ -+ unsigned long new_val; -+ int ret = kstrtoul(page, 10, &new_val); -+ -+ if (ret == 0) -+ *var = new_val; -+ -+ return count; -+} -+ -+static ssize_t bfq_wr_max_time_show(struct elevator_queue *e, char *page) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ -+ return sprintf(page, "%d\n", bfqd->bfq_wr_max_time > 0 ? 
-+ jiffies_to_msecs(bfqd->bfq_wr_max_time) : -+ jiffies_to_msecs(bfq_wr_duration(bfqd))); -+} -+ -+static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) -+{ -+ struct bfq_queue *bfqq; -+ struct bfq_data *bfqd = e->elevator_data; -+ ssize_t num_char = 0; -+ -+ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n", -+ bfqd->queued); -+ -+ spin_lock_irq(&bfqd->lock); -+ -+ num_char += sprintf(page + num_char, "Active:\n"); -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) { -+ num_char += sprintf(page + num_char, -+ "pid%d: weight %hu, nr_queued %d %d, ", -+ bfqq->pid, -+ bfqq->entity.weight, -+ bfqq->queued[0], -+ bfqq->queued[1]); -+ num_char += sprintf(page + num_char, -+ "dur %d/%u\n", -+ jiffies_to_msecs( -+ jiffies - -+ bfqq->last_wr_start_finish), -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } -+ -+ num_char += sprintf(page + num_char, "Idle:\n"); -+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) { -+ num_char += sprintf(page + num_char, -+ "pid%d: weight %hu, dur %d/%u\n", -+ bfqq->pid, -+ bfqq->entity.weight, -+ jiffies_to_msecs(jiffies - -+ bfqq->last_wr_start_finish), -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } -+ -+ spin_unlock_irq(&bfqd->lock); -+ -+ return num_char; -+} -+ -+#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ -+static ssize_t __FUNC(struct elevator_queue *e, char *page) \ -+{ \ -+ struct bfq_data *bfqd = e->elevator_data; \ -+ u64 __data = __VAR; \ -+ if (__CONV == 1) \ -+ __data = jiffies_to_msecs(__data); \ -+ else if (__CONV == 2) \ -+ __data = div_u64(__data, NSEC_PER_MSEC); \ -+ return bfq_var_show(__data, (page)); \ -+} -+SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2); -+SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2); -+SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); -+SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); -+SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2); 
-+SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); -+SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1); -+SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0); -+SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); -+SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0); -+SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1); -+SHOW_FUNCTION(bfq_wr_min_idle_time_show, bfqd->bfq_wr_min_idle_time, 1); -+SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async, -+ 1); -+SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0); -+#undef SHOW_FUNCTION -+ -+#define USEC_SHOW_FUNCTION(__FUNC, __VAR) \ -+static ssize_t __FUNC(struct elevator_queue *e, char *page) \ -+{ \ -+ struct bfq_data *bfqd = e->elevator_data; \ -+ u64 __data = __VAR; \ -+ __data = div_u64(__data, NSEC_PER_USEC); \ -+ return bfq_var_show(__data, (page)); \ -+} -+USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle); -+#undef USEC_SHOW_FUNCTION -+ -+#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ -+static ssize_t \ -+__FUNC(struct elevator_queue *e, const char *page, size_t count) \ -+{ \ -+ struct bfq_data *bfqd = e->elevator_data; \ -+ unsigned long uninitialized_var(__data); \ -+ int ret = bfq_var_store(&__data, (page), count); \ -+ if (__data < (MIN)) \ -+ __data = (MIN); \ -+ else if (__data > (MAX)) \ -+ __data = (MAX); \ -+ if (__CONV == 1) \ -+ *(__PTR) = msecs_to_jiffies(__data); \ -+ else if (__CONV == 2) \ -+ *(__PTR) = (u64)__data * NSEC_PER_MSEC; \ -+ else \ -+ *(__PTR) = __data; \ -+ return ret; \ -+} -+STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, -+ INT_MAX, 2); -+STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, -+ INT_MAX, 2); -+STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); -+STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, -+ INT_MAX, 0); 
-+STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2); -+STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0); -+STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1); -+STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX, -+ 1); -+STORE_FUNCTION(bfq_wr_min_idle_time_store, &bfqd->bfq_wr_min_idle_time, 0, -+ INT_MAX, 1); -+STORE_FUNCTION(bfq_wr_min_inter_arr_async_store, -+ &bfqd->bfq_wr_min_inter_arr_async, 0, INT_MAX, 1); -+STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0, -+ INT_MAX, 0); -+#undef STORE_FUNCTION -+ -+#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ -+static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\ -+{ \ -+ struct bfq_data *bfqd = e->elevator_data; \ -+ unsigned long uninitialized_var(__data); \ -+ int ret = bfq_var_store(&__data, (page), count); \ -+ if (__data < (MIN)) \ -+ __data = (MIN); \ -+ else if (__data > (MAX)) \ -+ __data = (MAX); \ -+ *(__PTR) = (u64)__data * NSEC_PER_USEC; \ -+ return ret; \ -+} -+USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0, -+ UINT_MAX); -+#undef USEC_STORE_FUNCTION -+ -+/* do nothing for the moment */ -+static ssize_t bfq_weights_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ return count; -+} -+ -+static ssize_t bfq_max_budget_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ unsigned long uninitialized_var(__data); -+ int ret = bfq_var_store(&__data, (page), count); -+ -+ if (__data == 0) -+ bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); -+ else { -+ if (__data > INT_MAX) -+ __data = INT_MAX; -+ bfqd->bfq_max_budget = __data; -+ } -+ -+ bfqd->bfq_user_max_budget = __data; -+ -+ return ret; -+} -+ -+/* -+ * Leaving this name to preserve name compatibility with cfq -+ * parameters, but this timeout is used for both sync and async. 
-+ */ -+static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ unsigned long uninitialized_var(__data); -+ int ret = bfq_var_store(&__data, (page), count); -+ -+ if (__data < 1) -+ __data = 1; -+ else if (__data > INT_MAX) -+ __data = INT_MAX; -+ -+ bfqd->bfq_timeout = msecs_to_jiffies(__data); -+ if (bfqd->bfq_user_max_budget == 0) -+ bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); -+ -+ return ret; -+} -+ -+static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ unsigned long uninitialized_var(__data); -+ int ret = bfq_var_store(&__data, (page), count); -+ -+ if (__data > 1) -+ __data = 1; -+ if (!bfqd->strict_guarantees && __data == 1 -+ && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC) -+ bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC; -+ -+ bfqd->strict_guarantees = __data; -+ -+ return ret; -+} -+ -+static ssize_t bfq_low_latency_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ unsigned long uninitialized_var(__data); -+ int ret = bfq_var_store(&__data, (page), count); -+ -+ if (__data > 1) -+ __data = 1; -+ if (__data == 0 && bfqd->low_latency != 0) -+ bfq_end_wr(bfqd); -+ bfqd->low_latency = __data; -+ -+ return ret; -+} -+ -+#define BFQ_ATTR(name) \ -+ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store) -+ -+static struct elv_fs_entry bfq_attrs[] = { -+ BFQ_ATTR(fifo_expire_sync), -+ BFQ_ATTR(fifo_expire_async), -+ BFQ_ATTR(back_seek_max), -+ BFQ_ATTR(back_seek_penalty), -+ BFQ_ATTR(slice_idle), -+ BFQ_ATTR(slice_idle_us), -+ BFQ_ATTR(max_budget), -+ BFQ_ATTR(timeout_sync), -+ BFQ_ATTR(strict_guarantees), -+ BFQ_ATTR(low_latency), -+ BFQ_ATTR(wr_coeff), -+ BFQ_ATTR(wr_max_time), -+ BFQ_ATTR(wr_rt_max_time), -+ BFQ_ATTR(wr_min_idle_time), -+ BFQ_ATTR(wr_min_inter_arr_async), -+ 
BFQ_ATTR(wr_max_softrt_rate), -+ BFQ_ATTR(weights), -+ __ATTR_NULL -+}; -+ -+static struct elevator_type iosched_bfq_mq = { -+ .ops.mq = { -+ .limit_depth = bfq_limit_depth, -+ .prepare_request = bfq_prepare_request, -+ .requeue_request = bfq_finish_requeue_request, -+ .finish_request = bfq_finish_requeue_request, -+ .exit_icq = bfq_exit_icq, -+ .insert_requests = bfq_insert_requests, -+ .dispatch_request = bfq_dispatch_request, -+ .next_request = elv_rb_latter_request, -+ .former_request = elv_rb_former_request, -+ .allow_merge = bfq_allow_bio_merge, -+ .bio_merge = bfq_bio_merge, -+ .request_merge = bfq_request_merge, -+ .requests_merged = bfq_requests_merged, -+ .request_merged = bfq_request_merged, -+ .has_work = bfq_has_work, -+ .depth_updated = bfq_depth_updated, -+ .init_hctx = bfq_init_hctx, -+ .init_sched = bfq_init_queue, -+ .exit_sched = bfq_exit_queue, -+ }, -+ -+ .uses_mq = true, -+ .icq_size = sizeof(struct bfq_io_cq), -+ .icq_align = __alignof__(struct bfq_io_cq), -+ .elevator_attrs = bfq_attrs, -+ .elevator_name = "bfq-mq", -+ .elevator_owner = THIS_MODULE, -+}; -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+static struct blkcg_policy blkcg_policy_bfq = { -+ .dfl_cftypes = bfq_blkg_files, -+ .legacy_cftypes = bfq_blkcg_legacy_files, -+ -+ .cpd_alloc_fn = bfq_cpd_alloc, -+ .cpd_init_fn = bfq_cpd_init, -+ .cpd_bind_fn = bfq_cpd_init, -+ .cpd_free_fn = bfq_cpd_free, -+ -+ .pd_alloc_fn = bfq_pd_alloc, -+ .pd_init_fn = bfq_pd_init, -+ .pd_offline_fn = bfq_pd_offline, -+ .pd_free_fn = bfq_pd_free, -+ .pd_reset_stats_fn = bfq_pd_reset_stats, -+}; -+#endif -+ -+static int __init bfq_init(void) -+{ -+ int ret; -+ char msg[60] = "BFQ I/O-scheduler: v9"; -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ ret = blkcg_policy_register(&blkcg_policy_bfq); -+ if (ret) -+ return ret; -+#endif -+ -+ ret = -ENOMEM; -+ if (bfq_slab_setup()) -+ goto err_pol_unreg; -+ -+ /* -+ * Times to load large popular applications for the typical -+ * systems installed on the reference devices (see 
the -+ * comments before the definition of the next -+ * array). Actually, we use slightly lower values, as the -+ * estimated peak rate tends to be smaller than the actual -+ * peak rate. The reason for this last fact is that estimates -+ * are computed over much shorter time intervals than the long -+ * intervals typically used for benchmarking. Why? First, to -+ * adapt more quickly to variations. Second, because an I/O -+ * scheduler cannot rely on a peak-rate-evaluation workload to -+ * be run for a long time. -+ */ -+ ref_wr_duration[0] = msecs_to_jiffies(7000); /* actually 8 sec */ -+ ref_wr_duration[1] = msecs_to_jiffies(2500); /* actually 3 sec */ -+ -+ ret = elv_register(&iosched_bfq_mq); -+ if (ret) -+ goto slab_kill; -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ strcat(msg, " (with cgroups support)"); -+#endif -+ pr_info("%s", msg); -+ -+ return 0; -+ -+slab_kill: -+ bfq_slab_kill(); -+err_pol_unreg: -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ blkcg_policy_unregister(&blkcg_policy_bfq); -+#endif -+ return ret; -+} -+ -+static void __exit bfq_exit(void) -+{ -+ elv_unregister(&iosched_bfq_mq); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ blkcg_policy_unregister(&blkcg_policy_bfq); -+#endif -+ bfq_slab_kill(); -+} -+ -+module_init(bfq_init); -+module_exit(bfq_exit); -+ -+MODULE_AUTHOR("Paolo Valente"); -+MODULE_LICENSE("GPL"); -+MODULE_DESCRIPTION("MQ Budget Fair Queueing I/O Scheduler"); -diff --git a/block/bfq-mq.h b/block/bfq-mq.h -new file mode 100644 -index 000000000000..ceb291132a1a ---- /dev/null -+++ b/block/bfq-mq.h -@@ -0,0 +1,1077 @@ -+/* -+ * BFQ v9: data structures and common functions prototypes. 
-+ * -+ * Based on ideas and code from CFQ: -+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> -+ * -+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> -+ * Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org> -+ */ -+ -+#ifndef _BFQ_H -+#define _BFQ_H -+ -+#include <linux/hrtimer.h> -+#include <linux/blk-cgroup.h> -+ -+/* see comments on CONFIG_BFQ_GROUP_IOSCHED in bfq.h */ -+#ifdef CONFIG_MQ_BFQ_GROUP_IOSCHED -+#define BFQ_GROUP_IOSCHED_ENABLED -+#endif -+ -+#define BFQ_IOPRIO_CLASSES 3 -+#define BFQ_CL_IDLE_TIMEOUT (HZ/5) -+ -+#define BFQ_MIN_WEIGHT 1 -+#define BFQ_MAX_WEIGHT 1000 -+#define BFQ_WEIGHT_CONVERSION_COEFF 10 -+ -+#define BFQ_DEFAULT_QUEUE_IOPRIO 4 -+ -+#define BFQ_WEIGHT_LEGACY_DFL 100 -+#define BFQ_DEFAULT_GRP_IOPRIO 0 -+#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE -+ -+/* -+ * Soft real-time applications are extremely more latency sensitive -+ * than interactive ones. Over-raise the weight of the former to -+ * privilege them against the latter. -+ */ -+#define BFQ_SOFTRT_WEIGHT_FACTOR 100 -+ -+struct bfq_entity; -+ -+/** -+ * struct bfq_service_tree - per ioprio_class service tree. -+ * -+ * Each service tree represents a B-WF2Q+ scheduler on its own. Each -+ * ioprio_class has its own independent scheduler, and so its own -+ * bfq_service_tree. All the fields are protected by the queue lock -+ * of the containing bfqd. 
-+ */ -+struct bfq_service_tree { -+ /* tree for active entities (i.e., those backlogged) */ -+ struct rb_root active; -+ /* tree for idle entities (i.e., not backlogged, with V <= F_i)*/ -+ struct rb_root idle; -+ -+ struct bfq_entity *first_idle; /* idle entity with minimum F_i */ -+ struct bfq_entity *last_idle; /* idle entity with maximum F_i */ -+ -+ u64 vtime; /* scheduler virtual time */ -+ /* scheduler weight sum; active and idle entities contribute to it */ -+ unsigned long wsum; -+}; -+ -+/** -+ * struct bfq_sched_data - multi-class scheduler. -+ * -+ * bfq_sched_data is the basic scheduler queue. It supports three -+ * ioprio_classes, and can be used either as a toplevel queue or as an -+ * intermediate queue in a hierarchical setup. -+ * -+ * The supported ioprio_classes are the same as in CFQ, in descending -+ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. -+ * Requests from higher priority queues are served before all the -+ * requests from lower priority queues; among requests of the same -+ * queue requests are served according to B-WF2Q+. -+ * -+ * The schedule is implemented by the service trees, plus the field -+ * @next_in_service, which points to the entity on the active trees -+ * that will be served next, if 1) no changes in the schedule occurs -+ * before the current in-service entity is expired, 2) the in-service -+ * queue becomes idle when it expires, and 3) if the entity pointed by -+ * in_service_entity is not a queue, then the in-service child entity -+ * of the entity pointed by in_service_entity becomes idle on -+ * expiration. This peculiar definition allows for the following -+ * optimization, not yet exploited: while a given entity is still in -+ * service, we already know which is the best candidate for next -+ * service among the other active entitities in the same parent -+ * entity. We can then quickly compare the timestamps of the -+ * in-service entity with those of such best candidate. 
-+ * -+ * All the fields are protected by the queue lock of the containing -+ * bfqd. -+ */ -+struct bfq_sched_data { -+ struct bfq_entity *in_service_entity; /* entity in service */ -+ /* head-of-the-line entity in the scheduler (see comments above) */ -+ struct bfq_entity *next_in_service; -+ /* array of service trees, one per ioprio_class */ -+ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; -+ /* last time CLASS_IDLE was served */ -+ unsigned long bfq_class_idle_last_service; -+ -+}; -+ -+/** -+ * struct bfq_weight_counter - counter of the number of all active queues -+ * with a given weight. -+ */ -+struct bfq_weight_counter { -+ unsigned int weight; /* weight of the queues this counter refers to */ -+ unsigned int num_active; /* nr of active queues with this weight */ -+ /* -+ * Weights tree member (see bfq_data's @queue_weights_tree) -+ */ -+ struct rb_node weights_node; -+}; -+ -+/** -+ * struct bfq_entity - schedulable entity. -+ * -+ * A bfq_entity is used to represent either a bfq_queue (leaf node in the -+ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each -+ * entity belongs to the sched_data of the parent group in the cgroup -+ * hierarchy. Non-leaf entities have also their own sched_data, stored -+ * in @my_sched_data. -+ * -+ * Each entity stores independently its priority values; this would -+ * allow different weights on different devices, but this -+ * functionality is not exported to userspace by now. Priorities and -+ * weights are updated lazily, first storing the new values into the -+ * new_* fields, then setting the @prio_changed flag. As soon as -+ * there is a transition in the entity state that allows the priority -+ * update to take place the effective and the requested priority -+ * values are synchronized. -+ * -+ * Unless cgroups are used, the weight value is calculated from the -+ * ioprio to export the same interface as CFQ. 
When dealing with -+ * ``well-behaved'' queues (i.e., queues that do not spend too much -+ * time to consume their budget and have true sequential behavior, and -+ * when there are no external factors breaking anticipation) the -+ * relative weights at each level of the cgroups hierarchy should be -+ * guaranteed. All the fields are protected by the queue lock of the -+ * containing bfqd. -+ */ -+struct bfq_entity { -+ struct rb_node rb_node; /* service_tree member */ -+ -+ /* -+ * Flag, true if the entity is on a tree (either the active or -+ * the idle one of its service_tree) or is in service. -+ */ -+ bool on_st; -+ -+ u64 finish; /* B-WF2Q+ finish timestamp (aka F_i) */ -+ u64 start; /* B-WF2Q+ start timestamp (aka S_i) */ -+ -+ /* tree the entity is enqueued into; %NULL if not on a tree */ -+ struct rb_root *tree; -+ -+ /* -+ * minimum start time of the (active) subtree rooted at this -+ * entity; used for O(log N) lookups into active trees -+ */ -+ u64 min_start; -+ -+ /* amount of service received during the last service slot */ -+ int service; -+ -+ /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */ -+ int budget; -+ -+ unsigned int weight; /* weight of the queue */ -+ unsigned int new_weight; /* next weight if a change is in progress */ -+ -+ /* original weight, used to implement weight boosting */ -+ unsigned int orig_weight; -+ -+ /* parent entity, for hierarchical scheduling */ -+ struct bfq_entity *parent; -+ -+ /* -+ * For non-leaf nodes in the hierarchy, the associated -+ * scheduler queue, %NULL on leaf nodes. 
-+ */ -+ struct bfq_sched_data *my_sched_data; -+ /* the scheduler queue this entity belongs to */ -+ struct bfq_sched_data *sched_data; -+ -+ /* flag, set to request a weight, ioprio or ioprio_class change */ -+ int prio_changed; -+ -+ /* flag, set if the entity is counted in groups_with_pending_reqs */ -+ bool in_groups_with_pending_reqs; -+}; -+ -+struct bfq_group; -+ -+/** -+ * struct bfq_ttime - per process thinktime stats. -+ */ -+struct bfq_ttime { -+ u64 last_end_request; /* completion time of last request */ -+ -+ u64 ttime_total; /* total process thinktime */ -+ unsigned long ttime_samples; /* number of thinktime samples */ -+ u64 ttime_mean; /* average process thinktime */ -+ -+}; -+ -+/** -+ * struct bfq_queue - leaf schedulable entity. -+ * -+ * A bfq_queue is a leaf request queue; it can be associated with an -+ * io_context or more, if it is async or shared between cooperating -+ * processes. @cgroup holds a reference to the cgroup, to be sure that it -+ * does not disappear while a bfqq still references it (mostly to avoid -+ * races between request issuing and task migration followed by cgroup -+ * destruction). -+ * All the fields are protected by the queue lock of the containing bfqd. -+ */ -+struct bfq_queue { -+ /* reference counter */ -+ int ref; -+ /* parent bfq_data */ -+ struct bfq_data *bfqd; -+ -+ /* current ioprio and ioprio class */ -+ unsigned short ioprio, ioprio_class; -+ /* next ioprio and ioprio class if a change is in progress */ -+ unsigned short new_ioprio, new_ioprio_class; -+ -+ /* -+ * Shared bfq_queue if queue is cooperating with one or more -+ * other queues. 
-+ */ -+ struct bfq_queue *new_bfqq; -+ /* request-position tree member (see bfq_group's @rq_pos_tree) */ -+ struct rb_node pos_node; -+ /* request-position tree root (see bfq_group's @rq_pos_tree) */ -+ struct rb_root *pos_root; -+ -+ /* sorted list of pending requests */ -+ struct rb_root sort_list; -+ /* if fifo isn't expired, next request to serve */ -+ struct request *next_rq; -+ /* number of sync and async requests queued */ -+ int queued[2]; -+ /* number of requests currently allocated */ -+ int allocated; -+ /* number of pending metadata requests */ -+ int meta_pending; -+ /* fifo list of requests in sort_list */ -+ struct list_head fifo; -+ -+ /* entity representing this queue in the scheduler */ -+ struct bfq_entity entity; -+ -+ /* pointer to the weight counter associated with this queue */ -+ struct bfq_weight_counter *weight_counter; -+ -+ /* maximum budget allowed from the feedback mechanism */ -+ int max_budget; -+ /* budget expiration (in jiffies) */ -+ unsigned long budget_timeout; -+ -+ /* number of requests on the dispatch list or inside driver */ -+ int dispatched; -+ -+ unsigned int flags; /* status flags.*/ -+ -+ /* node for active/idle bfqq list inside parent bfqd */ -+ struct list_head bfqq_list; -+ -+ /* associated @bfq_ttime struct */ -+ struct bfq_ttime ttime; -+ -+ /* bit vector: a 1 for each seeky requests in history */ -+ u32 seek_history; -+ -+ /* node for the device's burst list */ -+ struct hlist_node burst_list_node; -+ -+ /* position of the last request enqueued */ -+ sector_t last_request_pos; -+ -+ /* Number of consecutive pairs of request completion and -+ * arrival, such that the queue becomes idle after the -+ * completion, but the next request arrives within an idle -+ * time slice; used only if the queue's IO_bound flag has been -+ * cleared. 
-+ */ -+ unsigned int requests_within_timer; -+ -+ /* pid of the process owning the queue, used for logging purposes */ -+ pid_t pid; -+ -+ /* -+ * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL -+ * if the queue is shared. -+ */ -+ struct bfq_io_cq *bic; -+ -+ /* current maximum weight-raising time for this queue */ -+ unsigned long wr_cur_max_time; -+ /* -+ * Minimum time instant such that, only if a new request is -+ * enqueued after this time instant in an idle @bfq_queue with -+ * no outstanding requests, then the task associated with the -+ * queue it is deemed as soft real-time (see the comments on -+ * the function bfq_bfqq_softrt_next_start()) -+ */ -+ unsigned long soft_rt_next_start; -+ /* -+ * Start time of the current weight-raising period if -+ * the @bfq-queue is being weight-raised, otherwise -+ * finish time of the last weight-raising period. -+ */ -+ unsigned long last_wr_start_finish; -+ /* factor by which the weight of this queue is multiplied */ -+ unsigned int wr_coeff; -+ /* -+ * Time of the last transition of the @bfq_queue from idle to -+ * backlogged. -+ */ -+ unsigned long last_idle_bklogged; -+ /* -+ * Cumulative service received from the @bfq_queue since the -+ * last transition from idle to backlogged. -+ */ -+ unsigned long service_from_backlogged; -+ /* -+ * Cumulative service received from the @bfq_queue since its -+ * last transition to weight-raised state. -+ */ -+ unsigned long service_from_wr; -+ /* -+ * Value of wr start time when switching to soft rt -+ */ -+ unsigned long wr_start_at_switch_to_srt; -+ -+ unsigned long split_time; /* time of last split */ -+ unsigned long first_IO_time; /* time of first I/O for this queue */ -+ -+ /* max service rate measured so far */ -+ u32 max_service_rate; -+ /* -+ * Ratio between the service received by bfqq while it is in -+ * service, and the cumulative service (of requests of other -+ * queues) that may be injected while bfqq is empty but still -+ * in service. 
To increase precision, the coefficient is -+ * measured in tenths of unit. Here are some example of (1) -+ * ratios, (2) resulting percentages of service injected -+ * w.r.t. to the total service dispatched while bfqq is in -+ * service, and (3) corresponding values of the coefficient: -+ * 1 (50%) -> 10 -+ * 2 (33%) -> 20 -+ * 10 (9%) -> 100 -+ * 9.9 (9%) -> 99 -+ * 1.5 (40%) -> 15 -+ * 0.5 (66%) -> 5 -+ * 0.1 (90%) -> 1 -+ * -+ * So, if the coefficient is lower than 10, then -+ * injected service is more than bfqq service. -+ */ -+ unsigned int inject_coeff; -+ /* amount of service injected in current service slot */ -+ unsigned int injected_service; -+}; -+ -+/** -+ * struct bfq_io_cq - per (request_queue, io_context) structure. -+ */ -+struct bfq_io_cq { -+ /* associated io_cq structure */ -+ struct io_cq icq; /* must be the first member */ -+ /* array of two process queues, the sync and the async */ -+ struct bfq_queue *bfqq[2]; -+ /* per (request_queue, blkcg) ioprio */ -+ int ioprio; -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ uint64_t blkcg_serial_nr; /* the current blkcg serial */ -+#endif -+ -+ /* -+ * Snapshot of the has_short_time flag before merging; taken -+ * to remember its value while the queue is merged, so as to -+ * be able to restore it in case of split. -+ */ -+ bool saved_has_short_ttime; -+ /* -+ * Same purpose as the previous two fields for the I/O bound -+ * classification of a queue. -+ */ -+ bool saved_IO_bound; -+ -+ /* -+ * Same purpose as the previous fields for the value of the -+ * field keeping the queue's belonging to a large burst -+ */ -+ bool saved_in_large_burst; -+ /* -+ * True if the queue belonged to a burst list before its merge -+ * with another cooperating queue. -+ */ -+ bool was_in_burst_list; -+ -+ /* -+ * Similar to previous fields: save wr information. 
-+ */ -+ unsigned long saved_wr_coeff; -+ unsigned long saved_last_wr_start_finish; -+ unsigned long saved_wr_start_at_switch_to_srt; -+ unsigned int saved_wr_cur_max_time; -+ struct bfq_ttime saved_ttime; -+}; -+ -+/** -+ * struct bfq_data - per-device data structure. -+ * -+ * All the fields are protected by @lock. -+ */ -+struct bfq_data { -+ /* device request queue */ -+ struct request_queue *queue; -+ /* dispatch queue */ -+ struct list_head dispatch; -+ -+ /* root bfq_group for the device */ -+ struct bfq_group *root_group; -+ -+ /* -+ * rbtree of weight counters of @bfq_queues, sorted by -+ * weight. Used to keep track of whether all @bfq_queues have -+ * the same weight. The tree contains one counter for each -+ * distinct weight associated to some active and not -+ * weight-raised @bfq_queue (see the comments to the functions -+ * bfq_weights_tree_[add|remove] for further details). -+ */ -+ struct rb_root queue_weights_tree; -+ -+ /* -+ * Number of groups with at least one descendant process that -+ * has at least one request waiting for completion. Note that -+ * this accounts for also requests already dispatched, but not -+ * yet completed. Therefore this number of groups may differ -+ * (be larger) than the number of active groups, as a group is -+ * considered active only if its corresponding entity has -+ * descendant queues with at least one request queued. This -+ * number is used to decide whether a scenario is symmetric. -+ * For a detailed explanation see comments on the computation -+ * of the variable asymmetric_scenario in the function -+ * bfq_better_to_idle(). -+ * -+ * However, it is hard to compute this number exactly, for -+ * groups with multiple descendant processes. Consider a group -+ * that is inactive, i.e., that has no descendant process with -+ * pending I/O inside BFQ queues. 
Then suppose that -+ * num_groups_with_pending_reqs is still accounting for this -+ * group, because the group has descendant processes with some -+ * I/O request still in flight. num_groups_with_pending_reqs -+ * should be decremented when the in-flight request of the -+ * last descendant process is finally completed (assuming that -+ * nothing else has changed for the group in the meantime, in -+ * terms of composition of the group and active/inactive state of child -+ * groups and processes). To accomplish this, an additional -+ * pending-request counter must be added to entities, and must -+ * be updated correctly. To avoid this additional field and operations, -+ * we resort to the following tradeoff between simplicity and -+ * accuracy: for an inactive group that is still counted in -+ * num_groups_with_pending_reqs, we decrement -+ * num_groups_with_pending_reqs when the first descendant -+ * process of the group remains with no request waiting for -+ * completion. -+ * -+ * Even this simpler decrement strategy requires a little -+ * carefulness: to avoid multiple decrements, we flag a group, -+ * more precisely an entity representing a group, as still -+ * counted in num_groups_with_pending_reqs when it becomes -+ * inactive. Then, when the first descendant queue of the -+ * entity remains with no request waiting for completion, -+ * num_groups_with_pending_reqs is decremented, and this flag -+ * is reset. After this flag is reset for the entity, -+ * num_groups_with_pending_reqs won't be decremented any -+ * longer in case a new descendant queue of the entity remains -+ * with no request waiting for completion. -+ */ -+ unsigned int num_groups_with_pending_reqs; -+ -+ /* -+ * Per-class (RT, BE, IDLE) number of bfq_queues containing -+ * requests (including the queue in service, even if it is -+ * idling). 
-+ */ -+ unsigned int busy_queues[3]; -+ /* number of weight-raised busy @bfq_queues */ -+ int wr_busy_queues; -+ /* number of queued requests */ -+ int queued; -+ /* number of requests dispatched and waiting for completion */ -+ int rq_in_driver; -+ -+ /* -+ * Maximum number of requests in driver in the last -+ * @hw_tag_samples completed requests. -+ */ -+ int max_rq_in_driver; -+ /* number of samples used to calculate hw_tag */ -+ int hw_tag_samples; -+ /* flag set to one if the driver is showing a queueing behavior */ -+ int hw_tag; -+ -+ /* number of budgets assigned */ -+ int budgets_assigned; -+ -+ /* -+ * Timer set when idling (waiting) for the next request from -+ * the queue in service. -+ */ -+ struct hrtimer idle_slice_timer; -+ -+ /* bfq_queue in service */ -+ struct bfq_queue *in_service_queue; -+ -+ /* on-disk position of the last served request */ -+ sector_t last_position; -+ -+ /* position of the last served request for the in-service queue */ -+ sector_t in_serv_last_pos; -+ -+ /* time of last request completion (ns) */ -+ u64 last_completion; -+ -+ /* time of first rq dispatch in current observation interval (ns) */ -+ u64 first_dispatch; -+ /* time of last rq dispatch in current observation interval (ns) */ -+ u64 last_dispatch; -+ -+ /* beginning of the last budget */ -+ ktime_t last_budget_start; -+ /* beginning of the last idle slice */ -+ ktime_t last_idling_start; -+ -+ /* number of samples in current observation interval */ -+ int peak_rate_samples; -+ /* num of samples of seq dispatches in current observation interval */ -+ u32 sequential_samples; -+ /* total num of sectors transferred in current observation interval */ -+ u64 tot_sectors_dispatched; -+ /* max rq size seen during current observation interval (sectors) */ -+ u32 last_rq_max_size; -+ /* time elapsed from first dispatch in current observ. 
interval (us) */ -+ u64 delta_from_first; -+ /* -+ * Current estimate of the device peak rate, measured in -+ * [(sectors/usec) / 2^BFQ_RATE_SHIFT]. The left-shift by -+ * BFQ_RATE_SHIFT is performed to increase precision in -+ * fixed-point calculations. -+ */ -+ u32 peak_rate; -+ -+ /* maximum budget allotted to a bfq_queue before rescheduling */ -+ int bfq_max_budget; -+ -+ /* list of all the bfq_queues active on the device */ -+ struct list_head active_list; -+ /* list of all the bfq_queues idle on the device */ -+ struct list_head idle_list; -+ -+ /* -+ * Timeout for async/sync requests; when it fires, requests -+ * are served in fifo order. -+ */ -+ u64 bfq_fifo_expire[2]; -+ /* weight of backward seeks wrt forward ones */ -+ unsigned int bfq_back_penalty; -+ /* maximum allowed backward seek */ -+ unsigned int bfq_back_max; -+ /* maximum idling time */ -+ u32 bfq_slice_idle; -+ -+ /* user-configured max budget value (0 for auto-tuning) */ -+ int bfq_user_max_budget; -+ /* -+ * Timeout for bfq_queues to consume their budget; used to -+ * prevent seeky queues from imposing long latencies to -+ * sequential or quasi-sequential ones (this also implies that -+ * seeky queues cannot receive guarantees in the service -+ * domain; after a timeout they are charged for the time they -+ * have been in service, to preserve fairness among them, but -+ * without service-domain guarantees). -+ */ -+ unsigned int bfq_timeout; -+ -+ /* -+ * Number of consecutive requests that must be issued within -+ * the idle time slice to set again idling to a queue which -+ * was marked as non-I/O-bound (see the definition of the -+ * IO_bound flag for further details). -+ */ -+ unsigned int bfq_requests_within_timer; -+ -+ /* -+ * Force device idling whenever needed to provide accurate -+ * service guarantees, without caring about throughput -+ * issues. CAVEAT: this may even increase latencies, in case -+ * of useless idling for processes that did stop doing I/O. 
-+ */ -+ bool strict_guarantees; -+ -+ /* -+ * Last time at which a queue entered the current burst of -+ * queues being activated shortly after each other; for more -+ * details about this and the following parameters related to -+ * a burst of activations, see the comments on the function -+ * bfq_handle_burst. -+ */ -+ unsigned long last_ins_in_burst; -+ /* -+ * Reference time interval used to decide whether a queue has -+ * been activated shortly after @last_ins_in_burst. -+ */ -+ unsigned long bfq_burst_interval; -+ /* number of queues in the current burst of queue activations */ -+ int burst_size; -+ -+ /* common parent entity for the queues in the burst */ -+ struct bfq_entity *burst_parent_entity; -+ /* Maximum burst size above which the current queue-activation -+ * burst is deemed as 'large'. -+ */ -+ unsigned long bfq_large_burst_thresh; -+ /* true if a large queue-activation burst is in progress */ -+ bool large_burst; -+ /* -+ * Head of the burst list (as for the above fields, more -+ * details in the comments on the function bfq_handle_burst). -+ */ -+ struct hlist_head burst_list; -+ -+ /* if set to true, low-latency heuristics are enabled */ -+ bool low_latency; -+ /* -+ * Maximum factor by which the weight of a weight-raised queue -+ * is multiplied. -+ */ -+ unsigned int bfq_wr_coeff; -+ /* maximum duration of a weight-raising period (jiffies) */ -+ unsigned int bfq_wr_max_time; -+ -+ /* Maximum weight-raising duration for soft real-time processes */ -+ unsigned int bfq_wr_rt_max_time; -+ /* -+ * Minimum idle period after which weight-raising may be -+ * reactivated for a queue (in jiffies). -+ */ -+ unsigned int bfq_wr_min_idle_time; -+ /* -+ * Minimum period between request arrivals after which -+ * weight-raising may be reactivated for an already busy async -+ * queue (in jiffies). 
-+ */ -+ unsigned long bfq_wr_min_inter_arr_async; -+ -+ /* Max service-rate for a soft real-time queue, in sectors/sec */ -+ unsigned int bfq_wr_max_softrt_rate; -+ /* -+ * Cached value of the product ref_rate*ref_wr_duration, used -+ * for computing the maximum duration of weight raising -+ * automatically. -+ */ -+ u64 rate_dur_prod; -+ -+ /* fallback dummy bfqq for extreme OOM conditions */ -+ struct bfq_queue oom_bfqq; -+ -+ spinlock_t lock; -+ -+ /* -+ * bic associated with the task issuing current bio for -+ * merging. This and the next field are used as a support to -+ * be able to perform the bic lookup, needed by bio-merge -+ * functions, before the scheduler lock is taken, and thus -+ * avoid taking the request-queue lock while the scheduler -+ * lock is being held. -+ */ -+ struct bfq_io_cq *bio_bic; -+ /* bfqq associated with the task issuing current bio for merging */ -+ struct bfq_queue *bio_bfqq; -+ /* Extra flag used only for TESTING */ -+ bool bio_bfqq_set; -+ -+ /* -+ * Depth limits used in bfq_limit_depth (see comments on the -+ * function) -+ */ -+ unsigned int word_depths[2][2]; -+}; -+ -+enum bfqq_state_flags { -+ BFQ_BFQQ_FLAG_just_created = 0, /* queue just allocated */ -+ BFQ_BFQQ_FLAG_busy, /* has requests or is in service */ -+ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ -+ BFQ_BFQQ_FLAG_non_blocking_wait_rq, /* -+ * waiting for a request -+ * without idling the device -+ */ -+ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ -+ BFQ_BFQQ_FLAG_has_short_ttime, /* queue has a short think time */ -+ BFQ_BFQQ_FLAG_sync, /* synchronous queue */ -+ BFQ_BFQQ_FLAG_IO_bound, /* -+ * bfqq has timed-out at least once -+ * having consumed at most 2/10 of -+ * its budget -+ */ -+ BFQ_BFQQ_FLAG_in_large_burst, /* -+ * bfqq activated in a large burst, -+ * see comments to bfq_handle_burst. 
-+ */ -+ BFQ_BFQQ_FLAG_softrt_update, /* -+ * may need softrt-next-start -+ * update -+ */ -+ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ -+ BFQ_BFQQ_FLAG_split_coop /* shared bfqq will be split */ -+}; -+ -+#define BFQ_BFQQ_FNS(name) \ -+static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ -+{ \ -+ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \ -+} \ -+static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ -+{ \ -+ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \ -+} \ -+static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ -+{ \ -+ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ -+} -+ -+BFQ_BFQQ_FNS(just_created); -+BFQ_BFQQ_FNS(busy); -+BFQ_BFQQ_FNS(wait_request); -+BFQ_BFQQ_FNS(non_blocking_wait_rq); -+BFQ_BFQQ_FNS(fifo_expire); -+BFQ_BFQQ_FNS(has_short_ttime); -+BFQ_BFQQ_FNS(sync); -+BFQ_BFQQ_FNS(IO_bound); -+BFQ_BFQQ_FNS(in_large_burst); -+BFQ_BFQQ_FNS(coop); -+BFQ_BFQQ_FNS(split_coop); -+BFQ_BFQQ_FNS(softrt_update); -+#undef BFQ_BFQQ_FNS -+ -+/* Logging facilities. */ -+#ifdef CONFIG_BFQ_REDIRECT_TO_CONSOLE -+ -+static const char *checked_dev_name(const struct device *dev) -+{ -+ static const char nodev[] = "nodev"; -+ -+ if (dev) -+ return dev_name(dev); -+ -+ return nodev; -+} -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); -+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ -+ pr_crit("%s bfq%d%c %s [%s] " fmt "\n", \ -+ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -+ (bfqq)->pid, \ -+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ bfqq_group(bfqq)->blkg_path, __func__, ##args); \ -+} while (0) -+ -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ -+ pr_crit("%s %s [%s] " fmt "\n", \ -+ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -+ bfqg->blkg_path, __func__, ##args); \ -+} while (0) -+ -+#else /* BFQ_GROUP_IOSCHED_ENABLED */ -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) 
\ -+ pr_crit("%s bfq%d%c [%s] " fmt "\n", \ -+ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -+ (bfqq)->pid, bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ __func__, ##args) -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) -+ -+#endif /* BFQ_GROUP_IOSCHED_ENABLED */ -+ -+#define bfq_log(bfqd, fmt, args...) \ -+ pr_crit("%s bfq [%s] " fmt "\n", \ -+ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -+ __func__, ##args) -+ -+#else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ -+ -+#if !defined(CONFIG_BLK_DEV_IO_TRACE) -+ -+/* Avoid possible "unused-variable" warning. See commit message. */ -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) ((void) (bfqq)) -+ -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) ((void) (bfqg)) -+ -+#define bfq_log(bfqd, fmt, args...) do {} while (0) -+ -+#else /* CONFIG_BLK_DEV_IO_TRACE */ -+ -+#include <linux/blktrace_api.h> -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); -+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ -+ blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s [%s] " fmt, \ -+ (bfqq)->pid, \ -+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ bfqq_group(bfqq)->blkg_path, __func__, ##args); \ -+} while (0) -+ -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ -+ blk_add_trace_msg((bfqd)->queue, "%s [%s] " fmt, bfqg->blkg_path, \ -+ __func__, ##args);\ -+} while (0) -+ -+#else /* BFQ_GROUP_IOSCHED_ENABLED */ -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ -+ blk_add_trace_msg((bfqd)->queue, "bfq%d%c [%s] " fmt, (bfqq)->pid, \ -+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ __func__, ##args) -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) -+ -+#endif /* BFQ_GROUP_IOSCHED_ENABLED */ -+ -+#define bfq_log(bfqd, fmt, args...) 
\ -+ blk_add_trace_msg((bfqd)->queue, "bfq [%s] " fmt, __func__, ##args) -+ -+#endif /* CONFIG_BLK_DEV_IO_TRACE */ -+#endif /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ -+ -+/* Expiration reasons. */ -+enum bfqq_expiration { -+ BFQ_BFQQ_TOO_IDLE = 0, /* -+ * queue has been idling for -+ * too long -+ */ -+ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ -+ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ -+ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ -+ BFQ_BFQQ_PREEMPTED /* preemption in progress */ -+}; -+ -+ -+struct bfqg_stats { -+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) -+ /* number of ios merged */ -+ struct blkg_rwstat merged; -+ /* total time spent on device in ns, may not be accurate w/ queueing */ -+ struct blkg_rwstat service_time; -+ /* total time spent waiting in scheduler queue in ns */ -+ struct blkg_rwstat wait_time; -+ /* number of IOs queued up */ -+ struct blkg_rwstat queued; -+ /* total disk time and nr sectors dispatched by this group */ -+ struct blkg_stat time; -+ /* sum of number of ios queued across all samples */ -+ struct blkg_stat avg_queue_size_sum; -+ /* count of samples taken for average */ -+ struct blkg_stat avg_queue_size_samples; -+ /* how many times this group has been removed from service tree */ -+ struct blkg_stat dequeue; -+ /* total time spent waiting for it to be assigned a timeslice. */ -+ struct blkg_stat group_wait_time; -+ /* time spent idling for this blkcg_gq */ -+ struct blkg_stat idle_time; -+ /* total time with empty current active q with other requests queued */ -+ struct blkg_stat empty_time; -+ /* fields after this shouldn't be cleared on stat reset */ -+ u64 start_group_wait_time; -+ u64 start_idle_time; -+ u64 start_empty_time; -+ uint16_t flags; -+#endif /* BFQ_GROUP_IOSCHED_ENABLED && CONFIG_DEBUG_BLK_CGROUP */ -+}; -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+/* -+ * struct bfq_group_data - per-blkcg storage for the blkio subsystem. 
-+ * -+ * @ps: @blkcg_policy_storage that this structure inherits -+ * @weight: weight of the bfq_group -+ */ -+struct bfq_group_data { -+ /* must be the first member */ -+ struct blkcg_policy_data pd; -+ -+ unsigned int weight; -+}; -+ -+/** -+ * struct bfq_group - per (device, cgroup) data structure. -+ * @entity: schedulable entity to insert into the parent group sched_data. -+ * @sched_data: own sched_data, to contain child entities (they may be -+ * both bfq_queues and bfq_groups). -+ * @bfqd: the bfq_data for the device this group acts upon. -+ * @async_bfqq: array of async queues for all the tasks belonging to -+ * the group, one queue per ioprio value per ioprio_class, -+ * except for the idle class that has only one queue. -+ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). -+ * @my_entity: pointer to @entity, %NULL for the toplevel group; used -+ * to avoid too many special cases during group creation/ -+ * migration. -+ * @active_entities: number of active entities belonging to the group; -+ * unused for the root group. Used to know whether there -+ * are groups with more than one active @bfq_entity -+ * (see the comments to the function -+ * bfq_bfqq_may_idle()). -+ * @rq_pos_tree: rbtree sorted by next_request position, used when -+ * determining if two or more queues have interleaving -+ * requests (see bfq_find_close_cooperator()). -+ * -+ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup -+ * there is a set of bfq_groups, each one collecting the lower-level -+ * entities belonging to the group that are acting on the same device. -+ * -+ * Locking works as follows: -+ * o @bfqd is protected by the queue lock, RCU is used to access it -+ * from the readers. -+ * o All the other fields are protected by the @bfqd queue lock. 
-+ */ -+struct bfq_group { -+ /* must be the first member */ -+ struct blkg_policy_data pd; -+ -+ /* cached path for this blkg (see comments in bfq_bic_update_cgroup) */ -+ char blkg_path[128]; -+ -+ /* reference counter (see comments in bfq_bic_update_cgroup) */ -+ int ref; -+ -+ struct bfq_entity entity; -+ struct bfq_sched_data sched_data; -+ -+ void *bfqd; -+ -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; -+ struct bfq_queue *async_idle_bfqq; -+ -+ struct bfq_entity *my_entity; -+ -+ int active_entities; -+ -+ struct rb_root rq_pos_tree; -+ -+ struct bfqg_stats stats; -+}; -+ -+#else -+struct bfq_group { -+ struct bfq_sched_data sched_data; -+ -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; -+ struct bfq_queue *async_idle_bfqq; -+ -+ struct rb_root rq_pos_tree; -+}; -+#endif -+ -+static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); -+ -+static unsigned int bfq_class_idx(struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ return bfqq ? 
bfqq->ioprio_class - 1 : -+ BFQ_DEFAULT_GRP_CLASS - 1; -+} -+ -+static unsigned int bfq_tot_busy_queues(struct bfq_data *bfqd) -+{ -+ return bfqd->busy_queues[0] + bfqd->busy_queues[1] + -+ bfqd->busy_queues[2]; -+} -+ -+static struct bfq_service_tree * -+bfq_entity_service_tree(struct bfq_entity *entity) -+{ -+ struct bfq_sched_data *sched_data = entity->sched_data; -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ unsigned int idx = bfq_class_idx(entity); -+ -+ BUG_ON(idx >= BFQ_IOPRIO_CLASSES); -+ BUG_ON(sched_data == NULL); -+ -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "%p %d", -+ sched_data->service_tree + idx, idx); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "%p %d", -+ sched_data->service_tree + idx, idx); -+ } -+#endif -+ return sched_data->service_tree + idx; -+} -+ -+static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) -+{ -+ return bic->bfqq[is_sync]; -+} -+ -+static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, -+ bool is_sync) -+{ -+ bic->bfqq[is_sync] = bfqq; -+} -+ -+static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) -+{ -+ return bic->icq.q->elevator->elevator_data; -+} -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ -+static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *group_entity = bfqq->entity.parent; -+ -+ if (!group_entity) -+ group_entity = &bfqq->bfqd->root_group->entity; -+ -+ return container_of(group_entity, struct bfq_group, entity); -+} -+ -+#else -+ -+static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) -+{ -+ return bfqq->bfqd->root_group; -+} -+ -+#endif -+ -+static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); -+static void bfq_put_queue(struct bfq_queue *bfqq); -+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, -+ struct bio *bio, bool is_sync, -+ 
struct bfq_io_cq *bic); -+static void bfq_end_wr_async_queues(struct bfq_data *bfqd, -+ struct bfq_group *bfqg); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); -+#endif -+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); -+ -+#endif /* _BFQ_H */ -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -new file mode 100644 -index 000000000000..7a4923231106 ---- /dev/null -+++ b/block/bfq-sched.c -@@ -0,0 +1,2077 @@ -+/* -+ * BFQ: Hierarchical B-WF2Q+ scheduler. -+ * -+ * Based on ideas and code from CFQ: -+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> -+ * -+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> -+ * Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2016 Paolo Valente <paolo.valente@linaro.org> -+ */ -+ -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); -+ -+/** -+ * bfq_gt - compare two timestamps. -+ * @a: first ts. -+ * @b: second ts. -+ * -+ * Return @a > @b, dealing with wrapping correctly. -+ */ -+static int bfq_gt(u64 a, u64 b) -+{ -+ return (s64)(a - b) > 0; -+} -+ -+static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree) -+{ -+ struct rb_node *node = tree->rb_node; -+ -+ return rb_entry(node, struct bfq_entity, rb_node); -+} -+ -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, -+ bool expiration); -+ -+static bool bfq_update_parent_budget(struct bfq_entity *next_in_service); -+ -+/** -+ * bfq_update_next_in_service - update sd->next_in_service -+ * @sd: sched_data for which to perform the update. -+ * @new_entity: if not NULL, pointer to the entity whose activation, -+ * requeueing or repositionig triggered the invocation of -+ * this function. 
-+ * @expiration: id true, this function is being invoked after the -+ * expiration of the in-service entity -+ * -+ * This function is called to update sd->next_in_service, which, in -+ * its turn, may change as a consequence of the insertion or -+ * extraction of an entity into/from one of the active trees of -+ * sd. These insertions/extractions occur as a consequence of -+ * activations/deactivations of entities, with some activations being -+ * 'true' activations, and other activations being requeueings (i.e., -+ * implementing the second, requeueing phase of the mechanism used to -+ * reposition an entity in its active tree; see comments on -+ * __bfq_activate_entity and __bfq_requeue_entity for details). In -+ * both the last two activation sub-cases, new_entity points to the -+ * just activated or requeued entity. -+ * -+ * Returns true if sd->next_in_service changes in such a way that -+ * entity->parent may become the next_in_service for its parent -+ * entity. -+ */ -+static bool bfq_update_next_in_service(struct bfq_sched_data *sd, -+ struct bfq_entity *new_entity, -+ bool expiration) -+{ -+ struct bfq_entity *next_in_service = sd->next_in_service; -+ struct bfq_queue *bfqq; -+ bool parent_sched_may_change = false; -+ bool change_without_lookup = false; -+ -+ /* -+ * If this update is triggered by the activation, requeueing -+ * or repositiong of an entity that does not coincide with -+ * sd->next_in_service, then a full lookup in the active tree -+ * can be avoided. In fact, it is enough to check whether the -+ * just-modified entity has the same priority as -+ * sd->next_in_service, is eligible and has a lower virtual -+ * finish time than sd->next_in_service. If this compound -+ * condition holds, then the new entity becomes the new -+ * next_in_service. Otherwise no change is needed. -+ */ -+ if (new_entity && new_entity != sd->next_in_service) { -+ /* -+ * Flag used to decide whether to replace -+ * sd->next_in_service with new_entity. 
Tentatively -+ * set to true, and left as true if -+ * sd->next_in_service is NULL. -+ */ -+ change_without_lookup = true; -+ -+ /* -+ * If there is already a next_in_service candidate -+ * entity, then compare timestamps to decide whether -+ * to replace sd->service_tree with new_entity. -+ */ -+ if (next_in_service) { -+ unsigned int new_entity_class_idx = -+ bfq_class_idx(new_entity); -+ struct bfq_service_tree *st = -+ sd->service_tree + new_entity_class_idx; -+ -+ change_without_lookup = -+ (new_entity_class_idx == -+ bfq_class_idx(next_in_service) -+ && -+ !bfq_gt(new_entity->start, st->vtime) -+ && -+ bfq_gt(next_in_service->finish, -+ new_entity->finish)); -+ } -+ -+ if (change_without_lookup) { -+ next_in_service = new_entity; -+ bfqq = bfq_entity_to_bfqq(next_in_service); -+ -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "chose without lookup"); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ else { -+ struct bfq_group *bfqg = -+ container_of(next_in_service, -+ struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data*)bfqg->bfqd, bfqg, -+ "chose without lookup"); -+ } -+#endif -+ } -+ } -+ -+ if (!change_without_lookup) /* lookup needed */ -+ next_in_service = bfq_lookup_next_entity(sd, expiration); -+ -+ if (next_in_service) { -+ bool new_budget_triggers_change = -+ bfq_update_parent_budget(next_in_service); -+ -+ parent_sched_may_change = !sd->next_in_service || -+ new_budget_triggers_change; -+ } -+ -+ sd->next_in_service = next_in_service; -+ -+ if (!next_in_service) -+ return parent_sched_may_change; -+ -+ bfqq = bfq_entity_to_bfqq(next_in_service); -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "chosen this queue"); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ else { -+ struct bfq_group *bfqg = -+ container_of(next_in_service, -+ struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "chosen this entity"); -+ } -+#endif -+ return parent_sched_may_change; -+} -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+/* both next loops stop 
at one of the child entities of the root group */ -+#define for_each_entity(entity) \ -+ for (; entity ; entity = entity->parent) -+ -+/* -+ * For each iteration, compute parent in advance, so as to be safe if -+ * entity is deallocated during the iteration. Such a deallocation may -+ * happen as a consequence of a bfq_put_queue that frees the bfq_queue -+ * containing entity. -+ */ -+#define for_each_entity_safe(entity, parent) \ -+ for (; entity && ({ parent = entity->parent; 1; }); entity = parent) -+ -+/* -+ * Returns true if this budget changes may let next_in_service->parent -+ * become the next_in_service entity for its parent entity. -+ */ -+static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) -+{ -+ struct bfq_entity *bfqg_entity; -+ struct bfq_group *bfqg; -+ struct bfq_sched_data *group_sd; -+ bool ret = false; -+ -+ BUG_ON(!next_in_service); -+ -+ group_sd = next_in_service->sched_data; -+ -+ bfqg = container_of(group_sd, struct bfq_group, sched_data); -+ /* -+ * bfq_group's my_entity field is not NULL only if the group -+ * is not the root group. We must not touch the root entity -+ * as it must never become an in-service entity. -+ */ -+ bfqg_entity = bfqg->my_entity; -+ if (bfqg_entity) { -+ if (bfqg_entity->budget > next_in_service->budget) -+ ret = true; -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "old budg: %d, new budg: %d", -+ bfqg_entity->budget, next_in_service->budget); -+ bfqg_entity->budget = next_in_service->budget; -+ } -+ -+ return ret; -+} -+ -+/* -+ * This function tells whether entity stops being a candidate for next -+ * service, according to the restrictive definition of the field -+ * next_in_service. In particular, this function is invoked for an -+ * entity that is about to be set in service. -+ * -+ * If entity is a queue, then the entity is no longer a candidate for -+ * next service according to the that definition, because entity is -+ * about to become the in-service queue. 
This function then returns -+ * true if entity is a queue. -+ * -+ * In contrast, entity could still be a candidate for next service if -+ * it is not a queue, and has more than one active child. In fact, -+ * even if one of its children is about to be set in service, other -+ * active children may still be the next to serve, for the parent -+ * entity, even according to the above definition. As a consequence, a -+ * non-queue entity is not a candidate for next-service only if it has -+ * only one active child. And only if this condition holds, then this -+ * function returns true for a non-queue entity. -+ */ -+static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) -+{ -+ struct bfq_group *bfqg; -+ -+ if (bfq_entity_to_bfqq(entity)) -+ return true; -+ -+ bfqg = container_of(entity, struct bfq_group, entity); -+ -+ BUG_ON(bfqg == ((struct bfq_data *)(bfqg->bfqd))->root_group); -+ BUG_ON(bfqg->active_entities == 0); -+ /* -+ * The field active_entities does not always contain the -+ * actual number of active children entities: it happens to -+ * not account for the in-service entity in case the latter is -+ * removed from its active tree (which may get done after -+ * invoking the function bfq_no_longer_next_in_service in -+ * bfq_get_next_queue). Fortunately, here, i.e., while -+ * bfq_no_longer_next_in_service is not yet completed in -+ * bfq_get_next_queue, bfq_active_extract has not yet been -+ * invoked, and thus active_entities still coincides with the -+ * actual number of active entities. 
-+ */ -+ if (bfqg->active_entities == 1) -+ return true; -+ -+ return false; -+} -+ -+#else /* BFQ_GROUP_IOSCHED_ENABLED */ -+#define for_each_entity(entity) \ -+ for (; entity ; entity = NULL) -+ -+#define for_each_entity_safe(entity, parent) \ -+ for (parent = NULL; entity ; entity = parent) -+ -+static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) -+{ -+ return false; -+} -+ -+static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) -+{ -+ return true; -+} -+ -+#endif /* BFQ_GROUP_IOSCHED_ENABLED */ -+ -+/* -+ * Shift for timestamp calculations. This actually limits the maximum -+ * service allowed in one timestamp delta (small shift values increase it), -+ * the maximum total weight that can be used for the queues in the system -+ * (big shift values increase it), and the period of virtual time -+ * wraparounds. -+ */ -+#define WFQ_SERVICE_SHIFT 22 -+ -+static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = NULL; -+ -+ BUG_ON(!entity); -+ -+ if (!entity->my_sched_data) -+ bfqq = container_of(entity, struct bfq_queue, entity); -+ -+ return bfqq; -+} -+ -+ -+/** -+ * bfq_delta - map service into the virtual time domain. -+ * @service: amount of service. -+ * @weight: scale factor (weight of an entity or weight sum). -+ */ -+static u64 bfq_delta(unsigned long service, unsigned long weight) -+{ -+ u64 d = (u64)service << WFQ_SERVICE_SHIFT; -+ -+ do_div(d, weight); -+ return d; -+} -+ -+/** -+ * bfq_calc_finish - assign the finish time to an entity. -+ * @entity: the entity to act upon. -+ * @service: the service to be charged to the entity. 
-+ */ -+static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ unsigned long long start, finish, delta; -+ -+ BUG_ON(entity->weight == 0); -+ -+ entity->finish = entity->start + -+ bfq_delta(service, entity->weight); -+ -+ start = ((entity->start>>10)*1000)>>12; -+ finish = ((entity->finish>>10)*1000)>>12; -+ delta = ((bfq_delta(service, entity->weight)>>10)*1000)>>12; -+ -+ if (bfqq) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "serv %lu, w %d", -+ service, entity->weight); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "start %llu, finish %llu, delta %llu", -+ start, finish, delta); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ } else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "group: serv %lu, w %d", -+ service, entity->weight); -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "group: start %llu, finish %llu, delta %llu", -+ start, finish, delta); -+#endif -+ } -+} -+ -+/** -+ * bfq_entity_of - get an entity from a node. -+ * @node: the node field of the entity. -+ * -+ * Convert a node pointer to the relative entity. This is used only -+ * to simplify the logic of some functions and not as the generic -+ * conversion mechanism because, e.g., in the tree walking functions, -+ * the check for a %NULL value would be redundant. -+ */ -+static struct bfq_entity *bfq_entity_of(struct rb_node *node) -+{ -+ struct bfq_entity *entity = NULL; -+ -+ if (node) -+ entity = rb_entry(node, struct bfq_entity, rb_node); -+ -+ return entity; -+} -+ -+/** -+ * bfq_extract - remove an entity from a tree. -+ * @root: the tree root. -+ * @entity: the entity to remove. -+ */ -+static void bfq_extract(struct rb_root *root, struct bfq_entity *entity) -+{ -+ BUG_ON(entity->tree != root); -+ -+ entity->tree = NULL; -+ rb_erase(&entity->rb_node, root); -+} -+ -+/** -+ * bfq_idle_extract - extract an entity from the idle tree. 
-+ * @st: the service tree of the owning @entity. -+ * @entity: the entity being removed. -+ */ -+static void bfq_idle_extract(struct bfq_service_tree *st, -+ struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ struct rb_node *next; -+ -+ BUG_ON(entity->tree != &st->idle); -+ -+ if (entity == st->first_idle) { -+ next = rb_next(&entity->rb_node); -+ st->first_idle = bfq_entity_of(next); -+ } -+ -+ if (entity == st->last_idle) { -+ next = rb_prev(&entity->rb_node); -+ st->last_idle = bfq_entity_of(next); -+ } -+ -+ bfq_extract(&st->idle, entity); -+ -+ if (bfqq) -+ list_del(&bfqq->bfqq_list); -+} -+ -+/** -+ * bfq_insert - generic tree insertion. -+ * @root: tree root. -+ * @entity: entity to insert. -+ * -+ * This is used for the idle and the active tree, since they are both -+ * ordered by finish time. -+ */ -+static void bfq_insert(struct rb_root *root, struct bfq_entity *entity) -+{ -+ struct bfq_entity *entry; -+ struct rb_node **node = &root->rb_node; -+ struct rb_node *parent = NULL; -+ -+ BUG_ON(entity->tree); -+ -+ while (*node) { -+ parent = *node; -+ entry = rb_entry(parent, struct bfq_entity, rb_node); -+ -+ if (bfq_gt(entry->finish, entity->finish)) -+ node = &parent->rb_left; -+ else -+ node = &parent->rb_right; -+ } -+ -+ rb_link_node(&entity->rb_node, parent, node); -+ rb_insert_color(&entity->rb_node, root); -+ -+ entity->tree = root; -+} -+ -+/** -+ * bfq_update_min - update the min_start field of a entity. -+ * @entity: the entity to update. -+ * @node: one of its children. -+ * -+ * This function is called when @entity may store an invalid value for -+ * min_start due to updates to the active tree. The function assumes -+ * that the subtree rooted at @node (which may be its left or its right -+ * child) has a valid min_start value. 
-+ */ -+static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node) -+{ -+ struct bfq_entity *child; -+ -+ if (node) { -+ child = rb_entry(node, struct bfq_entity, rb_node); -+ if (bfq_gt(entity->min_start, child->min_start)) -+ entity->min_start = child->min_start; -+ } -+} -+ -+/** -+ * bfq_update_active_node - recalculate min_start. -+ * @node: the node to update. -+ * -+ * @node may have changed position or one of its children may have moved, -+ * this function updates its min_start value. The left and right subtrees -+ * are assumed to hold a correct min_start value. -+ */ -+static void bfq_update_active_node(struct rb_node *node) -+{ -+ struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node); -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ entity->min_start = entity->start; -+ bfq_update_min(entity, node->rb_right); -+ bfq_update_min(entity, node->rb_left); -+ -+ if (bfqq) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "new min_start %llu", -+ ((entity->min_start>>10)*1000)>>12); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ } else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "new min_start %llu", -+ ((entity->min_start>>10)*1000)>>12); -+#endif -+ } -+} -+ -+/** -+ * bfq_update_active_tree - update min_start for the whole active tree. -+ * @node: the starting node. -+ * -+ * @node must be the deepest modified node after an update. This function -+ * updates its min_start using the values held by its children, assuming -+ * that they did not change, and then updates all the nodes that may have -+ * changed in the path to the root. The only nodes that may have changed -+ * are the ones in the path or their siblings. 
-+ */ -+static void bfq_update_active_tree(struct rb_node *node) -+{ -+ struct rb_node *parent; -+ -+up: -+ bfq_update_active_node(node); -+ -+ parent = rb_parent(node); -+ if (!parent) -+ return; -+ -+ if (node == parent->rb_left && parent->rb_right) -+ bfq_update_active_node(parent->rb_right); -+ else if (parent->rb_left) -+ bfq_update_active_node(parent->rb_left); -+ -+ node = parent; -+ goto up; -+} -+ -+static void bfq_weights_tree_add(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ struct rb_root *root); -+ -+static void __bfq_weights_tree_remove(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ struct rb_root *root); -+ -+static void bfq_weights_tree_remove(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq); -+ -+ -+/** -+ * bfq_active_insert - insert an entity in the active tree of its -+ * group/device. -+ * @st: the service tree of the entity. -+ * @entity: the entity being inserted. -+ * -+ * The active tree is ordered by finish time, but an extra key is kept -+ * per each node, containing the minimum value for the start times of -+ * its children (and the node itself), so it's possible to search for -+ * the eligible node with the lowest finish time in logarithmic time. 
-+ */ -+static void bfq_active_insert(struct bfq_service_tree *st, -+ struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ struct rb_node *node = &entity->rb_node; -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ struct bfq_sched_data *sd = NULL; -+ struct bfq_group *bfqg = NULL; -+ struct bfq_data *bfqd = NULL; -+#endif -+ -+ bfq_insert(&st->active, entity); -+ -+ if (node->rb_left) -+ node = node->rb_left; -+ else if (node->rb_right) -+ node = node->rb_right; -+ -+ bfq_update_active_tree(node); -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ sd = entity->sched_data; -+ bfqg = container_of(sd, struct bfq_group, sched_data); -+ BUG_ON(!bfqg); -+ bfqd = (struct bfq_data *)bfqg->bfqd; -+#endif -+ if (bfqq) -+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ if (bfqg != bfqd->root_group) { -+ BUG_ON(!bfqg); -+ BUG_ON(!bfqd); -+ bfqg->active_entities++; -+ } -+#endif -+} -+ -+/** -+ * bfq_ioprio_to_weight - calc a weight from an ioprio. -+ * @ioprio: the ioprio value to convert. -+ */ -+static unsigned short bfq_ioprio_to_weight(int ioprio) -+{ -+ BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); -+ return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF; -+} -+ -+/** -+ * bfq_weight_to_ioprio - calc an ioprio from a weight. -+ * @weight: the weight value to convert. -+ * -+ * To preserve as much as possible the old only-ioprio user interface, -+ * 0 is used as an escape ioprio value for weights (numerically) equal or -+ * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF. -+ */ -+static unsigned short bfq_weight_to_ioprio(int weight) -+{ -+ BUG_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT); -+ return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight < 0 ? 
-+ 0 : IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight; -+} -+ -+static void bfq_get_entity(struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ if (bfqq) { -+ bfqq->ref++; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "%p %d", -+ bfqq, bfqq->ref); -+ } -+} -+ -+/** -+ * bfq_find_deepest - find the deepest node that an extraction can modify. -+ * @node: the node being removed. -+ * -+ * Do the first step of an extraction in an rb tree, looking for the -+ * node that will replace @node, and returning the deepest node that -+ * the following modifications to the tree can touch. If @node is the -+ * last node in the tree return %NULL. -+ */ -+static struct rb_node *bfq_find_deepest(struct rb_node *node) -+{ -+ struct rb_node *deepest; -+ -+ if (!node->rb_right && !node->rb_left) -+ deepest = rb_parent(node); -+ else if (!node->rb_right) -+ deepest = node->rb_left; -+ else if (!node->rb_left) -+ deepest = node->rb_right; -+ else { -+ deepest = rb_next(node); -+ if (deepest->rb_right) -+ deepest = deepest->rb_right; -+ else if (rb_parent(deepest) != node) -+ deepest = rb_parent(deepest); -+ } -+ -+ return deepest; -+} -+ -+/** -+ * bfq_active_extract - remove an entity from the active tree. -+ * @st: the service_tree containing the tree. -+ * @entity: the entity being removed. 
-+ */ -+static void bfq_active_extract(struct bfq_service_tree *st, -+ struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ struct rb_node *node; -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ struct bfq_sched_data *sd = NULL; -+ struct bfq_group *bfqg = NULL; -+ struct bfq_data *bfqd = NULL; -+#endif -+ -+ node = bfq_find_deepest(&entity->rb_node); -+ bfq_extract(&st->active, entity); -+ -+ if (node) -+ bfq_update_active_tree(node); -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ sd = entity->sched_data; -+ bfqg = container_of(sd, struct bfq_group, sched_data); -+ BUG_ON(!bfqg); -+ bfqd = (struct bfq_data *)bfqg->bfqd; -+#endif -+ if (bfqq) -+ list_del(&bfqq->bfqq_list); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ if (bfqg != bfqd->root_group) { -+ BUG_ON(!bfqg); -+ BUG_ON(!bfqd); -+ BUG_ON(!bfqg->active_entities); -+ bfqg->active_entities--; -+ } -+#endif -+} -+ -+/** -+ * bfq_idle_insert - insert an entity into the idle tree. -+ * @st: the service tree containing the tree. -+ * @entity: the entity to insert. -+ */ -+static void bfq_idle_insert(struct bfq_service_tree *st, -+ struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ struct bfq_entity *first_idle = st->first_idle; -+ struct bfq_entity *last_idle = st->last_idle; -+ -+ if (!first_idle || bfq_gt(first_idle->finish, entity->finish)) -+ st->first_idle = entity; -+ if (!last_idle || bfq_gt(entity->finish, last_idle->finish)) -+ st->last_idle = entity; -+ -+ bfq_insert(&st->idle, entity); -+ -+ if (bfqq) -+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list); -+} -+ -+/** -+ * bfq_forget_entity - do not consider entity any longer for scheduling -+ * @st: the service tree. -+ * @entity: the entity being removed. -+ * @is_in_service: true if entity is currently the in-service entity. -+ * -+ * Forget everything about @entity. 
In addition, if entity represents -+ * a queue, and the latter is not in service, then release the service -+ * reference to the queue (the one taken through bfq_get_entity). In -+ * fact, in this case, there is really no more service reference to -+ * the queue, as the latter is also outside any service tree. If, -+ * instead, the queue is in service, then __bfq_bfqd_reset_in_service -+ * will take care of putting the reference when the queue finally -+ * stops being served. -+ */ -+static void bfq_forget_entity(struct bfq_service_tree *st, -+ struct bfq_entity *entity, -+ bool is_in_service) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ BUG_ON(!entity->on_st); -+ -+ entity->on_st = false; -+ st->wsum -= entity->weight; -+ if (bfqq && !is_in_service) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "(before): %p %d", -+ bfqq, bfqq->ref); -+ bfq_put_queue(bfqq); -+ } -+} -+ -+/** -+ * bfq_put_idle_entity - release the idle tree ref of an entity. -+ * @st: service tree for the entity. -+ * @entity: the entity being released. -+ */ -+static void bfq_put_idle_entity(struct bfq_service_tree *st, -+ struct bfq_entity *entity) -+{ -+ bfq_idle_extract(st, entity); -+ bfq_forget_entity(st, entity, -+ entity == entity->sched_data->in_service_entity); -+} -+ -+/** -+ * bfq_forget_idle - update the idle tree if necessary. -+ * @st: the service tree to act upon. -+ * -+ * To preserve the global O(log N) complexity we only remove one entry here; -+ * as the idle tree will not grow indefinitely this can be done safely. -+ */ -+static void bfq_forget_idle(struct bfq_service_tree *st) -+{ -+ struct bfq_entity *first_idle = st->first_idle; -+ struct bfq_entity *last_idle = st->last_idle; -+ -+ if (RB_EMPTY_ROOT(&st->active) && last_idle && -+ !bfq_gt(last_idle->finish, st->vtime)) { -+ /* -+ * Forget the whole idle tree, increasing the vtime past -+ * the last finish time of idle entities. 
-+ */ -+ st->vtime = last_idle->finish; -+ } -+ -+ if (first_idle && !bfq_gt(first_idle->finish, st->vtime)) -+ bfq_put_idle_entity(st, first_idle); -+} -+ -+/* -+ * Update weight and priority of entity. If update_class_too is true, -+ * then update the ioprio_class of entity too. -+ * -+ * The reason why the update of ioprio_class is controlled through the -+ * last parameter is as follows. Changing the ioprio class of an -+ * entity implies changing the destination service trees for that -+ * entity. If such a change occurred when the entity is already on one -+ * of the service trees for its previous class, then the state of the -+ * entity would become more complex: none of the new possible service -+ * trees for the entity, according to bfq_entity_service_tree(), would -+ * match any of the possible service trees on which the entity -+ * is. Complex operations involving these trees, such as entity -+ * activations and deactivations, should take into account this -+ * additional complexity. To avoid this issue, this function is -+ * invoked with update_class_too unset in the points in the code where -+ * entity may happen to be on some tree. 
-+ */ -+static struct bfq_service_tree * -+__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, -+ struct bfq_entity *entity, -+ bool update_class_too) -+{ -+ struct bfq_service_tree *new_st = old_st; -+ -+ if (entity->prio_changed) { -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ unsigned int prev_weight, new_weight; -+ struct bfq_data *bfqd = NULL; -+ struct rb_root *root; -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ struct bfq_sched_data *sd; -+ struct bfq_group *bfqg; -+#endif -+ -+ if (bfqq) -+ bfqd = bfqq->bfqd; -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ else { -+ sd = entity->my_sched_data; -+ bfqg = container_of(sd, struct bfq_group, sched_data); -+ BUG_ON(!bfqg); -+ bfqd = (struct bfq_data *)bfqg->bfqd; -+ BUG_ON(!bfqd); -+ } -+#endif -+ -+ BUG_ON(entity->tree && update_class_too); -+ BUG_ON(old_st->wsum < entity->weight); -+ old_st->wsum -= entity->weight; -+ -+ if (entity->new_weight != entity->orig_weight) { -+ if (entity->new_weight < BFQ_MIN_WEIGHT || -+ entity->new_weight > BFQ_MAX_WEIGHT) { -+ pr_crit("update_weight_prio: new_weight %d\n", -+ entity->new_weight); -+ if (entity->new_weight < BFQ_MIN_WEIGHT) -+ entity->new_weight = BFQ_MIN_WEIGHT; -+ else -+ entity->new_weight = BFQ_MAX_WEIGHT; -+ } -+ entity->orig_weight = entity->new_weight; -+ if (bfqq) -+ bfqq->ioprio = -+ bfq_weight_to_ioprio(entity->orig_weight); -+ } -+ -+ if (bfqq && update_class_too) -+ bfqq->ioprio_class = bfqq->new_ioprio_class; -+ -+ /* -+ * Reset prio_changed only if the ioprio_class change -+ * is not pending any longer. -+ */ -+ if (!bfqq || bfqq->ioprio_class == bfqq->new_ioprio_class) -+ entity->prio_changed = 0; -+ -+ /* -+ * NOTE: here we may be changing the weight too early, -+ * this will cause unfairness. The correct approach -+ * would have required additional complexity to defer -+ * weight changes to the proper time instants (i.e., -+ * when entity->finish <= old_st->vtime). 
-+ */ -+ new_st = bfq_entity_service_tree(entity); -+ -+ prev_weight = entity->weight; -+ new_weight = entity->orig_weight * -+ (bfqq ? bfqq->wr_coeff : 1); -+ /* -+ * If the weight of the entity changes and the entity is a -+ * queue, remove the entity from its old weight counter (if -+ * there is a counter associated with the entity). -+ */ -+ if (prev_weight != new_weight && bfqq) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "weight changed %d %d(%d %d)", -+ prev_weight, new_weight, -+ entity->orig_weight, -+ bfqq->wr_coeff); -+ -+ root = &bfqd->queue_weights_tree; -+ __bfq_weights_tree_remove(bfqd, bfqq, root); -+ } -+ entity->weight = new_weight; -+ /* -+ * Add the entity, if it is not a weight-raised queue, to the -+ * counter associated with its new weight. -+ */ -+ if (prev_weight != new_weight && bfqq && bfqq->wr_coeff == 1) { -+ /* If we get here, root has been initialized. */ -+ bfq_weights_tree_add(bfqd, bfqq, root); -+ } -+ -+ new_st->wsum += entity->weight; -+ -+ if (new_st != old_st) { -+ BUG_ON(!update_class_too); -+ entity->start = new_st->vtime; -+ } -+ } -+ -+ return new_st; -+} -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg); -+#endif -+ -+/** -+ * bfq_bfqq_served - update the scheduler status after selection for -+ * service. -+ * @bfqq: the queue being served. -+ * @served: bytes to transfer. -+ * -+ * NOTE: this can be optimized, as the timestamps of upper level entities -+ * are synchronized every time a new bfqq is selected for service. By now, -+ * we keep it to better check consistency. 
-+ */ -+static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ struct bfq_service_tree *st; -+ -+ if (!bfqq->service_from_backlogged) -+ bfqq->first_IO_time = jiffies; -+ -+ if (bfqq->wr_coeff > 1) -+ bfqq->service_from_wr += served; -+ -+ bfqq->service_from_backlogged += served; -+ for_each_entity(entity) { -+ st = bfq_entity_service_tree(entity); -+ -+ entity->service += served; -+ -+ BUG_ON(st->wsum == 0); -+ -+ st->vtime += bfq_delta(served, st->wsum); -+ bfq_forget_idle(st); -+ } -+#ifndef BFQ_MQ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ bfqg_stats_set_start_empty_time(bfqq_group(bfqq)); -+#endif -+#endif -+ st = bfq_entity_service_tree(&bfqq->entity); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs, vtime %llu on %p", -+ served, ((st->vtime>>10)*1000)>>12, st); -+} -+ -+/** -+ * bfq_bfqq_charge_time - charge an amount of service equivalent to the length -+ * of the time interval during which bfqq has been in -+ * service. -+ * @bfqd: the device -+ * @bfqq: the queue that needs a service update. -+ * @time_ms: the amount of time during which the queue has received service -+ * -+ * If a queue does not consume its budget fast enough, then providing -+ * the queue with service fairness may impair throughput, more or less -+ * severely. For this reason, queues that consume their budget slowly -+ * are provided with time fairness instead of service fairness. This -+ * goal is achieved through the BFQ scheduling engine, even if such an -+ * engine works in the service, and not in the time domain. The trick -+ * is charging these queues with an inflated amount of service, equal -+ * to the amount of service that they would have received during their -+ * service slot if they had been fast, i.e., if their requests had -+ * been dispatched at a rate equal to the estimated peak rate. 
-+ * -+ * It is worth noting that time fairness can cause important -+ * distortions in terms of bandwidth distribution, on devices with -+ * internal queueing. The reason is that I/O requests dispatched -+ * during the service slot of a queue may be served after that service -+ * slot is finished, and may have a total processing time loosely -+ * correlated with the duration of the service slot. This is -+ * especially true for short service slots. -+ */ -+static void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ unsigned long time_ms) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ unsigned long timeout_ms = jiffies_to_msecs(bfq_timeout); -+ unsigned long bounded_time_ms = min(time_ms, timeout_ms); -+ int serv_to_charge_for_time = -+ (bfqd->bfq_max_budget * bounded_time_ms) / timeout_ms; -+ int tot_serv_to_charge = max(serv_to_charge_for_time, entity->service); -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "%lu/%lu ms, %d/%d/%d/%d sectors", -+ time_ms, timeout_ms, -+ entity->service, -+ tot_serv_to_charge, -+ bfqd->bfq_max_budget, -+ entity->budget); -+ -+ /* Increase budget to avoid inconsistencies */ -+ if (tot_serv_to_charge > entity->budget) -+ entity->budget = tot_serv_to_charge; -+ -+ bfq_bfqq_served(bfqq, -+ max_t(int, 0, tot_serv_to_charge - entity->service)); -+} -+ -+static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, -+ struct bfq_service_tree *st, -+ bool backshifted) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ struct bfq_sched_data *sd = entity->sched_data; -+ -+ /* -+ * When this function is invoked, entity is not in any service -+ * tree, then it is safe to invoke next function with the last -+ * parameter set (see the comments on the function). 
-+ */ -+ BUG_ON(entity->tree); -+ st = __bfq_entity_update_weight_prio(st, entity, true); -+ bfq_calc_finish(entity, entity->budget); -+ -+ /* -+ * If some queues enjoy backshifting for a while, then their -+ * (virtual) finish timestamps may happen to become lower and -+ * lower than the system virtual time. In particular, if -+ * these queues often happen to be idle for short time -+ * periods, and during such time periods other queues with -+ * higher timestamps happen to be busy, then the backshifted -+ * timestamps of the former queues can become much lower than -+ * the system virtual time. In fact, to serve the queues with -+ * higher timestamps while the ones with lower timestamps are -+ * idle, the system virtual time may be pushed-up to much -+ * higher values than the finish timestamps of the idle -+ * queues. As a consequence, the finish timestamps of all new -+ * or newly activated queues may end up being much larger than -+ * those of lucky queues with backshifted timestamps. The -+ * latter queues may then monopolize the device for a lot of -+ * time. This would simply break service guarantees. -+ * -+ * To reduce this problem, push up a little bit the -+ * backshifted timestamps of the queue associated with this -+ * entity (only a queue can happen to have the backshifted -+ * flag set): just enough to let the finish timestamp of the -+ * queue be equal to the current value of the system virtual -+ * time. This may introduce a little unfairness among queues -+ * with backshifted timestamps, but it does not break -+ * worst-case fairness guarantees. -+ * -+ * As a special case, if bfqq is weight-raised, push up -+ * timestamps much less, to keep very low the probability that -+ * this push up causes the backshifted finish timestamps of -+ * weight-raised queues to become higher than the backshifted -+ * finish timestamps of non weight-raised queues. 
-+ */ -+ if (backshifted && bfq_gt(st->vtime, entity->finish)) { -+ unsigned long delta = st->vtime - entity->finish; -+ -+ if (bfqq) -+ delta /= bfqq->wr_coeff; -+ -+ entity->start += delta; -+ entity->finish += delta; -+ -+ if (bfqq) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "new queue finish %llu", -+ ((entity->finish>>10)*1000)>>12); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ } else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "new group finish %llu", -+ ((entity->finish>>10)*1000)>>12); -+#endif -+ } -+ } -+ -+ bfq_active_insert(st, entity); -+ -+ if (bfqq) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "queue %seligible in st %p", -+ entity->start <= st->vtime ? "" : "non ", st); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ } else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "group %seligible in st %p", -+ entity->start <= st->vtime ? "" : "non ", st); -+#endif -+ } -+ BUG_ON(RB_EMPTY_ROOT(&st->active)); -+ BUG_ON(&st->active != &sd->service_tree->active && -+ &st->active != &(sd->service_tree+1)->active && -+ &st->active != &(sd->service_tree+2)->active); -+} -+ -+/** -+ * __bfq_activate_entity - handle activation of entity. -+ * @entity: the entity being activated. -+ * @non_blocking_wait_rq: true if entity was waiting for a request -+ * -+ * Called for a 'true' activation, i.e., if entity is not active and -+ * one of its children receives a new request. -+ * -+ * Basically, this function updates the timestamps of entity and -+ * inserts entity into its active tree, after possibly extracting it -+ * from its idle tree. 
-+ */ -+static void __bfq_activate_entity(struct bfq_entity *entity, -+ bool non_blocking_wait_rq) -+{ -+ struct bfq_sched_data *sd = entity->sched_data; -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity); -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ bool backshifted = false; -+ unsigned long long min_vstart; -+ -+ BUG_ON(!sd); -+ BUG_ON(!st); -+ -+ /* See comments on bfq_bfqq_update_budg_for_activation */ -+ if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) { -+ backshifted = true; -+ min_vstart = entity->finish; -+ } else -+ min_vstart = st->vtime; -+ -+ if (entity->tree == &st->idle) { -+ /* -+ * Must be on the idle tree, bfq_idle_extract() will -+ * check for that. -+ */ -+ bfq_idle_extract(st, entity); -+ BUG_ON(entity->tree); -+ entity->start = bfq_gt(min_vstart, entity->finish) ? -+ min_vstart : entity->finish; -+ } else { -+ BUG_ON(entity->tree); -+ /* -+ * The finish time of the entity may be invalid, and -+ * it is in the past for sure, otherwise the queue -+ * would have been on the idle tree. -+ */ -+ entity->start = min_vstart; -+ st->wsum += entity->weight; -+ /* -+ * entity is about to be inserted into a service tree, -+ * and then set in service: get a reference to make -+ * sure entity does not disappear until it is no -+ * longer in service or scheduled for service.
-+ */ -+ bfq_get_entity(entity); -+ -+ BUG_ON(entity->on_st && bfqq); -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ if (entity->on_st && !bfqq) { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, -+ entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, -+ bfqg, -+ "activate bug, class %d in_service %p", -+ bfq_class_idx(entity), sd->in_service_entity); -+ } -+#endif -+ BUG_ON(entity->on_st && !bfqq); -+ entity->on_st = true; -+ } -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ if (!bfq_entity_to_bfqq(entity)) { /* bfq_group */ -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ struct bfq_data *bfqd = bfqg->bfqd; -+ -+ BUG_ON(!bfqd); -+ if (!entity->in_groups_with_pending_reqs) { -+ entity->in_groups_with_pending_reqs = true; -+ bfqd->num_groups_with_pending_reqs++; -+ } -+ bfq_log_bfqg(bfqd, bfqg, "num_groups_with_pending_reqs %u", -+ bfqd->num_groups_with_pending_reqs); -+ } -+#endif -+ -+ bfq_update_fin_time_enqueue(entity, st, backshifted); -+} -+ -+/** -+ * __bfq_requeue_entity - handle requeueing or repositioning of an entity. -+ * @entity: the entity being requeued or repositioned. -+ * -+ * Requeueing is needed if this entity stops being served, which -+ * happens if a leaf descendant entity has expired. On the other hand, -+ * repositioning is needed if the next_inservice_entity for the child -+ * entity has changed. See the comments inside the function for -+ * details. -+ * -+ * Basically, this function: 1) removes entity from its active tree if -+ * present there, 2) updates the timestamps of entity and 3) inserts -+ * entity back into its active tree (in the new, right position for -+ * the new values of the timestamps). 
-+ */ -+static void __bfq_requeue_entity(struct bfq_entity *entity) -+{ -+ struct bfq_sched_data *sd = entity->sched_data; -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity); -+ -+ BUG_ON(!sd); -+ BUG_ON(!st); -+ -+ BUG_ON(entity != sd->in_service_entity && -+ entity->tree != &st->active); -+ -+ if (entity == sd->in_service_entity) { -+ /* -+ * We are requeueing the current in-service entity, -+ * which may have to be done for one of the following -+ * reasons: -+ * - entity represents the in-service queue, and the -+ * in-service queue is being requeued after an -+ * expiration; -+ * - entity represents a group, and its budget has -+ * changed because one of its child entities has -+ * just been either activated or requeued for some -+ * reason; the timestamps of the entity need then to -+ * be updated, and the entity needs to be enqueued -+ * or repositioned accordingly. -+ * -+ * In particular, before requeueing, the start time of -+ * the entity must be moved forward to account for the -+ * service that the entity has received while in -+ * service. This is done by the next instructions. The -+ * finish time will then be updated according to this -+ * new value of the start time, and to the budget of -+ * the entity. -+ */ -+ bfq_calc_finish(entity, entity->service); -+ entity->start = entity->finish; -+ BUG_ON(entity->tree && entity->tree == &st->idle); -+ BUG_ON(entity->tree && entity->tree != &st->active); -+ /* -+ * In addition, if the entity had more than one child -+ * when set in service, then it was not extracted from -+ * the active tree. This implies that the position of -+ * the entity in the active tree may need to be -+ * changed now, because we have just updated the start -+ * time of the entity, and we will update its finish -+ * time in a moment (the requeueing is then, more -+ * precisely, a repositioning in this case). 
To -+ * implement this repositioning, we: 1) dequeue the -+ * entity here, 2) update the finish time and requeue -+ * the entity according to the new timestamps below. -+ */ -+ if (entity->tree) -+ bfq_active_extract(st, entity); -+ } else { /* The entity is already active, and not in service */ -+ /* -+ * In this case, this function gets called only if the -+ * next_in_service entity below this entity has -+ * changed, and this change has caused the budget of -+ * this entity to change, which, finally, implies that -+ * the finish time of this entity must be -+ * updated. Such an update may cause the scheduling, -+ * i.e., the position in the active tree, of this -+ * entity to change. We handle this change by: 1) -+ * dequeueing the entity here, 2) updating the finish -+ * time and requeueing the entity according to the new -+ * timestamps below. This is the same approach as the -+ * non-extracted-entity sub-case above. -+ */ -+ bfq_active_extract(st, entity); -+ } -+ -+ bfq_update_fin_time_enqueue(entity, st, false); -+} -+ -+static void __bfq_activate_requeue_entity(struct bfq_entity *entity, -+ struct bfq_sched_data *sd, -+ bool non_blocking_wait_rq) -+{ -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity); -+ -+ if (sd->in_service_entity == entity || entity->tree == &st->active) -+ /* -+ * in service or already queued on the active tree, -+ * requeue or reposition -+ */ -+ __bfq_requeue_entity(entity); -+ else -+ /* -+ * Not in service and not queued on its active tree: -+ * the entity is idle and this is a true activation. -+ */ -+ __bfq_activate_entity(entity, non_blocking_wait_rq); -+} -+ -+ -+/** -+ * bfq_activate_requeue_entity - activate or requeue an entity representing a bfq_queue, -+ * and activate, requeue or reposition all ancestors -+ * for which such an update becomes necessary. -+ * @entity: the entity to activate.
-+ * @non_blocking_wait_rq: true if this entity was waiting for a request -+ * @requeue: true if this is a requeue, which implies that bfqq is -+ * being expired; thus ALL its ancestors stop being served and must -+ * therefore be requeued -+ * @expiration: true if this function is being invoked in the expiration path -+ * of the in-service queue -+ */ -+static void bfq_activate_requeue_entity(struct bfq_entity *entity, -+ bool non_blocking_wait_rq, -+ bool requeue, bool expiration) -+{ -+ struct bfq_sched_data *sd; -+ -+ for_each_entity(entity) { -+ BUG_ON(!entity); -+ sd = entity->sched_data; -+ __bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq); -+ -+ BUG_ON(RB_EMPTY_ROOT(&sd->service_tree->active) && -+ RB_EMPTY_ROOT(&(sd->service_tree+1)->active) && -+ RB_EMPTY_ROOT(&(sd->service_tree+2)->active)); -+ -+ if (!bfq_update_next_in_service(sd, entity, expiration) && -+ !requeue) { -+ BUG_ON(!sd->next_in_service); -+ break; -+ } -+ BUG_ON(!sd->next_in_service); -+ } -+} -+ -+/** -+ * __bfq_deactivate_entity - update sched_data and service trees for -+ * entity, so as to represent entity as inactive -+ * @entity: the entity being deactivated. -+ * @ins_into_idle_tree: if false, the entity will not be put into the -+ * idle tree. -+ * -+ * If necessary and allowed, puts entity into the idle tree. NOTE: -+ * entity may be on no tree if in service. -+ */ -+static bool __bfq_deactivate_entity(struct bfq_entity *entity, -+ bool ins_into_idle_tree) -+{ -+ struct bfq_sched_data *sd = entity->sched_data; -+ struct bfq_service_tree *st; -+ bool is_in_service; -+ -+ if (!entity->on_st) { /* entity never activated, or already inactive */ -+ BUG_ON(sd && entity == sd->in_service_entity); -+ return false; -+ } -+ -+ /* -+ * If we get here, then entity is active, which implies that -+ * bfq_group_set_parent has already been invoked for the group -+ * represented by entity. Therefore, the field -+ * entity->sched_data has been set, and we can safely use it. 
-+ */ -+ st = bfq_entity_service_tree(entity); -+ is_in_service = entity == sd->in_service_entity; -+ -+ BUG_ON(is_in_service && entity->tree && entity->tree != &st->active); -+ -+ bfq_calc_finish(entity, entity->service); -+ -+ if (is_in_service) { -+ sd->in_service_entity = NULL; -+ } else -+ /* -+ * Non in-service entity: nobody will take care of -+ * resetting its service counter on expiration. Do it -+ * now. -+ */ -+ entity->service = 0; -+ -+ if (entity->tree == &st->active) -+ bfq_active_extract(st, entity); -+ else if (!is_in_service && entity->tree == &st->idle) -+ bfq_idle_extract(st, entity); -+ else if (entity->tree) -+ BUG(); -+ -+ if (!ins_into_idle_tree || !bfq_gt(entity->finish, st->vtime)) -+ bfq_forget_entity(st, entity, is_in_service); -+ else -+ bfq_idle_insert(st, entity); -+ -+ return true; -+} -+ -+/** -+ * bfq_deactivate_entity - deactivate an entity representing a bfq_queue. -+ * @entity: the entity to deactivate. -+ * @ins_into_idle_tree: true if the entity can be put into the idle tree -+ * @expiration: true if this function is being invoked in the expiration path -+ * of the in-service queue -+ */ -+static void bfq_deactivate_entity(struct bfq_entity *entity, -+ bool ins_into_idle_tree, -+ bool expiration) -+{ -+ struct bfq_sched_data *sd; -+ struct bfq_entity *parent = NULL; -+ -+ for_each_entity_safe(entity, parent) { -+ sd = entity->sched_data; -+ -+ BUG_ON(sd == NULL); /* -+ * It would mean that this is the -+ * root group. -+ */ -+ -+ BUG_ON(expiration && entity != sd->in_service_entity); -+ -+ BUG_ON(entity != sd->in_service_entity && -+ entity->tree == -+ &bfq_entity_service_tree(entity)->active && -+ !sd->next_in_service); -+ -+ if (!__bfq_deactivate_entity(entity, ins_into_idle_tree)) { -+ /* -+ * entity is not in any tree any more, so -+ * this deactivation is a no-op, and there is -+ * nothing to change for upper-level entities -+ * (in case of expiration, this can never -+ * happen). 
-+ */ -+ BUG_ON(expiration); /* -+ * entity cannot be already out of -+ * any tree -+ */ -+ return; -+ } -+ -+ if (sd->next_in_service == entity) -+ /* -+ * entity was the next_in_service entity, -+ * then, since entity has just been -+ * deactivated, a new one must be found. -+ */ -+ bfq_update_next_in_service(sd, NULL, expiration); -+ -+ if (sd->next_in_service || sd->in_service_entity) { -+ /* -+ * The parent entity is still active, because -+ * either next_in_service or in_service_entity -+ * is not NULL. So, no further upwards -+ * deactivation must be performed. Yet, -+ * next_in_service has changed. Then the -+ * schedule does need to be updated upwards. -+ * -+ * NOTE If in_service_entity is not NULL, then -+ * next_in_service may happen to be NULL, -+ * although the parent entity is evidently -+ * active. This happens if 1) the entity -+ * pointed by in_service_entity is the only -+ * active entity in the parent entity, and 2) -+ * according to the definition of -+ * next_in_service, the in_service_entity -+ * cannot be considered as -+ * next_in_service. See the comments on the -+ * definition of next_in_service for details. -+ */ -+ BUG_ON(sd->next_in_service == entity); -+ BUG_ON(sd->in_service_entity == entity); -+ break; -+ } -+ -+ /* -+ * If we get here, then the parent is no more -+ * backlogged and we need to propagate the -+ * deactivation upwards. Thus let the loop go on. -+ */ -+ -+ /* -+ * Also let parent be queued into the idle tree on -+ * deactivation, to preserve service guarantees, and -+ * assuming that who invoked this function does not -+ * need parent entities too to be removed completely. -+ */ -+ ins_into_idle_tree = true; -+ } -+ -+ /* -+ * If the deactivation loop is fully executed, then there are -+ * no more entities to touch and next loop is not executed at -+ * all. Otherwise, requeue remaining entities if they are -+ * about to stop receiving service, or reposition them if this -+ * is not the case. 
-+ */ -+ entity = parent; -+ for_each_entity(entity) { -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ /* -+ * Invoke __bfq_requeue_entity on entity, even if -+ * already active, to requeue/reposition it in the -+ * active tree (because sd->next_in_service has -+ * changed) -+ */ -+ __bfq_requeue_entity(entity); -+ -+ sd = entity->sched_data; -+ BUG_ON(expiration && sd->in_service_entity != entity); -+ -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "invoking udpdate_next for this queue"); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ else { -+ struct bfq_group *bfqg = -+ container_of(entity, -+ struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "invoking udpdate_next for this entity"); -+ } -+#endif -+ if (!bfq_update_next_in_service(sd, entity, expiration) && -+ !expiration) -+ /* -+ * next_in_service unchanged or not causing -+ * any change in entity->parent->sd, and no -+ * requeueing needed for expiration: stop -+ * here. -+ */ -+ break; -+ } -+} -+ -+/** -+ * bfq_calc_vtime_jump - compute the value to which the vtime should jump, -+ * if needed, to have at least one entity eligible. -+ * @st: the service tree to act upon. -+ * -+ * Assumes that st is not empty. 
-+ */ -+static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st) -+{ -+ struct bfq_entity *root_entity = bfq_root_active_entity(&st->active); -+ -+ if (bfq_gt(root_entity->min_start, st->vtime)) { -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(root_entity); -+ -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "new value %llu", -+ ((root_entity->min_start>>10)*1000)>>12); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ else { -+ struct bfq_group *bfqg = -+ container_of(root_entity, struct bfq_group, -+ entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "new value %llu", -+ ((root_entity->min_start>>10)*1000)>>12); -+ } -+#endif -+ return root_entity->min_start; -+ } -+ return st->vtime; -+} -+ -+static void bfq_update_vtime(struct bfq_service_tree *st, u64 new_value) -+{ -+ if (new_value > st->vtime) { -+ st->vtime = new_value; -+ bfq_forget_idle(st); -+ } -+} -+ -+/** -+ * bfq_first_active_entity - find the eligible entity with -+ * the smallest finish time -+ * @st: the service tree to select from. -+ * @vtime: the system virtual time to use as a reference for eligibility -+ * -+ * This function searches the first schedulable entity, starting from the -+ * root of the tree and going on the left every time on this side there is -+ * a subtree with at least one eligible (start <= vtime) entity. The path on -+ * the right is followed only if a) the left subtree contains no eligible -+ * entities and b) no eligible entity has been found yet.
-+ */ -+static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st, -+ u64 vtime) -+{ -+ struct bfq_entity *entry, *first = NULL; -+ struct rb_node *node = st->active.rb_node; -+ -+ while (node) { -+ entry = rb_entry(node, struct bfq_entity, rb_node); -+left: -+ if (!bfq_gt(entry->start, vtime)) -+ first = entry; -+ -+ BUG_ON(bfq_gt(entry->min_start, vtime)); -+ -+ if (node->rb_left) { -+ entry = rb_entry(node->rb_left, -+ struct bfq_entity, rb_node); -+ if (!bfq_gt(entry->min_start, vtime)) { -+ node = node->rb_left; -+ goto left; -+ } -+ } -+ if (first) -+ break; -+ node = node->rb_right; -+ } -+ -+ BUG_ON(!first && !RB_EMPTY_ROOT(&st->active)); -+ return first; -+} -+ -+/** -+ * __bfq_lookup_next_entity - return the first eligible entity in @st. -+ * @st: the service tree. -+ * -+ * If there is no in-service entity for the sched_data st belongs to, -+ * then return the entity that will be set in service if: -+ * 1) the parent entity this st belongs to is set in service; -+ * 2) no entity belonging to such parent entity undergoes a state change -+ * that would influence the timestamps of the entity (e.g., becomes idle, -+ * becomes backlogged, changes its budget, ...). -+ * -+ * In this first case, update the virtual time in @st too (see the -+ * comments on this update inside the function). -+ * -+ * In contrast, if there is an in-service entity, then return the -+ * entity that would be set in service if not only the above -+ * conditions, but also the next one held true: the currently -+ * in-service entity, on expiration, -+ * 1) gets a finish time equal to the current one, or -+ * 2) is not eligible any more, or -+ * 3) is idle.
-+ */ -+static struct bfq_entity * -+__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service) -+{ -+ struct bfq_entity *entity; -+ u64 new_vtime; -+ struct bfq_queue *bfqq; -+ -+ if (RB_EMPTY_ROOT(&st->active)) -+ return NULL; -+ -+ /* -+ * Get the value of the system virtual time for which at -+ * least one entity is eligible. -+ */ -+ new_vtime = bfq_calc_vtime_jump(st); -+ -+ /* -+ * If there is no in-service entity for the sched_data this -+ * active tree belongs to, then push the system virtual time -+ * up to the value that guarantees that at least one entity is -+ * eligible. If, instead, there is an in-service entity, then -+ * do not make any such update, because there is already an -+ * eligible entity, namely the in-service one (even if the -+ * entity is not on st, because it was extracted when set in -+ * service). -+ */ -+ if (!in_service) -+ bfq_update_vtime(st, new_vtime); -+ -+ entity = bfq_first_active_entity(st, new_vtime); -+ BUG_ON(bfq_gt(entity->start, new_vtime)); -+ -+ /* Log some information */ -+ bfqq = bfq_entity_to_bfqq(entity); -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "start %llu vtime %llu st %p", -+ ((entity->start>>10)*1000)>>12, -+ ((new_vtime>>10)*1000)>>12, st); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "start %llu vtime %llu (%llu) st %p", -+ ((entity->start>>10)*1000)>>12, -+ ((st->vtime>>10)*1000)>>12, -+ ((new_vtime>>10)*1000)>>12, st); -+ } -+#endif -+ -+ BUG_ON(!entity); -+ -+ return entity; -+} -+ -+/** -+ * bfq_lookup_next_entity - return the first eligible entity in @sd. -+ * @sd: the sched_data. -+ * @expiration: true if we are on the expiration path of the in-service queue -+ * -+ * This function is invoked when there has been a change in the trees -+ * for sd, and we need to know what is the new next entity to serve -+ * after this change. 
-+ */ -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, -+ bool expiration) -+{ -+ struct bfq_service_tree *st = sd->service_tree; -+ struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1); -+ struct bfq_entity *entity = NULL; -+ struct bfq_queue *bfqq; -+ int class_idx = 0; -+ -+ BUG_ON(!sd); -+ BUG_ON(!st); -+ /* -+ * Choose from idle class, if needed to guarantee a minimum -+ * bandwidth to this class (and if there is some active entity -+ * in idle class). This should also mitigate -+ * priority-inversion problems in case a low priority task is -+ * holding file system resources. -+ */ -+ if (time_is_before_jiffies(sd->bfq_class_idle_last_service + -+ BFQ_CL_IDLE_TIMEOUT)) { -+ if (!RB_EMPTY_ROOT(&idle_class_st->active)) -+ class_idx = BFQ_IOPRIO_CLASSES - 1; -+ /* About to be served if backlogged, or not yet backlogged */ -+ sd->bfq_class_idle_last_service = jiffies; -+ } -+ -+ /* -+ * Find the next entity to serve for the highest-priority -+ * class, unless the idle class needs to be served. -+ */ -+ for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) { -+ /* -+ * If expiration is true, then bfq_lookup_next_entity -+ * is being invoked as a part of the expiration path -+ * of the in-service queue. In this case, even if -+ * sd->in_service_entity is not NULL, -+ * sd->in_service_entity at this point is actually not -+ * in service any more, and, if needed, has already -+ * been properly queued or requeued into the right -+ * tree. The reason why sd->in_service_entity is still -+ * not NULL here, even if expiration is true, is that -+ * sd->in_service_entity is reset as a last step in the -+ * expiration path. So, if expiration is true, tell -+ * __bfq_lookup_next_entity that there is no -+ * sd->in_service_entity.
-+ */ -+ entity = __bfq_lookup_next_entity(st + class_idx, -+ sd->in_service_entity && -+ !expiration); -+ -+ if (entity) -+ break; -+ } -+ -+ BUG_ON(!entity && -+ (!RB_EMPTY_ROOT(&st->active) || !RB_EMPTY_ROOT(&(st+1)->active) || -+ !RB_EMPTY_ROOT(&(st+2)->active))); -+ -+ if (!entity) -+ return NULL; -+ -+ /* Log some information */ -+ bfqq = bfq_entity_to_bfqq(entity); -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "chosen from st %p %d", -+ st + class_idx, class_idx); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "chosen from st %p %d", -+ st + class_idx, class_idx); -+ } -+#endif -+ -+ return entity; -+} -+ -+static bool next_queue_may_preempt(struct bfq_data *bfqd) -+{ -+ struct bfq_sched_data *sd = &bfqd->root_group->sched_data; -+ -+ return sd->next_in_service != sd->in_service_entity; -+} -+ -+/* -+ * Get next queue for service. -+ */ -+static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) -+{ -+ struct bfq_entity *entity = NULL; -+ struct bfq_sched_data *sd; -+ struct bfq_queue *bfqq; -+ -+ BUG_ON(bfqd->in_service_queue); -+ -+ if (bfq_tot_busy_queues(bfqd) == 0) -+ return NULL; -+ -+ /* -+ * Traverse the path from the root to the leaf entity to -+ * serve. Set in service all the entities visited along the -+ * way. -+ */ -+ sd = &bfqd->root_group->sched_data; -+ for (; sd ; sd = entity->my_sched_data) { -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ if (entity) { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg(bfqd, bfqg, -+ "lookup in this group"); -+ if (!sd->next_in_service) -+ pr_crit("lookup in this group"); -+ } else { -+ bfq_log_bfqg(bfqd, bfqd->root_group, -+ "lookup in root group"); -+ if (!sd->next_in_service) -+ pr_crit("lookup in root group"); -+ } -+#endif -+ -+ BUG_ON(!sd->next_in_service); -+ -+ /* -+ * WARNING. 
We are about to set the in-service entity -+ * to sd->next_in_service, i.e., to the (cached) value -+ * returned by bfq_lookup_next_entity(sd) the last -+ * time it was invoked, i.e., the last time when the -+ * service order in sd changed as a consequence of the -+ * activation or deactivation of an entity. In this -+ * respect, if we execute bfq_lookup_next_entity(sd) -+ * in this very moment, it may, although with low -+ * probability, yield a different entity than that -+ * pointed to by sd->next_in_service. This rare event -+ * happens in case there was no CLASS_IDLE entity to -+ * serve for sd when bfq_lookup_next_entity(sd) was -+ * invoked for the last time, while there is now one -+ * such entity. -+ * -+ * If the above event happens, then the scheduling of -+ * such entity in CLASS_IDLE is postponed until the -+ * service of the sd->next_in_service entity -+ * finishes. In fact, when the latter is expired, -+ * bfq_lookup_next_entity(sd) gets called again, -+ * exactly to update sd->next_in_service. -+ */ -+ -+ /* Make next_in_service entity become in_service_entity */ -+ entity = sd->next_in_service; -+ sd->in_service_entity = entity; -+ -+ /* -+ * If entity is no longer a candidate for next -+ * service, then it must be extracted from its active -+ * tree, so as to make sure that it won't be -+ * considered when computing next_in_service. See the -+ * comments on the function -+ * bfq_no_longer_next_in_service() for details. -+ */ -+ if (bfq_no_longer_next_in_service(entity)) -+ bfq_active_extract(bfq_entity_service_tree(entity), -+ entity); -+ -+ /* -+ * Even if entity is not to be extracted according to -+ * the above check, a descendant entity may get -+ * extracted in one of the next iterations of this -+ * loop. Such an event could cause a change in -+ * next_in_service for the level of the descendant -+ * entity, and thus possibly back to this level. 
-+ * -+ * However, we cannot perform the resulting needed -+ * update of next_in_service for this level before the -+ * end of the whole loop, because, to know which is -+ * the correct next-to-serve candidate entity for each -+ * level, we need first to find the leaf entity to set -+ * in service. In fact, only after we know which is -+ * the next-to-serve leaf entity, we can discover -+ * whether the parent entity of the leaf entity -+ * becomes the next-to-serve, and so on. -+ */ -+ -+ /* Log some information */ -+ bfqq = bfq_entity_to_bfqq(entity); -+ if (bfqq) -+ bfq_log_bfqq(bfqd, bfqq, -+ "this queue, finish %llu", -+ (((entity->finish>>10)*1000)>>10)>>2); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg(bfqd, bfqg, -+ "this entity, finish %llu", -+ (((entity->finish>>10)*1000)>>10)>>2); -+ } -+#endif -+ -+ } -+ -+ BUG_ON(!entity); -+ bfqq = bfq_entity_to_bfqq(entity); -+ BUG_ON(!bfqq); -+ -+ /* -+ * We can finally update all next-to-serve entities along the -+ * path from the leaf entity just set in service to the root. 
-+ */ -+ for_each_entity(entity) { -+ struct bfq_sched_data *sd = entity->sched_data; -+ -+ if (!bfq_update_next_in_service(sd, NULL, false)) -+ break; -+ } -+ -+ return bfqq; -+} -+ -+static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue; -+ struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity; -+ struct bfq_entity *entity = in_serv_entity; -+ -+#ifndef BFQ_MQ -+ if (bfqd->in_service_bic) { -+ put_io_context(bfqd->in_service_bic->icq.ioc); -+ bfqd->in_service_bic = NULL; -+ } -+#endif -+ -+ bfq_clear_bfqq_wait_request(in_serv_bfqq); -+ hrtimer_try_to_cancel(&bfqd->idle_slice_timer); -+ bfqd->in_service_queue = NULL; -+ -+ /* -+ * When this function is called, all in-service entities have -+ * been properly deactivated or requeued, so we can safely -+ * execute the final step: reset in_service_entity along the -+ * path from entity to the root. -+ */ -+ for_each_entity(entity) -+ entity->sched_data->in_service_entity = NULL; -+ -+ /* -+ * in_serv_entity is no longer in service, so, if it is in no -+ * service tree either, then release the service reference to -+ * the queue it represents (taken with bfq_get_entity). 
-+ */ -+ if (!in_serv_entity->on_st) -+ bfq_put_queue(in_serv_bfqq); -+} -+ -+static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ bool ins_into_idle_tree, bool expiration) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ bfq_deactivate_entity(entity, ins_into_idle_tree, expiration); -+} -+ -+static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity); -+ -+ BUG_ON(bfqq == bfqd->in_service_queue); -+ BUG_ON(entity->tree != &st->active && entity->tree != &st->idle && -+ entity->on_st); -+ -+ bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq), -+ false, false); -+ bfq_clear_bfqq_non_blocking_wait_rq(bfqq); -+} -+ -+static void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ bool expiration) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ bfq_activate_requeue_entity(entity, false, -+ bfqq == bfqd->in_service_queue, expiration); -+} -+ -+static void bfqg_stats_update_dequeue(struct bfq_group *bfqg); -+ -+/* -+ * Called when the bfqq no longer has requests pending, remove it from -+ * the service tree. As a special case, it can be invoked during an -+ * expiration. 
-+ */ -+static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ bool expiration) -+{ -+ BUG_ON(!bfq_bfqq_busy(bfqq)); -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); -+ -+ bfq_log_bfqq(bfqd, bfqq, "del from busy"); -+ -+ bfq_clear_bfqq_busy(bfqq); -+ -+ BUG_ON(bfq_tot_busy_queues(bfqd) == 0); -+ bfqd->busy_queues[bfqq->ioprio_class - 1]--; -+ -+ if (bfqq->wr_coeff > 1) { -+ bfqd->wr_busy_queues--; -+ BUG_ON(bfqd->wr_busy_queues < 0); -+ } -+ -+ bfqg_stats_update_dequeue(bfqq_group(bfqq)); -+ -+ BUG_ON(bfqq->entity.budget < 0); -+ -+ bfq_deactivate_bfqq(bfqd, bfqq, true, expiration); -+ if (!bfqq->dispatched) -+ bfq_weights_tree_remove(bfqd, bfqq); -+} -+ -+/* -+ * Called when an inactive queue receives a new request. -+ */ -+static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ BUG_ON(bfq_bfqq_busy(bfqq)); -+ BUG_ON(bfqq == bfqd->in_service_queue); -+ -+ bfq_log_bfqq(bfqd, bfqq, "add to busy"); -+ -+ bfq_activate_bfqq(bfqd, bfqq); -+ -+ bfq_mark_bfqq_busy(bfqq); -+ bfqd->busy_queues[bfqq->ioprio_class - 1]++; -+ -+ if (!bfqq->dispatched) -+ if (bfqq->wr_coeff == 1) -+ bfq_weights_tree_add(bfqd, bfqq, -+ &bfqd->queue_weights_tree); -+ -+ if (bfqq->wr_coeff > 1) { -+ bfqd->wr_busy_queues++; -+ BUG_ON(bfqd->wr_busy_queues > bfq_tot_busy_queues(bfqd)); -+ } -+ -+} -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -new file mode 100644 -index 000000000000..6da94eef0cf1 ---- /dev/null -+++ b/block/bfq-sq-iosched.c -@@ -0,0 +1,5957 @@ -+/* -+ * Budget Fair Queueing (BFQ) I/O scheduler. 
-+ * -+ * Based on ideas and code from CFQ: -+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> -+ * -+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> -+ * Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org> -+ * -+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ -+ * file. -+ * -+ * BFQ is a proportional-share I/O scheduler, with some extra -+ * low-latency capabilities. BFQ also supports full hierarchical -+ * scheduling through cgroups. Next paragraphs provide an introduction -+ * on BFQ inner workings. Details on BFQ benefits and usage can be -+ * found in Documentation/block/bfq-iosched.txt. -+ * -+ * BFQ is a proportional-share storage-I/O scheduling algorithm based -+ * on the slice-by-slice service scheme of CFQ. But BFQ assigns -+ * budgets, measured in number of sectors, to processes instead of -+ * time slices. The device is not granted to the in-service process -+ * for a given time slice, but until it has exhausted its assigned -+ * budget. This change from the time to the service domain enables BFQ -+ * to distribute the device throughput among processes as desired, -+ * without any distortion due to throughput fluctuations, or to device -+ * internal queueing. BFQ uses an ad hoc internal scheduler, called -+ * B-WF2Q+, to schedule processes according to their budgets. More -+ * precisely, BFQ schedules queues associated with processes. Thanks to -+ * the accurate policy of B-WF2Q+, BFQ can afford to assign high -+ * budgets to I/O-bound processes issuing sequential requests (to -+ * boost the throughput), and yet guarantee a low latency to -+ * interactive and soft real-time applications. 
-+ * -+ * In particular, BFQ schedules I/O so as to achieve the latter goal-- -+ * low latency for interactive and soft real-time applications--if the -+ * low_latency parameter is set (default configuration). To this -+ * purpose, BFQ constantly tries to detect whether the I/O requests in -+ * a bfq_queue come from an interactive or a soft real-time -+ * application. For brevity, in these cases, the queue is said to be -+ * interactive or soft real-time. In both cases, BFQ privileges the -+ * service of the queue, over that of non-interactive and -+ * non-soft-real-time queues. This privileging is performed, mainly, -+ * by raising the weight of the queue. So, for brevity, we call just -+ * weight-raising periods the time periods during which a queue is -+ * privileged, because deemed interactive or soft real-time. -+ * -+ * The detection of soft real-time queues/applications is described in -+ * detail in the comments on the function -+ * bfq_bfqq_softrt_next_start. On the other hand, the detection of an -+ * interactive queue works as follows: a queue is deemed interactive -+ * if it is constantly non empty only for a limited time interval, -+ * after which it does become empty. The queue may be deemed -+ * interactive again (for a limited time), if it restarts being -+ * constantly non empty, provided that this happens only after the -+ * queue has remained empty for a given minimum idle time. -+ * -+ * By default, BFQ computes automatically the above maximum time -+ * interval, i.e., the time interval after which a constantly -+ * non-empty queue stops being deemed interactive. Since a queue is -+ * weight-raised while it is deemed interactive, this maximum time -+ * interval happens to coincide with the (maximum) duration of the -+ * weight-raising for interactive queues. 
-+ * -+ * NOTE: if the main or only goal, with a given device, is to achieve -+ * the maximum-possible throughput at all times, then do switch off -+ * all low-latency heuristics for that device, by setting low_latency -+ * to 0. -+ * -+ * BFQ is described in [1], where also a reference to the initial, -+ * more theoretical paper on BFQ can be found. The interested reader -+ * can find in the latter paper full details on the main algorithm, as -+ * well as formulas of the guarantees and formal proofs of all the -+ * properties. With respect to the version of BFQ presented in these -+ * papers, this implementation adds a few more heuristics, such as the -+ * one that guarantees a low latency to soft real-time applications, -+ * and a hierarchical extension based on H-WF2Q+. -+ * -+ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with -+ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N) -+ * complexity derives from the one introduced with EEVDF in [3]. -+ * -+ * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O -+ * Scheduler", Proceedings of the First Workshop on Mobile System -+ * Technologies (MST-2015), May 2015. -+ * http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf -+ * -+ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf -+ * -+ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing -+ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689, -+ * Oct 1997. -+ * -+ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz -+ * -+ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline -+ * First: A Flexible and Accurate Mechanism for Proportional Share -+ * Resource Allocation,'' technical report. 
-+ * -+ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf -+ */ -+#include <linux/module.h> -+#include <linux/slab.h> -+#include <linux/blkdev.h> -+#include <linux/cgroup.h> -+#include <linux/elevator.h> -+#include <linux/jiffies.h> -+#include <linux/rbtree.h> -+#include <linux/ioprio.h> -+#include "blk.h" -+#include "bfq.h" -+#include "blk-wbt.h" -+ -+/* Expiration time of sync (0) and async (1) requests, in ns. */ -+static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; -+ -+/* Maximum backwards seek, in KiB. */ -+static const int bfq_back_max = (16 * 1024); -+ -+/* Penalty of a backwards seek, in number of sectors. */ -+static const int bfq_back_penalty = 2; -+ -+/* Idling period duration, in ns. */ -+static u32 bfq_slice_idle = (NSEC_PER_SEC / 125); -+ -+/* Minimum number of assigned budgets for which stats are safe to compute. */ -+static const int bfq_stats_min_budgets = 194; -+ -+/* Default maximum budget values, in sectors and number of requests. */ -+static const int bfq_default_max_budget = (16 * 1024); -+ -+/* -+ * When a sync request is dispatched, the queue that contains that -+ * request, and all the ancestor entities of that queue, are charged -+ * with the number of sectors of the request. In contrast, if the -+ * request is async, then the queue and its ancestor entities are -+ * charged with the number of sectors of the request, multiplied by -+ * the factor below. This throttles the bandwidth for async I/O, -+ * w.r.t. sync I/O, and it is done to counter the tendency of async -+ * writes to steal I/O throughput from reads. -+ * -+ * The current value of this parameter is the result of a tuning with -+ * several hardware and software configurations. We tried to find the -+ * lowest value for which writes do not cause noticeable problems to -+ * reads. In fact, the lower this parameter, the stabler I/O control, -+ * in the following respect.
The lower this parameter is, the less -+ * the bandwidth enjoyed by a group decreases -+ * - when the group does writes, w.r.t. when it does reads; -+ * - when other groups do reads, w.r.t. when they do writes. -+ */ -+static const int bfq_async_charge_factor = 3; -+ -+/* Default timeout values, in jiffies, approximating CFQ defaults. */ -+static const int bfq_timeout = (HZ / 8); -+ -+/* -+ * Time limit for merging (see comments in bfq_setup_cooperator). Set -+ * to the slowest value that, in our tests, proved to be effective in -+ * removing false positives, while not causing true positives to miss -+ * queue merging. -+ * -+ * As can be deduced from the low time limit below, queue merging, if -+ * successful, happens at the very beginning of the I/O of the involved -+ * cooperating processes, as a consequence of the arrival of the very -+ * first requests from each cooperator. After that, there is very -+ * little chance to find cooperators. -+ */ -+static const unsigned long bfq_merge_time_limit = HZ/10; -+ -+#define MAX_LENGTH_REASON_NAME 25 -+ -+static const char reason_name[][MAX_LENGTH_REASON_NAME] = {"TOO_IDLE", -+"BUDGET_TIMEOUT", "BUDGET_EXHAUSTED", "NO_MORE_REQUESTS", -+"PREEMPTED"}; -+ -+static struct kmem_cache *bfq_pool; -+ -+/* Below this threshold (in ns), we consider thinktime immediate. */ -+#define BFQ_MIN_TT (2 * NSEC_PER_MSEC) -+ -+/* hw_tag detection: parallel requests threshold and min samples needed.
*/ -+#define BFQ_HW_QUEUE_THRESHOLD 3 -+#define BFQ_HW_QUEUE_SAMPLES 32 -+ -+#define BFQQ_SEEK_THR (sector_t)(8 * 100) -+#define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32) -+#define BFQ_RQ_SEEKY(bfqd, last_pos, rq) \ -+ (get_sdist(last_pos, rq) > \ -+ BFQQ_SEEK_THR && \ -+ (!blk_queue_nonrot(bfqd->queue) || \ -+ blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT)) -+#define BFQQ_CLOSE_THR (sector_t)(8 * 1024) -+#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 19) -+ -+/* Min number of samples required to perform peak-rate update */ -+#define BFQ_RATE_MIN_SAMPLES 32 -+/* Min observation time interval required to perform a peak-rate update (ns) */ -+#define BFQ_RATE_MIN_INTERVAL (300*NSEC_PER_MSEC) -+/* Target observation time interval for a peak-rate update (ns) */ -+#define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC -+ -+/* -+ * Shift used for peak-rate fixed precision calculations. -+ * With -+ * - the current shift: 16 positions -+ * - the current type used to store rate: u32 -+ * - the current unit of measure for rate: [sectors/usec], or, more precisely, -+ * [(sectors/usec) / 2^BFQ_RATE_SHIFT] to take into account the shift, -+ * the range of rates that can be stored is -+ * [1 / 2^BFQ_RATE_SHIFT, 2^(32 - BFQ_RATE_SHIFT)] sectors/usec = -+ * [1 / 2^16, 2^16] sectors/usec = [15e-6, 65536] sectors/usec = -+ * [15, 65G] sectors/sec -+ * Which, assuming a sector size of 512B, corresponds to a range of -+ * [7.5K, 33T] B/sec -+ */ -+#define BFQ_RATE_SHIFT 16 -+ -+/* -+ * When configured for computing the duration of the weight-raising -+ * for interactive queues automatically (see the comments at the -+ * beginning of this file), BFQ does it using the following formula: -+ * duration = (ref_rate / r) * ref_wr_duration, -+ * where r is the peak rate of the device, and ref_rate and -+ * ref_wr_duration are two reference parameters. 
In particular, -+ * ref_rate is the peak rate of the reference storage device (see -+ * below), and ref_wr_duration is about the maximum time needed, with -+ * BFQ and while reading two files in parallel, to load typical large -+ * applications on the reference device (see the comments on -+ * max_service_from_wr below, for more details on how ref_wr_duration -+ * is obtained). In practice, the slower/faster the device at hand -+ * is, the more/less it takes to load applications with respect to the -+ * reference device. Accordingly, the longer/shorter BFQ grants -+ * weight raising to interactive applications. -+ * -+ * BFQ uses two different reference pairs (ref_rate, ref_wr_duration), -+ * depending on whether the device is rotational or non-rotational. -+ * -+ * In the following definitions, ref_rate[0] and ref_wr_duration[0] -+ * are the reference values for a rotational device, whereas -+ * ref_rate[1] and ref_wr_duration[1] are the reference values for a -+ * non-rotational device. The reference rates are not the actual peak -+ * rates of the devices used as a reference, but slightly lower -+ * values. The reason for using slightly lower values is that the -+ * peak-rate estimator tends to yield slightly lower values than the -+ * actual peak rate (it can yield the actual peak rate only if there -+ * is only one process doing I/O, and the process does sequential -+ * I/O). -+ * -+ * The reference peak rates are measured in sectors/usec, left-shifted -+ * by BFQ_RATE_SHIFT. -+ */ -+static int ref_rate[2] = {14000, 33000}; -+/* -+ * To improve readability, a conversion function is used to initialize -+ * the following array, which entails that the array can be -+ * initialized only in a function. -+ */ -+static int ref_wr_duration[2]; -+ -+/* -+ * BFQ uses the above-detailed, time-based weight-raising mechanism to -+ * privilege interactive tasks. 
This mechanism is vulnerable to the -+ * following false positives: I/O-bound applications that will go on -+ * doing I/O for much longer than the duration of weight -+ * raising. These applications have basically no benefit from being -+ * weight-raised at the beginning of their I/O. On the opposite end, -+ * while being weight-raised, these applications -+ * a) unjustly steal throughput from applications that may actually need -+ * low latency; -+ * b) make BFQ uselessly perform device idling; device idling results -+ * in loss of device throughput with most flash-based storage, and may -+ * increase latencies when used purposelessly. -+ * -+ * BFQ tries to reduce these problems, by adopting the following -+ * countermeasure. To introduce this countermeasure, we need first to -+ * finish explaining how the duration of weight-raising for -+ * interactive tasks is computed. -+ * -+ * For a bfq_queue deemed as interactive, the duration of weight -+ * raising is dynamically adjusted, as a function of the estimated -+ * peak rate of the device, so as to be equal to the time needed to -+ * execute the 'largest' interactive task we benchmarked so far. By -+ * largest task, we mean the task for which each involved process has -+ * to do more I/O than for any of the other tasks we benchmarked. This -+ * reference interactive task is the start-up of LibreOffice Writer, -+ * and in this task each process/bfq_queue needs to have at most ~110K -+ * sectors transferred. -+ * -+ * This last piece of information enables BFQ to reduce the actual -+ * duration of weight-raising for at least one class of I/O-bound -+ * applications: those doing sequential or quasi-sequential I/O. An -+ * example is file copy.
In fact, once started, the main I/O-bound -+ * processes of these applications usually consume the above 110K -+ * sectors in much less time than the processes of an application that -+ * is starting, because these I/O-bound processes will greedily devote -+ * almost all their CPU cycles only to their target, -+ * throughput-friendly I/O operations. This is even more true if BFQ -+ * happens to be underestimating the device peak rate, and thus -+ * overestimating the duration of weight raising. But, according to -+ * our measurements, once transferred 110K sectors, these processes -+ * have no right to be weight-raised any longer. -+ * -+ * Basing on the last consideration, BFQ ends weight-raising for a -+ * bfq_queue if the latter happens to have received an amount of -+ * service at least equal to the following constant. The constant is -+ * set to slightly more than 110K, to have a minimum safety margin. -+ * -+ * This early ending of weight-raising reduces the amount of time -+ * during which interactive false positives cause the two problems -+ * described at the beginning of these comments. -+ */ -+static const unsigned long max_service_from_wr = 120000; -+ -+#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ -+ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) -+ -+#define RQ_BIC(rq) icq_to_bic((rq)->elv.priv[0]) -+#define RQ_BFQQ(rq) ((rq)->elv.priv[1]) -+ -+static void bfq_schedule_dispatch(struct bfq_data *bfqd); -+ -+#include "bfq-ioc.c" -+#include "bfq-sched.c" -+#include "bfq-cgroup-included.c" -+ -+#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE) -+#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT) -+ -+#define bfq_sample_valid(samples) ((samples) > 80) -+ -+/* -+ * Scheduler run of queue, if there are requests pending and no one in the -+ * driver that will restart queueing. 
-+ */ -+static void bfq_schedule_dispatch(struct bfq_data *bfqd) -+{ -+ if (bfqd->queued != 0) { -+ bfq_log(bfqd, ""); -+ kblockd_schedule_work(&bfqd->unplug_work); -+ } -+} -+ -+/* -+ * Lifted from AS - choose which of rq1 and rq2 that is best served now. -+ * We choose the request that is closest to the head right now. Distance -+ * behind the head is penalized and only allowed to a certain extent. -+ */ -+static struct request *bfq_choose_req(struct bfq_data *bfqd, -+ struct request *rq1, -+ struct request *rq2, -+ sector_t last) -+{ -+ sector_t s1, s2, d1 = 0, d2 = 0; -+ unsigned long back_max; -+#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */ -+#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */ -+ unsigned int wrap = 0; /* bit mask: requests behind the disk head? */ -+ -+ if (!rq1 || rq1 == rq2) -+ return rq2; -+ if (!rq2) -+ return rq1; -+ -+ if (rq_is_sync(rq1) && !rq_is_sync(rq2)) -+ return rq1; -+ else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) -+ return rq2; -+ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) -+ return rq1; -+ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META)) -+ return rq2; -+ -+ s1 = blk_rq_pos(rq1); -+ s2 = blk_rq_pos(rq2); -+ -+ /* -+ * By definition, 1KiB is 2 sectors. -+ */ -+ back_max = bfqd->bfq_back_max * 2; -+ -+ /* -+ * Strict one way elevator _except_ in the case where we allow -+ * short backward seeks which are biased as twice the cost of a -+ * similar forward seek. -+ */ -+ if (s1 >= last) -+ d1 = s1 - last; -+ else if (s1 + back_max >= last) -+ d1 = (last - s1) * bfqd->bfq_back_penalty; -+ else -+ wrap |= BFQ_RQ1_WRAP; -+ -+ if (s2 >= last) -+ d2 = s2 - last; -+ else if (s2 + back_max >= last) -+ d2 = (last - s2) * bfqd->bfq_back_penalty; -+ else -+ wrap |= BFQ_RQ2_WRAP; -+ -+ /* Found required data */ -+ -+ /* -+ * By doing switch() on the bit mask "wrap" we avoid having to -+ * check two variables for all permutations: --> faster!
-+ */ -+ switch (wrap) { -+ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ -+ if (d1 < d2) -+ return rq1; -+ else if (d2 < d1) -+ return rq2; -+ -+ if (s1 >= s2) -+ return rq1; -+ else -+ return rq2; -+ -+ case BFQ_RQ2_WRAP: -+ return rq1; -+ case BFQ_RQ1_WRAP: -+ return rq2; -+ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */ -+ default: -+ /* -+ * Since both rqs are wrapped, -+ * start with the one that's further behind head -+ * (--> only *one* back seek required), -+ * since back seek takes more time than forward. -+ */ -+ if (s1 <= s2) -+ return rq1; -+ else -+ return rq2; -+ } -+} -+ -+static struct bfq_queue * -+bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, -+ sector_t sector, struct rb_node **ret_parent, -+ struct rb_node ***rb_link) -+{ -+ struct rb_node **p, *parent; -+ struct bfq_queue *bfqq = NULL; -+ -+ parent = NULL; -+ p = &root->rb_node; -+ while (*p) { -+ struct rb_node **n; -+ -+ parent = *p; -+ bfqq = rb_entry(parent, struct bfq_queue, pos_node); -+ -+ /* -+ * Sort strictly based on sector. Smallest to the left, -+ * largest to the right. -+ */ -+ if (sector > blk_rq_pos(bfqq->next_rq)) -+ n = &(*p)->rb_right; -+ else if (sector < blk_rq_pos(bfqq->next_rq)) -+ n = &(*p)->rb_left; -+ else -+ break; -+ p = n; -+ bfqq = NULL; -+ } -+ -+ *ret_parent = parent; -+ if (rb_link) -+ *rb_link = p; -+ -+ bfq_log(bfqd, "%llu: returning %d", -+ (unsigned long long) sector, -+ bfqq ? 
bfqq->pid : 0); -+ -+ return bfqq; -+} -+ -+static bool bfq_too_late_for_merging(struct bfq_queue *bfqq) -+{ -+ return bfqq->service_from_backlogged > 0 && -+ time_is_before_jiffies(bfqq->first_IO_time + -+ bfq_merge_time_limit); -+} -+ -+static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct rb_node **p, *parent; -+ struct bfq_queue *__bfqq; -+ -+ if (bfqq->pos_root) { -+ rb_erase(&bfqq->pos_node, bfqq->pos_root); -+ bfqq->pos_root = NULL; -+ } -+ -+ /* -+ * bfqq cannot be merged any longer (see comments in -+ * bfq_setup_cooperator): no point in adding bfqq into the -+ * position tree. -+ */ -+ if (bfq_too_late_for_merging(bfqq)) -+ return; -+ -+ if (bfq_class_idle(bfqq)) -+ return; -+ if (!bfqq->next_rq) -+ return; -+ -+ bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, -+ blk_rq_pos(bfqq->next_rq), &parent, &p); -+ if (!__bfqq) { -+ rb_link_node(&bfqq->pos_node, parent, p); -+ rb_insert_color(&bfqq->pos_node, bfqq->pos_root); -+ } else -+ bfqq->pos_root = NULL; -+} -+ -+/* -+ * The following function returns true if every queue must receive the -+ * same share of the throughput (this condition is used when deciding -+ * whether idling may be disabled, see the comments in the function -+ * bfq_better_to_idle()). -+ * -+ * Such a scenario occurs when: -+ * 1) all active queues have the same weight, -+ * 2) all active queues belong to the same I/O-priority class, -+ * 3) all active groups at the same level in the groups tree have the same -+ * weight, -+ * 4) all active groups at the same level in the groups tree have the same -+ * number of children. -+ * -+ * Unfortunately, keeping the necessary state for evaluating exactly -+ * the last two symmetry sub-conditions above would be quite complex -+ * and time consuming. 
Therefore this function evaluates, instead, -+ * only the following stronger three sub-conditions, for which it is -+ * much easier to maintain the needed state: -+ * 1) all active queues have the same weight, -+ * 2) all active queues belong to the same I/O-priority class, -+ * 3) there are no active groups. -+ * In particular, the last condition is always true if hierarchical -+ * support or the cgroups interface are not enabled, thus no state -+ * needs to be maintained in this case. -+ */ -+static bool bfq_symmetric_scenario(struct bfq_data *bfqd) -+{ -+ /* -+ * For queue weights to differ, queue_weights_tree must contain -+ * at least two nodes. -+ */ -+ bool varied_queue_weights = !RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && -+ (bfqd->queue_weights_tree.rb_node->rb_left || -+ bfqd->queue_weights_tree.rb_node->rb_right); -+ -+ bool multiple_classes_busy = -+ (bfqd->busy_queues[0] && bfqd->busy_queues[1]) || -+ (bfqd->busy_queues[0] && bfqd->busy_queues[2]) || -+ (bfqd->busy_queues[1] && bfqd->busy_queues[2]); -+ -+ bfq_log(bfqd, "varied_queue_weights %d mul_classes %d", -+ varied_queue_weights, multiple_classes_busy); -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ bfq_log(bfqd, "num_groups_with_pending_reqs %u", -+ bfqd->num_groups_with_pending_reqs); -+#endif -+ -+ return !(varied_queue_weights || multiple_classes_busy -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ || bfqd->num_groups_with_pending_reqs > 0 -+#endif -+ ); -+} -+ -+/* -+ * If the weight-counter tree passed as input contains no counter for -+ * the weight of the input queue, then add that counter; otherwise just -+ * increment the existing counter. -+ * -+ * Note that weight-counter trees contain few nodes in mostly symmetric -+ * scenarios. For example, if all queues have the same weight, then the -+ * weight-counter tree for the queues may contain at most one node. -+ * This holds even if low_latency is on, because weight-raised queues -+ * are not inserted in the tree. 
-+ * In most scenarios, the rate at which nodes are created/destroyed -+ * should be low too. -+ */ -+static void bfq_weights_tree_add(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ struct rb_root *root) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ struct rb_node **new = &(root->rb_node), *parent = NULL; -+ -+ /* -+ * Do not insert if the queue is already associated with a -+ * counter, which happens if: -+ * 1) a request arrival has caused the queue to become both -+ * non-weight-raised, and hence change its weight, and -+ * backlogged; in this respect, each of the two events -+ * causes an invocation of this function, -+ * 2) this is the invocation of this function caused by the -+ * second event. This second invocation is actually useless, -+ * and we handle this fact by exiting immediately. More -+ * efficient or clearer solutions might possibly be adopted. -+ */ -+ if (bfqq->weight_counter) -+ return; -+ -+ while (*new) { -+ struct bfq_weight_counter *__counter = container_of(*new, -+ struct bfq_weight_counter, -+ weights_node); -+ parent = *new; -+ -+ if (entity->weight == __counter->weight) { -+ bfqq->weight_counter = __counter; -+ goto inc_counter; -+ } -+ if (entity->weight < __counter->weight) -+ new = &((*new)->rb_left); -+ else -+ new = &((*new)->rb_right); -+ } -+ -+ bfqq->weight_counter = kzalloc(sizeof(struct bfq_weight_counter), -+ GFP_ATOMIC); -+ -+ /* -+ * In the unlucky event of an allocation failure, we just -+ * exit. This will cause the weight of queue to not be -+ * considered in bfq_symmetric_scenario, which, in its turn, -+ * causes the scenario to be deemed wrongly symmetric in case -+ * bfqq's weight would have been the only weight making the -+ * scenario asymmetric. On the bright side, no unbalance will -+ * however occur when bfqq becomes inactive again (the -+ * invocation of this function is triggered by an activation -+ * of queue). In fact, bfq_weights_tree_remove does nothing -+ * if !bfqq->weight_counter. 
-+ */ -+ if (unlikely(!bfqq->weight_counter)) -+ return; -+ -+ bfqq->weight_counter->weight = entity->weight; -+ rb_link_node(&bfqq->weight_counter->weights_node, parent, new); -+ rb_insert_color(&bfqq->weight_counter->weights_node, root); -+ -+inc_counter: -+ bfqq->weight_counter->num_active++; -+ bfqq->ref++; -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "refs %d weight %d symmetric %d", -+ bfqq->ref, -+ entity->weight, -+ bfq_symmetric_scenario(bfqd)); -+} -+ -+/* -+ * Decrement the weight counter associated with the queue, and, if the -+ * counter reaches 0, remove the counter from the tree. -+ * See the comments to the function bfq_weights_tree_add() for considerations -+ * about overhead. -+ */ -+static void __bfq_weights_tree_remove(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ struct rb_root *root) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ if (!bfqq->weight_counter) -+ return; -+ -+ BUG_ON(RB_EMPTY_ROOT(root)); -+ BUG_ON(bfqq->weight_counter->weight != entity->weight); -+ -+ BUG_ON(!bfqq->weight_counter->num_active); -+ bfqq->weight_counter->num_active--; -+ -+ if (bfqq->weight_counter->num_active > 0) -+ goto reset_entity_pointer; -+ -+ rb_erase(&bfqq->weight_counter->weights_node, root); -+ kfree(bfqq->weight_counter); -+ -+reset_entity_pointer: -+ bfqq->weight_counter = NULL; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "refs %d weight %d symmetric %d", -+ bfqq->ref, -+ entity->weight, -+ bfq_symmetric_scenario(bfqd)); -+ bfq_put_queue(bfqq); -+} -+ -+/* -+ * Invoke __bfq_weights_tree_remove on bfqq and decrement the number -+ * of active groups for each queue's inactive parent entity. -+ */ -+static void bfq_weights_tree_remove(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = bfqq->entity.parent; -+ -+ for_each_entity(entity) { -+ struct bfq_sched_data *sd = entity->my_sched_data; -+ -+ BUG_ON(entity->sched_data == NULL); /* -+ * It would mean -+ * that this is -+ * the root group. 
-+ */ -+ -+ if (sd->next_in_service || sd->in_service_entity) { -+ BUG_ON(!entity->in_groups_with_pending_reqs); -+ /* -+ * entity is still active, because either -+ * next_in_service or in_service_entity is not -+ * NULL (see the comments on the definition of -+ * next_in_service for details on why -+ * in_service_entity must be checked too). -+ * -+ * As a consequence, its parent entities are -+ * active as well, and thus this loop must -+ * stop here. -+ */ -+ break; -+ } -+ -+ BUG_ON(!bfqd->num_groups_with_pending_reqs && -+ entity->in_groups_with_pending_reqs); -+ /* -+ * The decrement of num_groups_with_pending_reqs is -+ * not performed immediately upon the deactivation of -+ * entity, but it is delayed to when it also happens -+ * that the first leaf descendant bfqq of entity gets -+ * all its pending requests completed. The following -+ * instructions perform this delayed decrement, if -+ * needed. See the comments on -+ * num_groups_with_pending_reqs for details. -+ */ -+ if (entity->in_groups_with_pending_reqs) { -+ entity->in_groups_with_pending_reqs = false; -+ bfqd->num_groups_with_pending_reqs--; -+ } -+ bfq_log_bfqq(bfqd, bfqq, "num_groups_with_pending_reqs %u", -+ bfqd->num_groups_with_pending_reqs); -+ } -+ -+ /* -+ * Next function is invoked last, because it causes bfqq to be -+ * freed if the following holds: bfqq is not in service and -+ * has no dispatched request. DO NOT use bfqq after the next -+ * function invocation. -+ */ -+ __bfq_weights_tree_remove(bfqd, bfqq, -+ &bfqd->queue_weights_tree); -+} -+ -+/* -+ * Return expired entry, or NULL to just start from scratch in rbtree. 
-+ */ -+static struct request *bfq_check_fifo(struct bfq_queue *bfqq, -+ struct request *last) -+{ -+ struct request *rq; -+ -+ if (bfq_bfqq_fifo_expire(bfqq)) -+ return NULL; -+ -+ bfq_mark_bfqq_fifo_expire(bfqq); -+ -+ rq = rq_entry_fifo(bfqq->fifo.next); -+ -+ if (rq == last || ktime_get_ns() < rq->fifo_time) -+ return NULL; -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "returned %p", rq); -+ BUG_ON(RB_EMPTY_NODE(&rq->rb_node)); -+ return rq; -+} -+ -+static struct request *bfq_find_next_rq(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ struct request *last) -+{ -+ struct rb_node *rbnext = rb_next(&last->rb_node); -+ struct rb_node *rbprev = rb_prev(&last->rb_node); -+ struct request *next, *prev = NULL; -+ -+ BUG_ON(list_empty(&bfqq->fifo)); -+ -+ /* Follow expired path, else get first next available. */ -+ next = bfq_check_fifo(bfqq, last); -+ if (next) { -+ BUG_ON(next == last); -+ return next; -+ } -+ -+ BUG_ON(RB_EMPTY_NODE(&last->rb_node)); -+ -+ if (rbprev) -+ prev = rb_entry_rq(rbprev); -+ -+ if (rbnext) -+ next = rb_entry_rq(rbnext); -+ else { -+ rbnext = rb_first(&bfqq->sort_list); -+ if (rbnext && rbnext != &last->rb_node) -+ next = rb_entry_rq(rbnext); -+ } -+ -+ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last)); -+} -+ -+/* see the definition of bfq_async_charge_factor for details */ -+static unsigned long bfq_serv_to_charge(struct request *rq, -+ struct bfq_queue *bfqq) -+{ -+ if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1 || -+ !bfq_symmetric_scenario(bfqq->bfqd)) -+ return blk_rq_sectors(rq); -+ -+ return blk_rq_sectors(rq) * bfq_async_charge_factor; -+} -+ -+/** -+ * bfq_updated_next_req - update the queue after a new next_rq selection. -+ * @bfqd: the device data the queue belongs to. -+ * @bfqq: the queue to update. -+ * -+ * If the first request of a queue changes we make sure that the queue -+ * has enough budget to serve at least its first request (if the -+ * request has grown). 
We do this because if the queue has not enough -+ * budget for its first request, it has to go through two dispatch -+ * rounds to actually get it dispatched. -+ */ -+static void bfq_updated_next_req(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity); -+ struct request *next_rq = bfqq->next_rq; -+ unsigned long new_budget; -+ -+ if (!next_rq) -+ return; -+ -+ if (bfqq == bfqd->in_service_queue) -+ /* -+ * In order not to break guarantees, budgets cannot be -+ * changed after an entity has been selected. -+ */ -+ return; -+ -+ BUG_ON(entity->tree != &st->active); -+ BUG_ON(entity == entity->sched_data->in_service_entity); -+ -+ new_budget = max_t(unsigned long, -+ max_t(unsigned long, bfqq->max_budget, -+ bfq_serv_to_charge(next_rq, bfqq)), -+ entity->service); -+ if (entity->budget != new_budget) { -+ entity->budget = new_budget; -+ bfq_log_bfqq(bfqd, bfqq, "new budget %lu", -+ new_budget); -+ bfq_requeue_bfqq(bfqd, bfqq, false); -+ } -+} -+ -+static unsigned int bfq_wr_duration(struct bfq_data *bfqd) -+{ -+ u64 dur; -+ -+ if (bfqd->bfq_wr_max_time > 0) -+ return bfqd->bfq_wr_max_time; -+ -+ dur = bfqd->rate_dur_prod; -+ do_div(dur, bfqd->peak_rate); -+ -+ /* -+ * Limit duration between 3 and 25 seconds. The upper limit -+ * has been conservatively set after the following worst case: -+ * on a QEMU/KVM virtual machine -+ * - running in a slow PC -+ * - with a virtual disk stacked on a slow low-end 5400rpm HDD -+ * - serving a heavy I/O workload, such as the sequential reading -+ * of several files -+ * mplayer took 23 seconds to start, if constantly weight-raised. 
-+ * -+ * As for higher values than that accomodating the above bad -+ * scenario, tests show that higher values would often yield -+ * the opposite of the desired result, i.e., would worsen -+ * responsiveness by allowing non-interactive applications to -+ * preserve weight raising for too long. -+ * -+ * On the other end, lower values than 3 seconds make it -+ * difficult for most interactive tasks to complete their jobs -+ * before weight-raising finishes. -+ */ -+ return clamp_val(dur, msecs_to_jiffies(3000), msecs_to_jiffies(25000)); -+} -+ -+/* switch back from soft real-time to interactive weight raising */ -+static void switch_back_to_interactive_wr(struct bfq_queue *bfqq, -+ struct bfq_data *bfqd) -+{ -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ bfqq->last_wr_start_finish = bfqq->wr_start_at_switch_to_srt; -+} -+ -+static void -+bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, -+ struct bfq_io_cq *bic, bool bfq_already_existing) -+{ -+ unsigned int old_wr_coeff; -+ bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq); -+ -+ if (bic->saved_has_short_ttime) -+ bfq_mark_bfqq_has_short_ttime(bfqq); -+ else -+ bfq_clear_bfqq_has_short_ttime(bfqq); -+ -+ if (bic->saved_IO_bound) -+ bfq_mark_bfqq_IO_bound(bfqq); -+ else -+ bfq_clear_bfqq_IO_bound(bfqq); -+ -+ if (unlikely(busy)) -+ old_wr_coeff = bfqq->wr_coeff; -+ -+ bfqq->wr_coeff = bic->saved_wr_coeff; -+ bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; -+ BUG_ON(time_is_after_jiffies(bfqq->wr_start_at_switch_to_srt)); -+ bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish; -+ bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time; -+ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "bic %p wr_coeff %d start_finish %lu max_time %lu", -+ bic, bfqq->wr_coeff, bfqq->last_wr_start_finish, -+ bfqq->wr_cur_max_time); -+ -+ if (bfqq->wr_coeff > 1 && 
(bfq_bfqq_in_large_burst(bfqq) || -+ time_is_before_jiffies(bfqq->last_wr_start_finish + -+ bfqq->wr_cur_max_time))) { -+ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && -+ !bfq_bfqq_in_large_burst(bfqq) && -+ time_is_after_eq_jiffies(bfqq->wr_start_at_switch_to_srt + -+ bfq_wr_duration(bfqd))) { -+ switch_back_to_interactive_wr(bfqq, bfqd); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "switching back to interactive"); -+ } else { -+ bfqq->wr_coeff = 1; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "switching off wr (%lu + %lu < %lu)", -+ bfqq->last_wr_start_finish, bfqq->wr_cur_max_time, -+ jiffies); -+ } -+ } -+ -+ /* make sure weight will be updated, however we got here */ -+ bfqq->entity.prio_changed = 1; -+ -+ if (likely(!busy)) -+ return; -+ -+ if (old_wr_coeff == 1 && bfqq->wr_coeff > 1) { -+ bfqd->wr_busy_queues++; -+ BUG_ON(bfqd->wr_busy_queues > bfq_tot_busy_queues(bfqd)); -+ } else if (old_wr_coeff > 1 && bfqq->wr_coeff == 1) { -+ bfqd->wr_busy_queues--; -+ BUG_ON(bfqd->wr_busy_queues < 0); -+ } -+} -+ -+static int bfqq_process_refs(struct bfq_queue *bfqq) -+{ -+ int process_refs, io_refs; -+ -+ lockdep_assert_held(bfqq->bfqd->queue->queue_lock); -+ -+ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; -+ process_refs = bfqq->ref - io_refs - bfqq->entity.on_st - -+ (bfqq->weight_counter != NULL); -+ BUG_ON(process_refs < 0); -+ return process_refs; -+} -+ -+/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */ -+static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct bfq_queue *item; -+ struct hlist_node *n; -+ -+ hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node) -+ hlist_del_init(&item->burst_list_node); -+ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); -+ bfqd->burst_size = 1; -+ bfqd->burst_parent_entity = bfqq->entity.parent; -+} -+ -+/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */ -+static void bfq_add_to_burst(struct bfq_data 
*bfqd, struct bfq_queue *bfqq) -+{ -+ /* Increment burst size to take into account also bfqq */ -+ bfqd->burst_size++; -+ -+ bfq_log_bfqq(bfqd, bfqq, "%d", bfqd->burst_size); -+ -+ BUG_ON(bfqd->burst_size > bfqd->bfq_large_burst_thresh); -+ -+ if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) { -+ struct bfq_queue *pos, *bfqq_item; -+ struct hlist_node *n; -+ -+ /* -+ * Enough queues have been activated shortly after each -+ * other to consider this burst as large. -+ */ -+ bfqd->large_burst = true; -+ bfq_log_bfqq(bfqd, bfqq, "large burst started"); -+ -+ /* -+ * We can now mark all queues in the burst list as -+ * belonging to a large burst. -+ */ -+ hlist_for_each_entry(bfqq_item, &bfqd->burst_list, -+ burst_list_node) { -+ bfq_mark_bfqq_in_large_burst(bfqq_item); -+ bfq_log_bfqq(bfqd, bfqq_item, "marked in large burst"); -+ } -+ bfq_mark_bfqq_in_large_burst(bfqq); -+ bfq_log_bfqq(bfqd, bfqq, "marked in large burst"); -+ -+ /* -+ * From now on, and until the current burst finishes, any -+ * new queue being activated shortly after the last queue -+ * was inserted in the burst can be immediately marked as -+ * belonging to a large burst. So the burst list is not -+ * needed any more. Remove it. -+ */ -+ hlist_for_each_entry_safe(pos, n, &bfqd->burst_list, -+ burst_list_node) -+ hlist_del_init(&pos->burst_list_node); -+ } else /* -+ * Burst not yet large: add bfqq to the burst list. Do -+ * not increment the ref counter for bfqq, because bfqq -+ * is removed from the burst list before freeing bfqq -+ * in put_queue. -+ */ -+ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); -+} -+ -+/* -+ * If many queues belonging to the same group happen to be created -+ * shortly after each other, then the processes associated with these -+ * queues have typically a common goal. In particular, bursts of queue -+ * creations are usually caused by services or applications that spawn -+ * many parallel threads/processes. 
Examples are systemd during boot, -+ * or git grep. To help these processes get their job done as soon as -+ * possible, it is usually better to not grant either weight-raising -+ * or device idling to their queues. -+ * -+ * In this comment we describe, firstly, the reasons why this fact -+ * holds, and, secondly, the next function, which implements the main -+ * steps needed to properly mark these queues so that they can then be -+ * treated in a different way. -+ * -+ * The above services or applications benefit mostly from a high -+ * throughput: the quicker the requests of the activated queues are -+ * cumulatively served, the sooner the target job of these queues gets -+ * completed. As a consequence, weight-raising any of these queues, -+ * which also implies idling the device for it, is almost always -+ * counterproductive. In most cases it just lowers throughput. -+ * -+ * On the other hand, a burst of queue creations may also be caused by -+ * the start of an application that does not consist of a lot of -+ * parallel I/O-bound threads. In fact, with a complex application, -+ * several short processes may need to be executed to start up the -+ * application. In this respect, to start an application as quickly as -+ * possible, the best thing to do is in any case to prioritize the I/O -+ * related to the application over all other -+ * I/O. Therefore, the best strategy to start as quickly as possible -+ * an application that causes a burst of queue creations is to -+ * weight-raise all the queues created during the burst. This is the -+ * exact opposite of the best strategy for the other type of bursts. -+ * -+ * In the end, to take the best action for each of the two cases, the -+ * two types of bursts need to be distinguished. Fortunately, this -+ * seems relatively easy, by looking at the sizes of the bursts.
In -+ * particular, we found a threshold such that only bursts with a -+ * larger size than that threshold are apparently caused by -+ * services or commands such as systemd or git grep. For brevity, -+ * hereafter we simply call these bursts 'large'. BFQ *does not* -+ * weight-raise queues whose creation occurs in a large burst. In -+ * addition, for each of these queues BFQ performs or does not perform -+ * idling depending on which choice boosts the throughput more. The -+ * exact choice depends on the device and request pattern at -+ * hand. -+ * -+ * Unfortunately, false positives may occur while an interactive task -+ * is starting (e.g., an application is being started). The -+ * consequence is that the queues associated with the task do not -+ * enjoy weight raising as expected. Fortunately these false positives -+ * are very rare. They typically occur if some service happens to -+ * start doing I/O exactly when the interactive task starts. -+ * -+ * Returning to the next function: it implements all the steps -+ * needed to detect the occurrence of a large burst and to properly -+ * mark all the queues belonging to it (so that they can then be -+ * treated in a different way). This goal is achieved by maintaining a -+ * "burst list" that holds, temporarily, the queues that belong to the -+ * burst in progress. The list is then used to mark these queues as -+ * belonging to a large burst if the burst does become large. The main -+ * steps are the following. -+ * -+ * . when the very first queue is created, the queue is inserted into the -+ * list (as it could be the first queue in a possible burst) -+ * -+ * . if the current burst has not yet become large, and a queue Q that does -+ * not yet belong to the burst is activated shortly after the last time -+ * at which a new queue entered the burst list, then the function appends -+ * Q to the burst list -+ * -+ * .
if, as a consequence of the previous step, the burst size reaches -+ * the large-burst threshold, then -+ * -+ * . all the queues in the burst list are marked as belonging to a -+ * large burst -+ * -+ * . the burst list is deleted; in fact, the burst list already served -+ * its purpose (keeping temporarily track of the queues in a burst, -+ * so as to be able to mark them as belonging to a large burst in the -+ * previous sub-step), and now is not needed any more -+ * -+ * . the device enters a large-burst mode -+ * -+ * . if a queue Q that does not belong to the burst is created while -+ * the device is in large-burst mode and shortly after the last time -+ * at which a queue either entered the burst list or was marked as -+ * belonging to the current large burst, then Q is immediately marked -+ * as belonging to a large burst. -+ * -+ * . if a queue Q that does not belong to the burst is created a while -+ * later, i.e., not shortly after, than the last time at which a queue -+ * either entered the burst list or was marked as belonging to the -+ * current large burst, then the current burst is deemed as finished and: -+ * -+ * . the large-burst mode is reset if set -+ * -+ * . the burst list is emptied -+ * -+ * . Q is inserted in the burst list, as Q may be the first queue -+ * in a possible new burst (then the burst list contains just Q -+ * after this step). -+ */ -+static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ /* -+ * If bfqq is already in the burst list or is part of a large -+ * burst, or finally has just been split, then there is -+ * nothing else to do. 
-+ */ -+ if (!hlist_unhashed(&bfqq->burst_list_node) || -+ bfq_bfqq_in_large_burst(bfqq) || -+ time_is_after_eq_jiffies(bfqq->split_time + -+ msecs_to_jiffies(10))) -+ return; -+ -+ /* -+ * If bfqq's creation happens late enough, or bfqq belongs to -+ * a different group than the burst group, then the current -+ * burst is finished, and related data structures must be -+ * reset. -+ * -+ * In this respect, consider the special case where bfqq is -+ * the very first queue created after BFQ is selected for this -+ * device. In this case, last_ins_in_burst and -+ * burst_parent_entity are not yet significant when we get -+ * here. But it is easy to verify that, whether or not the -+ * following condition is true, bfqq will end up being -+ * inserted into the burst list. In particular the list will -+ * happen to contain only bfqq. And this is exactly what has -+ * to happen, as bfqq may be the first queue of the first -+ * burst. -+ */ -+ if (time_is_before_jiffies(bfqd->last_ins_in_burst + -+ bfqd->bfq_burst_interval) || -+ bfqq->entity.parent != bfqd->burst_parent_entity) { -+ bfqd->large_burst = false; -+ bfq_reset_burst_list(bfqd, bfqq); -+ bfq_log_bfqq(bfqd, bfqq, -+ "late activation or different group"); -+ goto end; -+ } -+ -+ /* -+ * If we get here, then bfqq is being activated shortly after the -+ * last queue. So, if the current burst is also large, we can mark -+ * bfqq as belonging to this large burst immediately. -+ */ -+ if (bfqd->large_burst) { -+ bfq_log_bfqq(bfqd, bfqq, "marked in burst"); -+ bfq_mark_bfqq_in_large_burst(bfqq); -+ goto end; -+ } -+ -+ /* -+ * If we get here, then a large-burst state has not yet been -+ * reached, but bfqq is being activated shortly after the last -+ * queue. Then we add bfqq to the burst. -+ */ -+ bfq_add_to_burst(bfqd, bfqq); -+end: -+ /* -+ * At this point, bfqq either has been added to the current -+ * burst or has caused the current burst to terminate and a -+ * possible new burst to start. 
In particular, in the second -+ * case, bfqq has become the first queue in the possible new -+ * burst. In both cases last_ins_in_burst needs to be moved -+ * forward. -+ */ -+ bfqd->last_ins_in_burst = jiffies; -+ -+} -+ -+static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ if (entity->budget < entity->service) { -+ pr_crit("budget %d service %d\n", -+ entity->budget, entity->service); -+ BUG(); -+ } -+ return entity->budget - entity->service; -+} -+ -+/* -+ * If enough samples have been computed, return the current max budget -+ * stored in bfqd, which is dynamically updated according to the -+ * estimated disk peak rate; otherwise return the default max budget -+ */ -+static int bfq_max_budget(struct bfq_data *bfqd) -+{ -+ if (bfqd->budgets_assigned < bfq_stats_min_budgets) -+ return bfq_default_max_budget; -+ else -+ return bfqd->bfq_max_budget; -+} -+ -+/* -+ * Return min budget, which is a fraction of the current or default -+ * max budget (trying with 1/32) -+ */ -+static int bfq_min_budget(struct bfq_data *bfqd) -+{ -+ if (bfqd->budgets_assigned < bfq_stats_min_budgets) -+ return bfq_default_max_budget / 32; -+ else -+ return bfqd->bfq_max_budget / 32; -+} -+ -+static void bfq_bfqq_expire(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ bool compensate, -+ enum bfqq_expiration reason); -+ -+/* -+ * The next function, invoked after the input queue bfqq switches from -+ * idle to busy, updates the budget of bfqq. The function also tells -+ * whether the in-service queue should be expired, by returning -+ * true. The purpose of expiring the in-service queue is to give bfqq -+ * the chance to possibly preempt the in-service queue, and the reason -+ * for preempting the in-service queue is to achieve one of the two -+ * goals below. -+ * -+ * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has -+ * expired because it has remained idle. 
In particular, bfqq may have -+ * expired for one of the following two reasons: -+ * -+ * - BFQ_BFQQ_NO_MORE_REQUEST: bfqq did not enjoy any device idling and -+ * did not manage to issue a new request before its last request -+ * was served; -+ * -+ * - BFQ_BFQQ_TOO_IDLE: bfqq did enjoy device idling, but did not issue -+ * a new request before the expiration of the idling-time. -+ * -+ * Even if bfqq has expired for one of the above reasons, the process -+ * associated with the queue may nevertheless be issuing requests greedily, -+ * and thus be sensitive to the bandwidth it receives (bfqq may have -+ * remained idle for other reasons: high CPU load, bfqq not enjoying -+ * idling, I/O throttling somewhere in the path from the process to -+ * the I/O scheduler, ...). But if, after every expiration for one of -+ * the above two reasons, bfqq has to wait for the service of at least -+ * one full budget of another queue before being served again, then -+ * bfqq is likely to get a much lower bandwidth or resource time than -+ * its reserved ones. To address this issue, two countermeasures need -+ * to be taken. -+ * -+ * First, the budget and the timestamps of bfqq need to be updated in -+ * a special way on bfqq reactivation: they need to be updated as if -+ * bfqq did not remain idle and did not expire. In fact, if they are -+ * computed as if bfqq expired and remained idle until reactivation, -+ * then the process associated with bfqq is treated as if, instead of -+ * being greedy, it stopped issuing requests when bfqq remained idle, -+ * and restarts issuing requests only on this reactivation. In other -+ * words, the scheduler does not help the process recover the "service -+ * hole" between bfqq expiration and reactivation. As a consequence, -+ * the process receives a lower bandwidth than its reserved one.
In -+ * contrast, to recover this hole, the budget must be updated as if -+ * bfqq was not expired at all before this reactivation, i.e., it must -+ * be set to the value of the remaining budget when bfqq was -+ * expired. Along the same lines, timestamps need to be assigned the -+ * value they had the last time bfqq was selected for service, i.e., -+ * before its last expiration. Thus timestamps need to be back-shifted -+ * with respect to their normal computation (see [1] for more details -+ * on this tricky aspect). -+ * -+ * Secondly, to allow the process to recover the hole, the in-service -+ * queue must be expired too, to give bfqq the chance to preempt it -+ * immediately. In fact, if bfqq has to wait for a full budget of the -+ * in-service queue to be completed, then it may become impossible to -+ * let the process recover the hole, even if the back-shifted -+ * timestamps of bfqq are lower than those of the in-service queue. If -+ * this happens for most or all of the holes, then the process may not -+ * receive its reserved bandwidth. In this respect, it is worth noting -+ * that, since the service of outstanding requests is unpreemptible, a -+ * small fraction of the holes may nevertheless be unrecoverable, thereby -+ * causing a small loss of bandwidth. -+ * -+ * The last important point is detecting whether bfqq does need this -+ * bandwidth recovery. In this respect, the next function deems the -+ * process associated with bfqq greedy, and thus allows it to recover -+ * the hole, if: 1) the process is waiting for the arrival of a new -+ * request (which implies that bfqq expired for one of the above two -+ * reasons), and 2) such a request has arrived soon. The first -+ * condition is controlled through the flag non_blocking_wait_rq, -+ * while the second through the argument arrived_in_time. If both -+ * conditions hold, then the function computes the budget in the -+ * above-described special way, and signals that the in-service queue -+ * should be expired.
Timestamp back-shifting is done later in -+ * __bfq_activate_entity. -+ * -+ * 2. Reduce latency. Even if timestamps are not back-shifted to let -+ * the process associated with bfqq recover a service hole, bfqq may -+ * nevertheless happen to have, after being (re)activated, a lower finish -+ * timestamp than the in-service queue. That is, the next budget of -+ * bfqq may have to be completed before the one of the in-service -+ * queue. If this is the case, then preempting the in-service queue -+ * allows this goal to be achieved, apart from the unpreemptible, -+ * outstanding requests mentioned above. -+ * -+ * Unfortunately, regardless of which of the above two goals one wants -+ * to achieve, service trees need first to be updated to know whether -+ * the in-service queue must be preempted. To have service trees -+ * correctly updated, the in-service queue must be expired and -+ * rescheduled, and bfqq must be scheduled too. This is one of the -+ * most costly operations (in future versions, the scheduling -+ * mechanism may be re-designed in such a way as to make it possible to -+ * know whether preemption is needed without needing to update service -+ * trees). In addition, queue preemptions almost always cause random -+ * I/O, and thus loss of throughput. Because of these facts, the next -+ * function adopts the following simple scheme to avoid both costly -+ * operations and too frequent preemptions: it requests the expiration -+ * of the in-service queue (unconditionally) only for queues that need -+ * to recover a hole, or that either are weight-raised or deserve to -+ * be weight-raised.
-+ */ -+static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ bool arrived_in_time, -+ bool wr_or_deserves_wr) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ /* -+ * In the next compound condition, we check also whether there -+ * is some budget left, because otherwise there is no point in -+ * trying to go on serving bfqq with this same budget: bfqq -+ * would be expired immediately after being selected for -+ * service. This would only cause useless overhead. -+ */ -+ if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time && -+ bfq_bfqq_budget_left(bfqq) > 0) { -+ /* -+ * We do not clear the flag non_blocking_wait_rq here, as -+ * the latter is used in bfq_activate_bfqq to signal -+ * that timestamps need to be back-shifted (and is -+ * cleared right after). -+ */ -+ -+ /* -+ * In the next assignment we rely on the fact that neither -+ * entity->service nor entity->budget is updated -+ * on expiration if bfqq is empty (see -+ * __bfq_bfqq_recalc_budget). Thus both quantities -+ * remain unchanged after such an expiration, and the -+ * following statement therefore assigns to -+ * entity->budget the remaining budget on such an -+ * expiration. -+ */ -+ BUG_ON(bfqq->max_budget < 0); -+ entity->budget = min_t(unsigned long, -+ bfq_bfqq_budget_left(bfqq), -+ bfqq->max_budget); -+ -+ BUG_ON(entity->budget < 0); -+ -+ /* -+ * At this point, we have used entity->service to get -+ * the budget left (needed for updating -+ * entity->budget). Thus we finally can, and have to, -+ * reset entity->service. The latter must be reset -+ * because bfqq would otherwise be charged again for -+ * the service it has received during its previous -+ * service slot(s). -+ */ -+ entity->service = 0; -+ -+ return true; -+ } -+ -+ /* -+ * We can finally complete expiration, by setting service to 0.
-+ * The budget becomes the larger of max_budget and the value -+ * returned by bfq_serv_to_charge() for next_rq. -+ */ -+ entity->service = 0; -+ BUG_ON(bfqq->max_budget < 0); -+ entity->budget = max_t(unsigned long, bfqq->max_budget, -+ bfq_serv_to_charge(bfqq->next_rq, bfqq)); -+ BUG_ON(entity->budget < 0); -+ -+ bfq_clear_bfqq_non_blocking_wait_rq(bfqq); -+ return wr_or_deserves_wr; -+} -+ -+/* -+ * Return the farthest past time instant according to jiffies -+ * macros. -+ */ -+static unsigned long bfq_smallest_from_now(void) -+{ -+ return jiffies - MAX_JIFFY_OFFSET; -+} -+/* -+ * Update the weight-raising state of bfqq on a request arrival. If -+ * bfqq was not weight-raised (old_wr_coeff == 1) and wr_or_deserves_wr -+ * holds, start an interactive or a soft real-time weight-raising -+ * period and cap the budget of bfqq. If bfqq was already -+ * weight-raised, refresh, end or switch the period according to -+ * interactive, in_burst and soft_rt. -+ */ -+static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ unsigned int old_wr_coeff, -+ bool wr_or_deserves_wr, -+ bool interactive, -+ bool in_burst, -+ bool soft_rt) -+{ -+ if (old_wr_coeff == 1 && wr_or_deserves_wr) { -+ /* start a weight-raising period */ -+ if (interactive) { -+ bfqq->service_from_wr = 0; -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ } else { -+ /* -+ * No interactive weight raising in progress -+ * here: assign minus infinity to -+ * wr_start_at_switch_to_srt, to make sure -+ * that, at the end of the soft-real-time -+ * weight-raising period that is starting -+ * now, no interactive weight-raising period -+ * may be wrongly considered as still in -+ * progress (and thus actually started by -+ * mistake). -+ */ -+ bfqq->wr_start_at_switch_to_srt = -+ bfq_smallest_from_now(); -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff * -+ BFQ_SOFTRT_WEIGHT_FACTOR; -+ bfqq->wr_cur_max_time = -+ bfqd->bfq_wr_rt_max_time; -+ } -+ /* -+ * If needed, further reduce budget to make sure it is -+ * close to bfqq's backlog, so as to reduce the -+ * scheduling-error component due to a too large -+ * budget. Do not care about throughput consequences, -+ * but only about latency. Finally, do not assign a -+ * too small budget either, to avoid increasing -+ * latency by causing too frequent expirations.
-+ */ -+ bfqq->entity.budget = min_t(unsigned long, -+ bfqq->entity.budget, -+ 2 * bfq_min_budget(bfqd)); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "wrais starting at %lu, rais_max_time %u", -+ jiffies, -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } else if (old_wr_coeff > 1) { -+ if (interactive) { /* update wr coeff and duration */ -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ } else if (in_burst) { /* in a large burst: end weight raising */ -+ bfqq->wr_coeff = 1; -+ bfq_log_bfqq(bfqd, bfqq, -+ "wrais ending at %lu, rais_max_time %u", -+ jiffies, -+ jiffies_to_msecs(bfqq-> -+ wr_cur_max_time)); -+ } else if (soft_rt) { -+ /* -+ * The application is now or still meeting the -+ * requirements for being deemed soft rt. We -+ * can then correctly and safely (re)charge -+ * the weight-raising duration for the -+ * application with the weight-raising -+ * duration for soft rt applications. -+ * -+ * In particular, doing this recharge now, i.e., -+ * before the weight-raising period for the -+ * application finishes, reduces the probability -+ * of the following negative scenario: -+ * 1) the weight of a soft rt application is -+ * raised at startup (as for any newly -+ * created application), -+ * 2) since the application is not interactive, -+ * at a certain time weight-raising is -+ * stopped for the application, -+ * 3) at that time the application happens to -+ * still have pending requests, and hence -+ * is destined to not have a chance to be -+ * deemed soft rt before these requests are -+ * completed (see the comments to the -+ * function bfq_bfqq_softrt_next_start() -+ * for details on soft rt detection), -+ * 4) these pending requests experience a high -+ * latency because the application is not -+ * weight-raised while they are pending.
-+ */ -+ if (bfqq->wr_cur_max_time != -+ bfqd->bfq_wr_rt_max_time) { -+ bfqq->wr_start_at_switch_to_srt = -+ bfqq->last_wr_start_finish; -+ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); -+ -+ bfqq->wr_cur_max_time = -+ bfqd->bfq_wr_rt_max_time; -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff * -+ BFQ_SOFTRT_WEIGHT_FACTOR; -+ bfq_log_bfqq(bfqd, bfqq, -+ "switching to soft_rt wr"); -+ } else -+ bfq_log_bfqq(bfqd, bfqq, -+ "moving forward soft_rt wr duration"); -+ bfqq->last_wr_start_finish = jiffies; -+ } -+ } -+} -+/* -+ * Return true if bfqq has no dispatched request and more than -+ * bfq_wr_min_idle_time has elapsed since bfqq->budget_timeout. -+ */ -+static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ return bfqq->dispatched == 0 && -+ time_is_before_jiffies( -+ bfqq->budget_timeout + -+ bfqd->bfq_wr_min_idle_time); -+} -+/* -+ * Handle the switch of bfqq to busy caused by the arrival of rq: -+ * compute the in_burst, soft_rt and interactive flags, update budget -+ * and weight-raising state, add bfqq to the busy queues and expire the -+ * in-service queue if bfqq may need to preempt it. The interactive -+ * flag is also returned to the caller through *interactive. -+ */ -+static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ int old_wr_coeff, -+ struct request *rq, -+ bool *interactive) -+{ -+ bool soft_rt, in_burst, wr_or_deserves_wr, -+ bfqq_wants_to_preempt, -+ idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq), -+ /* -+ * See the comments on -+ * bfq_bfqq_update_budg_for_activation for -+ * details on the usage of the next variable.
-+ */ -+ arrived_in_time = ktime_get_ns() <= -+ RQ_BIC(rq)->ttime.last_end_request + -+ bfqd->bfq_slice_idle * 3; -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "bfq_add_request non-busy: " -+ "jiffies %lu, in_time %d, idle_long %d busyw %d " -+ "wr_coeff %u", -+ jiffies, arrived_in_time, -+ idle_for_long_time, -+ bfq_bfqq_non_blocking_wait_rq(bfqq), -+ old_wr_coeff); -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ -+ BUG_ON(bfqq == bfqd->in_service_queue); -+ bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, rq->cmd_flags); -+ -+ /* -+ * bfqq deserves to be weight-raised if: -+ * - it is sync, -+ * - it does not belong to a large burst, -+ * - it has been idle for enough time or is soft real-time, -+ * - it is linked to a bfq_io_cq (it is not shared in any sense) -+ */ -+ in_burst = bfq_bfqq_in_large_burst(bfqq); -+ soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && -+ !in_burst && -+ time_is_before_jiffies(bfqq->soft_rt_next_start) && -+ bfqq->dispatched == 0; -+ *interactive = -+ !in_burst && -+ idle_for_long_time; -+ wr_or_deserves_wr = bfqd->low_latency && -+ (bfqq->wr_coeff > 1 || -+ (bfq_bfqq_sync(bfqq) && -+ bfqq->bic && (*interactive || soft_rt))); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "bfq_add_request: " -+ "in_burst %d, " -+ "soft_rt %d (next %lu), inter %d, bic %p", -+ bfq_bfqq_in_large_burst(bfqq), soft_rt, -+ bfqq->soft_rt_next_start, -+ *interactive, -+ bfqq->bic); -+ -+ /* -+ * Using the last flag, update budget and check whether bfqq -+ * may want to preempt the in-service queue. -+ */ -+ bfqq_wants_to_preempt = -+ bfq_bfqq_update_budg_for_activation(bfqd, bfqq, -+ arrived_in_time, -+ wr_or_deserves_wr); -+ -+ /* -+ * If bfqq happened to be activated in a burst, but has been -+ * idle for much more than an interactive queue, then we -+ * assume that, in the overall I/O initiated in the burst, the -+ * I/O associated with bfqq is finished. So bfqq does not need -+ * to be treated as a queue belonging to a burst -+ * anymore.
Accordingly, we reset bfqq's in_large_burst flag -+ * if set, and remove bfqq from the burst list if it's -+ * there. We do not decrement burst_size, because the fact -+ * that bfqq does not need to belong to the burst list any -+ * more does not invalidate the fact that bfqq was created in -+ * a burst. -+ */ -+ if (likely(!bfq_bfqq_just_created(bfqq)) && -+ idle_for_long_time && -+ time_is_before_jiffies( -+ bfqq->budget_timeout + -+ msecs_to_jiffies(10000))) { -+ hlist_del_init(&bfqq->burst_list_node); -+ bfq_clear_bfqq_in_large_burst(bfqq); -+ } -+ -+ bfq_clear_bfqq_just_created(bfqq); -+ -+ if (!bfq_bfqq_IO_bound(bfqq)) { -+ if (arrived_in_time) { -+ bfqq->requests_within_timer++; -+ if (bfqq->requests_within_timer >= -+ bfqd->bfq_requests_within_timer) -+ bfq_mark_bfqq_IO_bound(bfqq); -+ } else -+ bfqq->requests_within_timer = 0; -+ bfq_log_bfqq(bfqd, bfqq, "requests in time %d", -+ bfqq->requests_within_timer); -+ } -+ -+ if (bfqd->low_latency) { -+ if (unlikely(time_is_after_jiffies(bfqq->split_time))) -+ /* wraparound */ -+ bfqq->split_time = -+ jiffies - bfqd->bfq_wr_min_idle_time - 1; -+ -+ if (time_is_before_jiffies(bfqq->split_time + -+ bfqd->bfq_wr_min_idle_time)) { -+ bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq, -+ old_wr_coeff, -+ wr_or_deserves_wr, -+ *interactive, -+ in_burst, -+ soft_rt); -+ -+ if (old_wr_coeff != bfqq->wr_coeff) -+ bfqq->entity.prio_changed = 1; -+ } -+ } -+ -+ bfqq->last_idle_bklogged = jiffies; -+ bfqq->service_from_backlogged = 0; -+ bfq_clear_bfqq_softrt_update(bfqq); -+ -+ bfq_add_bfqq_busy(bfqd, bfqq); -+ -+ /* -+ * Expire in-service queue only if preemption may be needed -+ * for guarantees. In this respect, the function -+ * next_queue_may_preempt just checks a simple, necessary -+ * condition, and not a sufficient condition based on -+ * timestamps.
In fact, for the latter condition to be -+ * evaluated, timestamps would need first to be updated, and -+ * this operation is quite costly (see the comments on the -+ * function bfq_bfqq_update_budg_for_activation). -+ */ -+ if (bfqd->in_service_queue && bfqq_wants_to_preempt && -+ bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff && -+ next_queue_may_preempt(bfqd)) { -+ struct bfq_queue *in_serv = -+ bfqd->in_service_queue; -+ BUG_ON(in_serv == bfqq); -+ -+ bfq_bfqq_expire(bfqd, bfqd->in_service_queue, -+ false, BFQ_BFQQ_PREEMPTED); -+ } -+} -+/* -+ * Add rq to the queue it belongs to: update the queued counters, -+ * insert rq in the sort_list, recompute next_rq, then either handle -+ * the switch of the queue to busy or, if the queue is already busy, -+ * possibly start weight raising for it. Finally update -+ * last_wr_start_finish if needed. -+ */ -+static void bfq_add_request(struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ struct bfq_data *bfqd = bfqq->bfqd; -+ struct request *next_rq, *prev; -+ unsigned int old_wr_coeff = bfqq->wr_coeff; -+ bool interactive = false; -+ -+ bfq_log_bfqq(bfqd, bfqq, "size %u %s", -+ blk_rq_sectors(rq), rq_is_sync(rq) ? "S" : "A"); -+ -+ if (bfqq->wr_coeff > 1) /* queue is being weight-raised */ -+ bfq_log_bfqq(bfqd, bfqq, -+ "raising period dur %u/%u msec, old coeff %u, w %d(%d)", -+ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), -+ jiffies_to_msecs(bfqq->wr_cur_max_time), -+ bfqq->wr_coeff, -+ bfqq->entity.weight, bfqq->entity.orig_weight); -+ -+ bfqq->queued[rq_is_sync(rq)]++; -+ bfqd->queued++; -+ -+ elv_rb_add(&bfqq->sort_list, rq); -+ -+ /* -+ * Check if this request is a better next-to-serve candidate. -+ */ -+ prev = bfqq->next_rq; -+ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); -+ BUG_ON(!next_rq); -+ bfqq->next_rq = next_rq; -+ -+ /* -+ * Adjust the position of bfqq in the request-position tree, -+ * if next_rq changes. -+ */ -+ if (prev != bfqq->next_rq) -+ bfq_pos_tree_add_move(bfqd, bfqq); -+ -+ if (!bfq_bfqq_busy(bfqq)) /* switching to busy ...
*/ -+ bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff, -+ rq, &interactive); -+ else { -+ if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) && -+ time_is_before_jiffies( -+ bfqq->last_wr_start_finish + -+ bfqd->bfq_wr_min_inter_arr_async)) { -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ -+ bfqd->wr_busy_queues++; -+ BUG_ON(bfqd->wr_busy_queues > bfq_tot_busy_queues(bfqd)); -+ bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqd, bfqq, -+ "non-idle wrais starting, " -+ "wr_max_time %u wr_busy %d", -+ jiffies_to_msecs(bfqq->wr_cur_max_time), -+ bfqd->wr_busy_queues); -+ } -+ if (prev != bfqq->next_rq) -+ bfq_updated_next_req(bfqd, bfqq); -+ } -+ -+ /* -+ * Assign jiffies to last_wr_start_finish in the following -+ * cases: -+ * -+ * . if bfqq is not going to be weight-raised, because, for -+ * non weight-raised queues, last_wr_start_finish stores the -+ * arrival time of the last request; as of now, this piece -+ * of information is used only for deciding whether to -+ * weight-raise async queues -+ * -+ * . if bfqq is not weight-raised, because, if bfqq is now -+ * switching to weight-raised, then last_wr_start_finish -+ * stores the time when weight-raising starts -+ * -+ * . if bfqq is interactive, because, regardless of whether -+ * bfqq is currently weight-raised, the weight-raising -+ * period must start or restart (this case is considered -+ * separately because it is not detected by the above -+ * conditions, if bfqq is already weight-raised) -+ * -+ * last_wr_start_finish has to be updated also if bfqq is soft -+ * real-time, because the weight-raising period is constantly -+ * restarted on idle-to-busy transitions for these queues, but -+ * this is already done in bfq_bfqq_handle_idle_busy_switch if -+ * needed.
-+ */ -+ if (bfqd->low_latency && -+ (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive)) -+ bfqq->last_wr_start_finish = jiffies; -+} -+/* -+ * Look, in the queue that the bic of the current task maps to for the -+ * sync/async type of bio, for the request returned by elv_rb_find() -+ * for the end sector of bio. Return NULL if the task has no bic or no -+ * such queue. -+ */ -+static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, -+ struct bio *bio) -+{ -+ struct task_struct *tsk = current; -+ struct bfq_io_cq *bic; -+ struct bfq_queue *bfqq; -+ -+ bic = bfq_bic_lookup(bfqd, tsk->io_context); -+ if (!bic) -+ return NULL; -+ -+ bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); -+ if (bfqq) -+ return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); -+ -+ return NULL; -+} -+/* -+ * Return the absolute distance between last_pos and the position of -+ * rq, or 0 if last_pos is 0. -+ */ -+static sector_t get_sdist(sector_t last_pos, struct request *rq) -+{ -+ sector_t sdist = 0; -+ -+ if (last_pos) { -+ if (last_pos < blk_rq_pos(rq)) -+ sdist = blk_rq_pos(rq) - last_pos; -+ else -+ sdist = last_pos - blk_rq_pos(rq); -+ } -+ -+ return sdist; -+} -+/* Count one more request in the driver. */ -+static void bfq_activate_request(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ bfqd->rq_in_driver++; -+} -+/* Count one less request in the driver. */ -+static void bfq_deactivate_request(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ -+ BUG_ON(bfqd->rq_in_driver == 0); -+ bfqd->rq_in_driver--; -+} -+/* -+ * Remove rq from its queue: update next_rq, the queued counters and -+ * the sort_list. If the queue becomes empty, take it out of the busy -+ * queues (unless it is in service) and out of the request-position -+ * tree; otherwise update its position in that tree. -+ */ -+static void bfq_remove_request(struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ struct bfq_data *bfqd = bfqq->bfqd; -+ const int sync = rq_is_sync(rq); -+ -+ /* -+ * NOTE: -+ * (bfqq->entity.service > bfqq->entity.budget) may hold here, -+ * in case of forced dispatches.
-+ */ -+ -+ if (bfqq->next_rq == rq) { -+ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); -+ bfq_updated_next_req(bfqd, bfqq); -+ } -+ -+ if (rq->queuelist.prev != &rq->queuelist) -+ list_del_init(&rq->queuelist); -+ BUG_ON(bfqq->queued[sync] == 0); -+ bfqq->queued[sync]--; -+ bfqd->queued--; -+ elv_rb_del(&bfqq->sort_list, rq); -+ -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { -+ bfqq->next_rq = NULL; -+ -+ BUG_ON(bfqq->entity.budget < 0); -+ -+ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) { -+ BUG_ON(bfqq->ref < 2); /* referred by rq and on tree */ -+ bfq_del_bfqq_busy(bfqd, bfqq, false); -+ /* -+ * bfqq emptied. In normal operation, when -+ * bfqq is empty, bfqq->entity.service and -+ * bfqq->entity.budget must contain, -+ * respectively, the service received and the -+ * budget used last time bfqq emptied. These -+ * facts do not hold in this case, as at least -+ * this last removal occurred while bfqq is -+ * not in service. To avoid inconsistencies, -+ * reset both bfqq->entity.service and -+ * bfqq->entity.budget, if bfqq has still a -+ * process that may issue I/O requests to it. -+ */ -+ bfqq->entity.budget = bfqq->entity.service = 0; -+ } -+ -+ /* -+ * Remove queue from request-position tree as it is empty.
-+ */ -+ if (bfqq->pos_root) { -+ rb_erase(&bfqq->pos_node, bfqq->pos_root); -+ bfqq->pos_root = NULL; -+ } -+ } else { -+ BUG_ON(!bfqq->next_rq); -+ bfq_pos_tree_add_move(bfqd, bfqq); -+ } -+ -+ if (rq->cmd_flags & REQ_META) { -+ BUG_ON(bfqq->meta_pending == 0); -+ bfqq->meta_pending--; -+ } -+ bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); -+} -+/* -+ * Look for a request that bio can be front-merged into. On success, -+ * store the request in *req and return ELEVATOR_FRONT_MERGE; otherwise -+ * return ELEVATOR_NO_MERGE. -+ */ -+static enum elv_merge bfq_merge(struct request_queue *q, struct request **req, -+ struct bio *bio) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct request *__rq; -+ -+ __rq = bfq_find_rq_fmerge(bfqd, bio); -+ if (__rq && elv_bio_merge_ok(__rq, bio)) { -+ *req = __rq; -+ return ELEVATOR_FRONT_MERGE; -+ } -+ -+ return ELEVATOR_NO_MERGE; -+} -+/* -+ * After a front merge, if req now starts before its predecessor in -+ * the rb-tree, re-insert req in the sort_list of its queue and -+ * recompute the next request to serve for that queue. -+ */ -+static void bfq_merged_request(struct request_queue *q, struct request *req, -+ enum elv_merge type) -+{ -+ if (type == ELEVATOR_FRONT_MERGE && -+ rb_prev(&req->rb_node) && -+ blk_rq_pos(req) < -+ blk_rq_pos(container_of(rb_prev(&req->rb_node), -+ struct request, rb_node))) { -+ struct bfq_queue *bfqq = RQ_BFQQ(req); -+ struct bfq_data *bfqd = bfqq->bfqd; -+ struct request *prev, *next_rq; -+ -+ /* Reposition request in its sort_list */ -+ elv_rb_del(&bfqq->sort_list, req); -+ elv_rb_add(&bfqq->sort_list, req); -+ /* Choose next request to be served for bfqq */ -+ prev = bfqq->next_rq; -+ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, -+ bfqd->last_position); -+ BUG_ON(!next_rq); -+ bfqq->next_rq = next_rq; -+ /* -+ * If next_rq changes, update both the queue's budget to -+ * fit the new request and the queue's position in its -+ * rq_pos_tree.
-+ */ -+ if (prev != bfqq->next_rq) { -+ bfq_updated_next_req(bfqd, bfqq); -+ bfq_pos_tree_add_move(bfqd, bfqq); -+ } -+ } -+} -+/* Account a merged bio in the statistics of the group of req's queue. */ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+static void bfq_bio_merged(struct request_queue *q, struct request *req, -+ struct bio *bio) -+{ -+ bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio->bi_opf); -+} -+#endif -+/* -+ * Handle the merge of next into rq: possibly let rq take the fifo -+ * position and time of next, make rq the next request of its queue if -+ * next was, then remove next and update the merge statistics. -+ */ -+static void bfq_merged_requests(struct request_queue *q, struct request *rq, -+ struct request *next) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next); -+ -+ /* -+ * If next and rq belong to the same bfq_queue and next is older -+ * than rq, then reposition rq in the fifo (by substituting next -+ * with rq). Otherwise, if next and rq belong to different -+ * bfq_queues, never reposition rq: in fact, we would have to -+ * reposition it with respect to next's position in its own fifo, -+ * which would most certainly be too expensive with respect to -+ * the benefits. -+ */ -+ if (bfqq == next_bfqq && -+ !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && -+ next->fifo_time < rq->fifo_time) { -+ list_del_init(&rq->queuelist); -+ list_replace_init(&next->queuelist, &rq->queuelist); -+ rq->fifo_time = next->fifo_time; -+ } -+ -+ if (bfqq->next_rq == next) -+ bfqq->next_rq = rq; -+ -+ bfq_remove_request(next); -+ bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); -+} -+ -+/* -+ * End weight raising for bfqq: set wr_coeff back to 1, decrement -+ * wr_busy_queues if bfqq is busy, and flag a weight change. Must be -+ * called with bfqq != NULL. -+ */ -+static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) -+{ -+ BUG_ON(!bfqq); -+ -+ if (bfq_bfqq_busy(bfqq)) { -+ bfqq->bfqd->wr_busy_queues--; -+ BUG_ON(bfqq->bfqd->wr_busy_queues < 0); -+ } -+ bfqq->wr_coeff = 1; -+ bfqq->wr_cur_max_time = 0; -+ bfqq->last_wr_start_finish = jiffies; -+ /* -+ * Trigger a weight change on the next invocation of -+ * __bfq_entity_update_weight_prio.
-+ */ -+ bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "wrais ending at %lu, rais_max_time %u", -+ bfqq->last_wr_start_finish, -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "wr_busy %d", -+ bfqq->bfqd->wr_busy_queues); -+} -+/* End weight raising for all the existing async queues of bfqg. */ -+static void bfq_end_wr_async_queues(struct bfq_data *bfqd, -+ struct bfq_group *bfqg) -+{ -+ int i, j; -+ -+ for (i = 0; i < 2; i++) -+ for (j = 0; j < IOPRIO_BE_NR; j++) -+ if (bfqg->async_bfqq[i][j]) -+ bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); -+ if (bfqg->async_idle_bfqq) -+ bfq_bfqq_end_wr(bfqg->async_idle_bfqq); -+} -+/* -+ * End weight raising for every queue in the active and idle lists of -+ * bfqd, then call bfq_end_wr_async(), all with the queue lock held. -+ */ -+static void bfq_end_wr(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq; -+ -+ spin_lock_irq(bfqd->queue->queue_lock); -+ -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) -+ bfq_bfqq_end_wr(bfqq); -+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) -+ bfq_bfqq_end_wr(bfqq); -+ bfq_end_wr_async(bfqd); -+ -+ spin_unlock_irq(bfqd->queue->queue_lock); -+} -+/* -+ * Return the start sector of io_struct, which is treated as a request -+ * if the request argument is true and as a bio otherwise. -+ */ -+static sector_t bfq_io_struct_pos(void *io_struct, bool request) -+{ -+ if (request) -+ return blk_rq_pos(io_struct); -+ else -+ return ((struct bio *)io_struct)->bi_iter.bi_sector; -+} -+/* -+ * Return non-zero if the position of io_struct is at most -+ * BFQQ_CLOSE_THR sectors away from sector. -+ */ -+static int bfq_rq_close_to_sector(void *io_struct, bool request, -+ sector_t sector) -+{ -+ return abs(bfq_io_struct_pos(io_struct, request) - sector) <= -+ BFQQ_CLOSE_THR; -+} -+/* -+ * Look, in the rq_pos_tree of the group of bfqq, for a queue whose -+ * next request is at sector or close to it (see -+ * bfq_rq_close_to_sector()). Return NULL if there is none. -+ */ -+static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ sector_t sector) -+{ -+ struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; -+ struct rb_node *parent, *node; -+ struct bfq_queue *__bfqq; -+ -+ if (RB_EMPTY_ROOT(root)) -+ return NULL; -+ -+ /* -+ * First, if we find a request starting at the end of the last -+ * request, choose it.
-+ */ -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL); -+ if (__bfqq) -+ return __bfqq; -+ -+ /* -+ * If the exact sector wasn't found, the parent of the NULL leaf -+ * will contain the closest sector (rq_pos_tree sorted by -+ * next_request position). -+ */ -+ __bfqq = rb_entry(parent, struct bfq_queue, pos_node); -+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) -+ return __bfqq; -+ -+ if (blk_rq_pos(__bfqq->next_rq) < sector) -+ node = rb_next(&__bfqq->pos_node); -+ else -+ node = rb_prev(&__bfqq->pos_node); -+ if (!node) -+ return NULL; -+ -+ __bfqq = rb_entry(node, struct bfq_queue, pos_node); -+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) -+ return __bfqq; -+ -+ return NULL; -+} -+/* -+ * Return a queue, different from cur_bfqq, whose next request is -+ * close to sector, or NULL if there is none. -+ */ -+static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd, -+ struct bfq_queue *cur_bfqq, -+ sector_t sector) -+{ -+ struct bfq_queue *bfqq; -+ -+ /* -+ * We shall notice if some of the queues are cooperating, -+ * e.g., working closely on the same area of the device. In -+ * that case, we can group them together and: 1) don't waste -+ * time idling, and 2) serve the union of their requests in -+ * the best possible order for throughput. -+ */ -+ bfqq = bfqq_find_close(bfqd, cur_bfqq, sector); -+ if (!bfqq || bfqq == cur_bfqq) -+ return NULL; -+ -+ return bfqq; -+} -+/* -+ * Schedule the redirection of bfqq to the queue at the end of the -+ * ->new_bfqq chain that starts at new_bfqq. Return that queue, or -+ * NULL if the merge cannot be scheduled. -+ */ -+static struct bfq_queue * -+bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) -+{ -+ int process_refs, new_process_refs; -+ struct bfq_queue *__bfqq; -+ -+ /* -+ * If there are no process references on the new_bfqq, then it is -+ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain -+ * may have dropped their last reference (not just their last process -+ * reference). -+ */ -+ if (!bfqq_process_refs(new_bfqq)) -+ return NULL; -+ -+ /* Avoid a circular list and skip interim queue merges.
*/ -+ while ((__bfqq = new_bfqq->new_bfqq)) { -+ if (__bfqq == bfqq) -+ return NULL; -+ new_bfqq = __bfqq; -+ } -+ -+ process_refs = bfqq_process_refs(bfqq); -+ new_process_refs = bfqq_process_refs(new_bfqq); -+ /* -+ * If the process for the bfqq has gone away, there is no -+ * sense in merging the queues. -+ */ -+ if (process_refs == 0 || new_process_refs == 0) -+ return NULL; -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", -+ new_bfqq->pid); -+ -+ /* -+ * Merging is just a redirection: the requests of the process -+ * owning one of the two queues are redirected to the other queue. -+ * The latter queue, in its turn, is set as shared if this is the -+ * first time that the requests of some process are redirected to -+ * it. -+ * -+ * We redirect bfqq to new_bfqq and not the opposite, because we -+ * are in the context of the process owning bfqq, hence we have -+ * the io_cq of this process. So we can immediately configure this -+ * io_cq to redirect the requests of the process to new_bfqq. -+ * -+ * NOTE, even if new_bfqq coincides with the in-service queue, the -+ * io_cq of new_bfqq is not available, because, if the in-service -+ * queue is shared, bfqd->in_service_bic may not point to the -+ * io_cq of the in-service queue. -+ * Redirecting the requests of the process owning bfqq to the -+ * currently in-service queue is in any case the best option, as -+ * we feed the in-service queue with new requests close to the -+ * last request served and, by doing so, hopefully increase the -+ * throughput.
-+ */ -+ bfqq->new_bfqq = new_bfqq; -+ new_bfqq->ref += process_refs; -+ return new_bfqq; -+} -+/* -+ * Return true only if new_bfqq is not too late for merging, neither -+ * queue is in the idle class, both queues have the same ioprio class, -+ * neither is seeky and both are sync. -+ */ -+static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, -+ struct bfq_queue *new_bfqq) -+{ -+ if (bfq_too_late_for_merging(new_bfqq)) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "too late for bfq%d to be merged", -+ new_bfqq->pid); -+ return false; -+ } -+ -+ if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) || -+ (bfqq->ioprio_class != new_bfqq->ioprio_class)) -+ return false; -+ -+ /* -+ * If either of the queues has already been detected as seeky, -+ * then merging it with the other queue is unlikely to lead to -+ * sequential I/O. -+ */ -+ if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq)) -+ return false; -+ -+ /* -+ * Interleaved I/O is known to be done by (some) applications -+ * only for reads, so it does not make sense to merge async -+ * queues. -+ */ -+ if (!bfq_bfqq_sync(bfqq) || !bfq_bfqq_sync(new_bfqq)) -+ return false; -+ -+ return true; -+} -+ -+/* -+ * Attempt to schedule a merge of bfqq with the currently in-service -+ * queue or with a close queue among the scheduled queues. Return -+ * NULL if no merge was scheduled, a pointer to the shared bfq_queue -+ * structure otherwise. -+ * -+ * The OOM queue is not allowed to participate to cooperation: in fact, since -+ * the requests temporarily redirected to the OOM queue could be redirected -+ * again to dedicated queues at any time, the state needed to correctly -+ * handle merging with the OOM queue would be quite complex and expensive -+ * to maintain. Besides, in such a critical condition as an out of memory, -+ * the benefits of queue merging may be little relevant, or even negligible.
-+ * -+ * WARNING: queue merging may impair fairness among non-weight raised -+ * queues, for at least two reasons: 1) the original weight of a -+ * merged queue may change during the merged state, 2) even being the -+ * weight the same, a merged queue may be bloated with many more -+ * requests than the ones produced by its originally-associated -+ * process. -+ */ -+static struct bfq_queue * -+bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ void *io_struct, bool request) -+{ -+ struct bfq_queue *in_service_bfqq, *new_bfqq; -+ -+ /* -+ * Prevent bfqq from being merged if it has been created too -+ * long ago. The idea is that true cooperating processes, and -+ * thus their associated bfq_queues, are supposed to be -+ * created shortly after each other. This is the case, e.g., -+ * for KVM/QEMU and dump I/O threads. Based on this -+ * assumption, the following filtering greatly reduces the -+ * probability that two non-cooperating processes, which just -+ * happen to do close I/O for some short time interval, have -+ * their queues merged by mistake. -+ */ -+ if (bfq_too_late_for_merging(bfqq)) { -+ bfq_log_bfqq(bfqd, bfqq, -+ "would have looked for coop, but too late"); -+ return NULL; -+ } -+ -+ if (bfqq->new_bfqq) -+ return bfqq->new_bfqq; -+ -+ if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) -+ return NULL; -+ -+ /* If there is only one backlogged queue, don't search.
*/ -+ if (bfq_tot_busy_queues(bfqd) == 1) -+ return NULL; -+ -+ in_service_bfqq = bfqd->in_service_queue; -+ -+ if (in_service_bfqq && in_service_bfqq != bfqq && -+ likely(in_service_bfqq != &bfqd->oom_bfqq) && -+ bfq_rq_close_to_sector(io_struct, request, bfqd->in_serv_last_pos) && -+ bfqq->entity.parent == in_service_bfqq->entity.parent && -+ bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) { -+ new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); -+ if (new_bfqq) -+ return new_bfqq; -+ } -+ /* -+ * Check whether there is a cooperator among currently scheduled -+ * queues. The only thing we need is that the bio/request is not -+ * NULL, as we need it to establish whether a cooperator exists. -+ */ -+ new_bfqq = bfq_find_close_cooperator(bfqd, bfqq, -+ bfq_io_struct_pos(io_struct, request)); -+ -+ BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent); -+ -+ if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) && -+ bfq_may_be_close_cooperator(bfqq, new_bfqq)) -+ return bfq_setup_merge(bfqq, new_bfqq); -+ -+ return NULL; -+} -+/* -+ * Save in bfqq->bic the idle-window, burst and weight-raising state -+ * of bfqq. It is invoked on both queues by bfq_merge_bfqqs() before -+ * they are merged. -+ */ -+static void bfq_bfqq_save_state(struct bfq_queue *bfqq) -+{ -+ struct bfq_io_cq *bic = bfqq->bic; -+ -+ /* -+ * If !bfqq->bic, the queue is already shared or its requests -+ * have already been redirected to a shared queue; both idle window -+ * and weight raising state have already been saved. Do nothing. -+ */ -+ if (!bic) -+ return; -+ -+ bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq); -+ bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); -+ bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); -+ bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); -+ if (unlikely(bfq_bfqq_just_created(bfqq) && -+ !bfq_bfqq_in_large_burst(bfqq) && -+ bfqq->bfqd->low_latency)) { -+ /* -+ * bfqq being merged right after being created: bfqq -+ * would have deserved interactive weight raising, but -+ * did not make it to be set in a weight-raised state, -+ * because of this early merge.
Store directly the -+ * weight-raising state that would have been assigned -+ * to bfqq, so as to prevent bfqq from unjustly failing -+ * to enjoy weight raising if split soon. -+ * -+ * No switch to soft real-time weight raising has -+ * occurred for this queue, so also store minus -+ * infinity as the saved switch time: otherwise the -+ * bic would keep whatever value it held before. -+ */ -+ bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff; -+ bic->saved_wr_start_at_switch_to_srt = -+ bfq_smallest_from_now(); -+ bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd); -+ bic->saved_last_wr_start_finish = jiffies; -+ } else { -+ bic->saved_wr_coeff = bfqq->wr_coeff; -+ bic->saved_wr_start_at_switch_to_srt = -+ bfqq->wr_start_at_switch_to_srt; -+ bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; -+ bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; -+ } -+ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); -+} -+/* Take a reference on the io_context of the bic of bfqq, if any. */ -+static void bfq_get_bic_reference(struct bfq_queue *bfqq) -+{ -+ /* -+ * If bfqq->bic has a non-NULL value, the bic to which it belongs -+ * is about to begin using a shared bfq_queue. -+ */ -+ if (bfqq->bic) -+ atomic_long_inc(&bfqq->bic->icq.ioc->refcount); -+} -+/* -+ * Complete the merge of bfqq into new_bfqq: save the state of both -+ * queues, hand the IO_bound flag and weight raising over to new_bfqq, -+ * redirect bic to new_bfqq, mark new_bfqq as coop and release the -+ * process reference to bfqq. -+ */ -+static void -+bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, -+ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) -+{ -+ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", -+ (unsigned long) new_bfqq->pid); -+ /* Save weight raising and idle window of the merged queues */ -+ bfq_bfqq_save_state(bfqq); -+ bfq_bfqq_save_state(new_bfqq); -+ if (bfq_bfqq_IO_bound(bfqq)) -+ bfq_mark_bfqq_IO_bound(new_bfqq); -+ bfq_clear_bfqq_IO_bound(bfqq); -+ -+ /* -+ * If bfqq is weight-raised, then let new_bfqq inherit -+ * weight-raising. To reduce false positives, neglect the case -+ * where bfqq has just been created, but has not yet made it -+ * to be weight-raised (which may happen because EQM may merge -+ * bfqq even before bfq_add_request is executed for the first -+ * time for bfqq). Handling this case would however be very -+ * easy, thanks to the flag just_created.
-+ */ -+ if (new_bfqq->wr_coeff == 1 && bfqq->wr_coeff > 1) { -+ new_bfqq->wr_coeff = bfqq->wr_coeff; -+ new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time; -+ new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish; -+ new_bfqq->wr_start_at_switch_to_srt = -+ bfqq->wr_start_at_switch_to_srt; -+ if (bfq_bfqq_busy(new_bfqq)) { -+ bfqd->wr_busy_queues++; -+ BUG_ON(bfqd->wr_busy_queues > -+ bfq_tot_busy_queues(bfqd)); -+ } -+ -+ new_bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqd, new_bfqq, -+ "wr start after merge with %d, rais_max_time %u", -+ bfqq->pid, -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } -+ -+ if (bfqq->wr_coeff > 1) { /* bfqq has given its wr to new_bfqq */ -+ bfqq->wr_coeff = 1; -+ bfqq->entity.prio_changed = 1; -+ if (bfq_bfqq_busy(bfqq)) { -+ bfqd->wr_busy_queues--; -+ BUG_ON(bfqd->wr_busy_queues < 0); -+ } -+ -+ } -+ -+ bfq_log_bfqq(bfqd, new_bfqq, "wr_busy %d", -+ bfqd->wr_busy_queues); -+ -+ /* -+ * Grab a reference to the bic, to prevent it from being destroyed -+ * before being possibly touched by a bfq_split_bfqq(). -+ */ -+ bfq_get_bic_reference(bfqq); -+ bfq_get_bic_reference(new_bfqq); -+ /* -+ * Merge queues (that is, let bic redirect its requests to new_bfqq) -+ */ -+ bic_set_bfqq(bic, new_bfqq, 1); -+ bfq_mark_bfqq_coop(new_bfqq); -+ /* -+ * new_bfqq now belongs to at least two bics (it is a shared queue): -+ * set new_bfqq->bic to NULL. bfqq either: -+ * - does not belong to any bic any more, and hence bfqq->bic must -+ * be set to NULL, or -+ * - is a queue whose owning bics have already been redirected to a -+ * different queue, hence the queue is destined to not belong to -+ * any bic soon and bfqq->bic is already NULL (therefore the next -+ * assignment causes no harm).
-+ */ -+ new_bfqq->bic = NULL; -+ bfqq->bic = NULL; -+ /* release process reference to bfqq */ -+ bfq_put_queue(bfqq); -+} -+/* -+ * Return true if bio may be merged into rq, i.e., if rq belongs to -+ * the queue that bio would be added to, after a possible early merge -+ * of that queue with a cooperator. -+ */ -+static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq, -+ struct bio *bio) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ bool is_sync = op_is_sync(bio->bi_opf); -+ struct bfq_io_cq *bic; -+ struct bfq_queue *bfqq, *new_bfqq; -+ -+ /* -+ * Disallow merge of a sync bio into an async request. -+ */ -+ if (is_sync && !rq_is_sync(rq)) -+ return false; -+ -+ /* -+ * Lookup the bfqq that this bio will be queued with. Allow -+ * merge only if rq is queued there. -+ * Queue lock is held here. -+ */ -+ bic = bfq_bic_lookup(bfqd, current->io_context); -+ if (!bic) -+ return false; -+ -+ bfqq = bic_to_bfqq(bic, is_sync); -+ /* -+ * We take advantage of this function to perform an early merge -+ * of the queues of possible cooperating processes. -+ */ -+ if (bfqq) { -+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); -+ if (new_bfqq) { -+ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); -+ /* -+ * If we get here, the bio will be queued in the -+ * shared queue, i.e., new_bfqq, so use new_bfqq -+ * to decide whether bio and rq can be merged. -+ */ -+ bfqq = new_bfqq; -+ } -+ } -+ -+ return bfqq == RQ_BFQQ(rq); -+} -+/* Two requests may be merged only if they belong to the same bfq_queue. */ -+static int bfq_allow_rq_merge(struct request_queue *q, struct request *rq, -+ struct request *next) -+{ -+ return RQ_BFQQ(rq) == RQ_BFQQ(next); -+} -+ -+/* -+ * Set the maximum time for the in-service queue to consume its -+ * budget. This prevents seeky processes from lowering the throughput. -+ * In practice, a time-slice service scheme is used with seeky -+ * processes.
-+ */ -+static void bfq_set_budget_timeout(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ unsigned int timeout_coeff; -+ /* -+ * The timeout is not scaled if wr_cur_max_time equals -+ * bfq_wr_rt_max_time; otherwise it is multiplied by the ratio -+ * between the current and the original weight of bfqq. -+ */ -+ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time) -+ timeout_coeff = 1; -+ else -+ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; -+ -+ bfqd->last_budget_start = ktime_get(); -+ -+ bfqq->budget_timeout = jiffies + -+ bfqd->bfq_timeout * timeout_coeff; -+ -+ bfq_log_bfqq(bfqd, bfqq, "%u", -+ jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff)); -+} -+/* -+ * Make bfqq the in-service queue; bfqq may be NULL. For a non-NULL -+ * bfqq, update statistics and flags, possibly move forward the start -+ * of its soft real-time weight-raising period, and set its budget -+ * timeout. -+ */ -+static void __bfq_set_in_service_queue(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ if (bfqq) { -+ bfqg_stats_update_avg_queue_size(bfqq_group(bfqq)); -+ bfq_mark_bfqq_must_alloc(bfqq); -+ bfq_clear_bfqq_fifo_expire(bfqq); -+ -+ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; -+ -+ BUG_ON(bfqq == bfqd->in_service_queue); -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ -+ if (time_is_before_jiffies(bfqq->last_wr_start_finish) && -+ bfqq->wr_coeff > 1 && -+ bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && -+ time_is_before_jiffies(bfqq->budget_timeout)) { -+ /* -+ * For soft real-time queues, move the start -+ * of the weight-raising period forward by the -+ * time the queue has not received any -+ * service. Otherwise, a relatively long -+ * service delay is likely to cause the -+ * weight-raising period of the queue to end, -+ * because of the short duration of the -+ * weight-raising period of a soft real-time -+ * queue. It is worth noting that this move -+ * is not so dangerous for the other queues, -+ * because soft real-time queues are not -+ * greedy. -+ * -+ * To not add a further variable, we use the -+ * overloaded field budget_timeout to -+ * determine for how long the queue has not -+ * received service, i.e., how much time has -+ * elapsed since the queue expired. However, -+ * this is a little imprecise, because -+ * budget_timeout is set to jiffies if bfqq -+ * not only expires, but also remains with no -+ * request.
-+ */ -+ if (time_after(bfqq->budget_timeout, -+ bfqq->last_wr_start_finish)) -+ bfqq->last_wr_start_finish += -+ jiffies - bfqq->budget_timeout; -+ else -+ bfqq->last_wr_start_finish = jiffies; -+ -+ if (time_is_after_jiffies(bfqq->last_wr_start_finish)) { -+ pr_crit( -+ "BFQ WARNING:last %lu budget %lu jiffies %lu", -+ bfqq->last_wr_start_finish, -+ bfqq->budget_timeout, -+ jiffies); -+ pr_crit("diff %lu", jiffies - -+ max_t(unsigned long, -+ bfqq->last_wr_start_finish, -+ bfqq->budget_timeout)); -+ bfqq->last_wr_start_finish = jiffies; -+ } -+ } -+ -+ bfq_set_budget_timeout(bfqd, bfqq); -+ bfq_log_bfqq(bfqd, bfqq, -+ "cur-budget = %d prio_class %d", -+ bfqq->entity.budget, bfqq->ioprio_class); -+ } else -+ bfq_log(bfqd, "NULL"); -+ -+ bfqd->in_service_queue = bfqq; -+} -+ -+/* -+ * Get and set a new queue for service. Return the queue obtained from -+ * bfq_get_next_queue(), as is. -+ */ -+static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd); -+ -+ __bfq_set_in_service_queue(bfqd, bfqq); -+ return bfqq; -+} -+/* -+ * Start the idle slice timer for the in-service queue, which must -+ * have no queued request, unless there is no in-service bic or the -+ * io_context of that bic has no active reference left. -+ */ -+static void bfq_arm_slice_timer(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq = bfqd->in_service_queue; -+ struct bfq_io_cq *bic; -+ u32 sl; -+ -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); -+ -+ /* Processes have exited, don't wait. */ -+ bic = bfqd->in_service_bic; -+ if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0) -+ return; -+ -+ bfq_mark_bfqq_wait_request(bfqq); -+ -+ /* -+ * We don't want to idle for seeks, but we do want to allow -+ * fair distribution of slice time for a process doing back-to-back -+ * seeks. So allow a little bit of time for it to submit a new rq. -+ * -+ * To prevent processes with (partly) seeky workloads from -+ * being too ill-treated, grant them a small fraction of the -+ * assigned budget before reducing the waiting time to -+ * BFQ_MIN_TT. This happened to help reduce latency.
-+ */ -+ sl = bfqd->bfq_slice_idle; -+ /* -+ * Unless the queue is being weight-raised or the scenario is -+ * asymmetric, grant only minimum idle time if the queue -+ * is seeky. A long idling is preserved for a weight-raised -+ * queue, or, more in general, in an asymmetric scenario, -+ * because a long idling is needed for guaranteeing to a queue -+ * its reserved share of the throughput (in particular, it is -+ * needed if the queue has a higher weight than some other -+ * queue). -+ */ -+ if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 && -+ bfq_symmetric_scenario(bfqd)) -+ sl = min_t(u32, sl, BFQ_MIN_TT); -+ -+ bfqd->last_idling_start = ktime_get(); -+ hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl), -+ HRTIMER_MODE_REL); -+ bfqg_stats_set_start_idle_time(bfqq_group(bfqq)); -+ bfq_log(bfqd, "arm idle: %ld/%ld ms", -+ sl / NSEC_PER_MSEC, bfqd->bfq_slice_idle / NSEC_PER_MSEC); -+} -+ -+/* -+ * In autotuning mode, max_budget is dynamically recomputed as the -+ * amount of sectors transferred in timeout at the estimated peak -+ * rate. This enables BFQ to utilize a full timeslice with a full -+ * budget, even if the in-service queue is served at peak rate. And -+ * this maximises throughput with sequential workloads. -+ */ -+static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd) -+{ -+ return (u64)bfqd->peak_rate * USEC_PER_MSEC * -+ jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT; -+} -+ -+/* -+ * Update parameters related to throughput and responsiveness, as a -+ * function of the estimated peak rate. See comments on -+ * bfq_calc_max_budget(), and on the ref_wr_duration array.
-+ */ -+static void update_thr_responsiveness_params(struct bfq_data *bfqd) -+{ -+ if (bfqd->bfq_user_max_budget == 0) { -+ bfqd->bfq_max_budget = -+ bfq_calc_max_budget(bfqd); -+ BUG_ON(bfqd->bfq_max_budget < 0); -+ bfq_log(bfqd, "new max_budget = %d", -+ bfqd->bfq_max_budget); -+ } -+} -+ -+static void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq) -+{ -+ if (rq != NULL) { /* new rq dispatch now, reset accordingly */ -+ bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns() ; -+ bfqd->peak_rate_samples = 1; -+ bfqd->sequential_samples = 0; -+ bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size = -+ blk_rq_sectors(rq); -+ } else /* no new rq dispatched, just reset the number of samples */ -+ bfqd->peak_rate_samples = 0; /* full re-init on next disp. */ -+ -+ bfq_log(bfqd, -+ "at end, sample %u/%u tot_sects %llu", -+ bfqd->peak_rate_samples, bfqd->sequential_samples, -+ bfqd->tot_sectors_dispatched); -+} -+ -+static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) -+{ -+ u32 rate, weight, divisor; -+ -+ /* -+ * For the convergence property to hold (see comments on -+ * bfq_update_peak_rate()) and for the assessment to be -+ * reliable, a minimum number of samples must be present, and -+ * a minimum amount of time must have elapsed. If not so, do -+ * not compute new rate. Just reset parameters, to get ready -+ * for a new evaluation attempt. -+ */ -+ if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES || -+ bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL) { -+ bfq_log(bfqd, -+ "only resetting, delta_first %lluus samples %d", -+ bfqd->delta_from_first>>10, bfqd->peak_rate_samples); -+ goto reset_computation; -+ } -+ -+ /* -+ * If a new request completion has occurred after last -+ * dispatch, then, to approximate the rate at which requests -+ * have been served by the device, it is more precise to -+ * extend the observation interval to the last completion. 
-+ */ -+ bfqd->delta_from_first = -+ max_t(u64, bfqd->delta_from_first, -+ bfqd->last_completion - bfqd->first_dispatch); -+ -+ BUG_ON(bfqd->delta_from_first == 0); -+ /* -+ * Rate computed in sects/usec, and not sects/nsec, for -+ * precision issues. -+ */ -+ rate = div64_ul(bfqd->tot_sectors_dispatched<<BFQ_RATE_SHIFT, -+ div_u64(bfqd->delta_from_first, NSEC_PER_USEC)); -+ -+ bfq_log(bfqd, -+"tot_sects %llu delta_first %lluus rate %llu sects/s (%d)", -+ bfqd->tot_sectors_dispatched, bfqd->delta_from_first>>10, -+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), -+ rate > 20<<BFQ_RATE_SHIFT); -+ -+ /* -+ * Peak rate not updated if: -+ * - the percentage of sequential dispatches is below 3/4 of the -+ * total, and rate is below the current estimated peak rate -+ * - rate is unreasonably high (> 20M sectors/sec) -+ */ -+ if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 && -+ rate <= bfqd->peak_rate) || -+ rate > 20<<BFQ_RATE_SHIFT) { -+ bfq_log(bfqd, -+ "goto reset, samples %u/%u rate/peak %llu/%llu", -+ bfqd->peak_rate_samples, bfqd->sequential_samples, -+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), -+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); -+ goto reset_computation; -+ } else { -+ bfq_log(bfqd, -+ "do update, samples %u/%u rate/peak %llu/%llu", -+ bfqd->peak_rate_samples, bfqd->sequential_samples, -+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), -+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); -+ } -+ -+ /* -+ * We have to update the peak rate, at last! To this purpose, -+ * we use a low-pass filter. We compute the smoothing constant -+ * of the filter as a function of the 'weight' of the new -+ * measured rate. -+ * -+ * As can be seen in next formulas, we define this weight as a -+ * quantity proportional to how sequential the workload is, -+ * and to how long the observation time interval is. -+ * -+ * The weight runs from 0 to 8. 
The maximum value of the -+ * weight, 8, yields the minimum value for the smoothing -+ * constant. At this minimum value for the smoothing constant, -+ * the measured rate contributes for half of the next value of -+ * the estimated peak rate. -+ * -+ * So, the first step is to compute the weight as a function -+ * of how sequential the workload is. Note that the weight -+ * cannot reach 9, because bfqd->sequential_samples cannot -+ * become equal to bfqd->peak_rate_samples, which, in its -+ * turn, holds true because bfqd->sequential_samples is not -+ * incremented for the first sample. -+ */ -+ weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples; -+ -+ /* -+ * Second step: further refine the weight as a function of the -+ * duration of the observation interval. -+ */ -+ weight = min_t(u32, 8, -+ div_u64(weight * bfqd->delta_from_first, -+ BFQ_RATE_REF_INTERVAL)); -+ -+ /* -+ * Divisor ranging from 10, for minimum weight, to 2, for -+ * maximum weight. -+ */ -+ divisor = 10 - weight; -+ BUG_ON(divisor == 0); -+ -+ /* -+ * Finally, update peak rate: -+ * -+ * peak_rate = peak_rate * (divisor-1) / divisor + rate / divisor -+ */ -+ bfqd->peak_rate *= divisor-1; -+ bfqd->peak_rate /= divisor; -+ rate /= divisor; /* smoothing constant alpha = 1/divisor */ -+ -+ bfq_log(bfqd, -+ "divisor %d tmp_peak_rate %llu tmp_rate %u", -+ divisor, -+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT), -+ (u32)((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT)); -+ -+ BUG_ON(bfqd->peak_rate == 0); -+ BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT); -+ -+ bfqd->peak_rate += rate; -+ -+ /* -+ * For a very slow device, bfqd->peak_rate can reach 0 (see -+ * the minimum representable values reported in the comments -+ * on BFQ_RATE_SHIFT). Push to 1 if this happens, to avoid -+ * divisions by zero where bfqd->peak_rate is used as a -+ * divisor. 
-+ */ -+ bfqd->peak_rate = max_t(u32, 1, bfqd->peak_rate); -+ -+ update_thr_responsiveness_params(bfqd); -+ BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT); -+ -+reset_computation: -+ bfq_reset_rate_computation(bfqd, rq); -+} -+ -+/* -+ * Update the read/write peak rate (the main quantity used for -+ * auto-tuning, see update_thr_responsiveness_params()). -+ * -+ * It is not trivial to estimate the peak rate (correctly): because of -+ * the presence of sw and hw queues between the scheduler and the -+ * device components that finally serve I/O requests, it is hard to -+ * say exactly when a given dispatched request is served inside the -+ * device, and for how long. As a consequence, it is hard to know -+ * precisely at what rate a given set of requests is actually served -+ * by the device. -+ * -+ * On the opposite end, the dispatch time of any request is trivially -+ * available, and, from this piece of information, the "dispatch rate" -+ * of requests can be immediately computed. So, the idea in the next -+ * function is to use what is known, namely request dispatch times -+ * (plus, when useful, request completion times), to estimate what is -+ * unknown, namely in-device request service rate. -+ * -+ * The main issue is that, because of the above facts, the rate at -+ * which a certain set of requests is dispatched over a certain time -+ * interval can vary greatly with respect to the rate at which the -+ * same requests are then served. But, since the size of any -+ * intermediate queue is limited, and the service scheme is lossless -+ * (no request is silently dropped), the following obvious convergence -+ * property holds: the number of requests dispatched MUST become -+ * closer and closer to the number of requests completed as the -+ * observation interval grows. This is the key property used in -+ * the next function to estimate the peak service rate as a function -+ * of the observed dispatch rate. 
The function assumes to be invoked -+ * on every request dispatch. -+ */ -+static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) -+{ -+ u64 now_ns = ktime_get_ns(); -+ -+ if (bfqd->peak_rate_samples == 0) { /* first dispatch */ -+ bfq_log(bfqd, -+ "goto reset, samples %d", -+ bfqd->peak_rate_samples) ; -+ bfq_reset_rate_computation(bfqd, rq); -+ goto update_last_values; /* will add one sample */ -+ } -+ -+ /* -+ * Device idle for very long: the observation interval lasting -+ * up to this dispatch cannot be a valid observation interval -+ * for computing a new peak rate (similarly to the late- -+ * completion event in bfq_completed_request()). Go to -+ * update_rate_and_reset to have the following three steps -+ * taken: -+ * - close the observation interval at the last (previous) -+ * request dispatch or completion -+ * - compute rate, if possible, for that observation interval -+ * - start a new observation interval with this dispatch -+ */ -+ if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC && -+ bfqd->rq_in_driver == 0) { -+ bfq_log(bfqd, -+"jumping to updating&resetting delta_last %lluus samples %d", -+ (now_ns - bfqd->last_dispatch)>>10, -+ bfqd->peak_rate_samples) ; -+ goto update_rate_and_reset; -+ } -+ -+ /* Update sampling information */ -+ bfqd->peak_rate_samples++; -+ -+ if ((bfqd->rq_in_driver > 0 || -+ now_ns - bfqd->last_completion < BFQ_MIN_TT) -+ && !BFQ_RQ_SEEKY(bfqd, bfqd->last_position, rq)) -+ bfqd->sequential_samples++; -+ -+ bfqd->tot_sectors_dispatched += blk_rq_sectors(rq); -+ -+ /* Reset max observed rq size every 32 dispatches */ -+ if (likely(bfqd->peak_rate_samples % 32)) -+ bfqd->last_rq_max_size = -+ max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size); -+ else -+ bfqd->last_rq_max_size = blk_rq_sectors(rq); -+ -+ bfqd->delta_from_first = now_ns - bfqd->first_dispatch; -+ -+ bfq_log(bfqd, -+ "added samples %u/%u tot_sects %llu delta_first %lluus", -+ bfqd->peak_rate_samples, bfqd->sequential_samples, -+ 
bfqd->tot_sectors_dispatched, -+ bfqd->delta_from_first>>10); -+ -+ /* Target observation interval not yet reached, go on sampling */ -+ if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL) -+ goto update_last_values; -+ -+update_rate_and_reset: -+ bfq_update_rate_reset(bfqd, rq); -+update_last_values: -+ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); -+ if (RQ_BFQQ(rq) == bfqd->in_service_queue) -+ bfqd->in_serv_last_pos = bfqd->last_position; -+ bfqd->last_dispatch = now_ns; -+ -+ bfq_log(bfqd, -+ "delta_first %lluus last_pos %llu peak_rate %llu", -+ (now_ns - bfqd->first_dispatch)>>10, -+ (unsigned long long) bfqd->last_position, -+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); -+ bfq_log(bfqd, -+ "samples at end %d", bfqd->peak_rate_samples); -+} -+ -+/* -+ * Move request from internal lists to the dispatch list of the request queue -+ */ -+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ -+ /* -+ * For consistency, the next instruction should have been executed -+ * after removing the request from the queue and dispatching it. -+ * We execute instead this instruction before bfq_remove_request() -+ * (and hence introduce a temporary inconsistency), for efficiency. -+ * In fact, in a forced_dispatch, this prevents two counters related -+ * to bfqq->dispatched to risk to be uselessly decremented if bfqq -+ * is not in service, and then to be incremented again after -+ * incrementing bfqq->dispatched. -+ */ -+ bfqq->dispatched++; -+ bfq_update_peak_rate(q->elevator->elevator_data, rq); -+ -+ bfq_remove_request(rq); -+ elv_dispatch_sort(q, rq); -+} -+ -+static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ BUG_ON(bfqq != bfqd->in_service_queue); -+ -+ /* -+ * If this bfqq is shared between multiple processes, check -+ * to make sure that those processes are still issuing I/Os -+ * within the mean seek distance. 
If not, it may be time to -+ * break the queues apart again. -+ */ -+ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq)) -+ bfq_mark_bfqq_split_coop(bfqq); -+ -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { -+ if (bfqq->dispatched == 0) -+ /* -+ * Overloading budget_timeout field to store -+ * the time at which the queue remains with no -+ * backlog and no outstanding request; used by -+ * the weight-raising mechanism. -+ */ -+ bfqq->budget_timeout = jiffies; -+ -+ bfq_del_bfqq_busy(bfqd, bfqq, true); -+ } else { -+ bfq_requeue_bfqq(bfqd, bfqq, true); -+ /* -+ * Resort priority tree of potential close cooperators. -+ */ -+ bfq_pos_tree_add_move(bfqd, bfqq); -+ } -+ -+ /* -+ * All in-service entities must have been properly deactivated -+ * or requeued before executing the next function, which -+ * resets all in-service entites as no more in service. -+ */ -+ __bfq_bfqd_reset_in_service(bfqd); -+} -+ -+/** -+ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior. -+ * @bfqd: device data. -+ * @bfqq: queue to update. -+ * @reason: reason for expiration. -+ * -+ * Handle the feedback on @bfqq budget at queue expiration. -+ * See the body for detailed comments. -+ */ -+static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ enum bfqq_expiration reason) -+{ -+ struct request *next_rq; -+ int budget, min_budget; -+ -+ BUG_ON(bfqq != bfqd->in_service_queue); -+ -+ min_budget = bfq_min_budget(bfqd); -+ -+ if (bfqq->wr_coeff == 1) -+ budget = bfqq->max_budget; -+ else /* -+ * Use a constant, low budget for weight-raised queues, -+ * to help achieve a low latency. Keep it slightly higher -+ * than the minimum possible budget, to cause a little -+ * bit fewer expirations. 
-+ */ -+ budget = 2 * min_budget; -+ -+ bfq_log_bfqq(bfqd, bfqq, "last budg %d, budg left %d", -+ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); -+ bfq_log_bfqq(bfqd, bfqq, "last max_budg %d, min budg %d", -+ budget, bfq_min_budget(bfqd)); -+ bfq_log_bfqq(bfqd, bfqq, "sync %d, seeky %d", -+ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); -+ -+ if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) { -+ switch (reason) { -+ /* -+ * Caveat: in all the following cases we trade latency -+ * for throughput. -+ */ -+ case BFQ_BFQQ_TOO_IDLE: -+ /* -+ * This is the only case where we may reduce -+ * the budget: if there is no request of the -+ * process still waiting for completion, then -+ * we assume (tentatively) that the timer has -+ * expired because the batch of requests of -+ * the process could have been served with a -+ * smaller budget. Hence, betting that -+ * process will behave in the same way when it -+ * becomes backlogged again, we reduce its -+ * next budget. As long as we guess right, -+ * this budget cut reduces the latency -+ * experienced by the process. -+ * -+ * However, if there are still outstanding -+ * requests, then the process may have not yet -+ * issued its next request just because it is -+ * still waiting for the completion of some of -+ * the still outstanding ones. So in this -+ * subcase we do not reduce its budget, on the -+ * contrary we increase it to possibly boost -+ * the throughput, as discussed in the -+ * comments to the BUDGET_TIMEOUT case. -+ */ -+ if (bfqq->dispatched > 0) /* still outstanding reqs */ -+ budget = min(budget * 2, bfqd->bfq_max_budget); -+ else { -+ if (budget > 5 * min_budget) -+ budget -= 4 * min_budget; -+ else -+ budget = min_budget; -+ } -+ break; -+ case BFQ_BFQQ_BUDGET_TIMEOUT: -+ /* -+ * We double the budget here because it gives -+ * the chance to boost the throughput if this -+ * is not a seeky process (and has bumped into -+ * this timeout because of, e.g., ZBR). 
-+ */ -+ budget = min(budget * 2, bfqd->bfq_max_budget); -+ break; -+ case BFQ_BFQQ_BUDGET_EXHAUSTED: -+ /* -+ * The process still has backlog, and did not -+ * let either the budget timeout or the disk -+ * idling timeout expire. Hence it is not -+ * seeky, has a short thinktime and may be -+ * happy with a higher budget too. So -+ * definitely increase the budget of this good -+ * candidate to boost the disk throughput. -+ */ -+ budget = min(budget * 4, bfqd->bfq_max_budget); -+ break; -+ case BFQ_BFQQ_NO_MORE_REQUESTS: -+ /* -+ * For queues that expire for this reason, it -+ * is particularly important to keep the -+ * budget close to the actual service they -+ * need. Doing so reduces the timestamp -+ * misalignment problem described in the -+ * comments in the body of -+ * __bfq_activate_entity. In fact, suppose -+ * that a queue systematically expires for -+ * BFQ_BFQQ_NO_MORE_REQUESTS and presents a -+ * new request in time to enjoy timestamp -+ * back-shifting. The larger the budget of the -+ * queue is with respect to the service the -+ * queue actually requests in each service -+ * slot, the more times the queue can be -+ * reactivated with the same virtual finish -+ * time. It follows that, even if this finish -+ * time is pushed to the system virtual time -+ * to reduce the consequent timestamp -+ * misalignment, the queue unjustly enjoys for -+ * many re-activations a lower finish time -+ * than all newly activated queues. -+ * -+ * The service needed by bfqq is measured -+ * quite precisely by bfqq->entity.service. -+ * Since bfqq does not enjoy device idling, -+ * bfqq->entity.service is equal to the number -+ * of sectors that the process associated with -+ * bfqq requested to read/write before waiting -+ * for request completions, or blocking for -+ * other reasons. 
-+ */ -+ budget = max_t(int, bfqq->entity.service, min_budget); -+ break; -+ default: -+ return; -+ } -+ } else if (!bfq_bfqq_sync(bfqq)) -+ /* -+ * Async queues get always the maximum possible -+ * budget, as for them we do not care about latency -+ * (in addition, their ability to dispatch is limited -+ * by the charging factor). -+ */ -+ budget = bfqd->bfq_max_budget; -+ -+ bfqq->max_budget = budget; -+ -+ if (bfqd->budgets_assigned >= bfq_stats_min_budgets && -+ !bfqd->bfq_user_max_budget) -+ bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget); -+ -+ /* -+ * If there is still backlog, then assign a new budget, making -+ * sure that it is large enough for the next request. Since -+ * the finish time of bfqq must be kept in sync with the -+ * budget, be sure to call __bfq_bfqq_expire() *after* this -+ * update. -+ * -+ * If there is no backlog, then no need to update the budget; -+ * it will be updated on the arrival of a new request. -+ */ -+ next_rq = bfqq->next_rq; -+ if (next_rq) { -+ BUG_ON(reason == BFQ_BFQQ_TOO_IDLE || -+ reason == BFQ_BFQQ_NO_MORE_REQUESTS); -+ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, -+ bfq_serv_to_charge(next_rq, bfqq)); -+ BUG_ON(!bfq_bfqq_busy(bfqq)); -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ } -+ -+ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d", -+ next_rq ? blk_rq_sectors(next_rq) : 0, -+ bfqq->entity.budget); -+} -+ -+/* -+ * Return true if the process associated with bfqq is "slow". The slow -+ * flag is used, in addition to the budget timeout, to reduce the -+ * amount of service provided to seeky processes, and thus reduce -+ * their chances to lower the throughput. More details in the comments -+ * on the function bfq_bfqq_expire(). 
-+ * -+ * An important observation is in order: as discussed in the comments -+ * on the function bfq_update_peak_rate(), with devices with internal -+ * queues, it is hard if ever possible to know when and for how long -+ * an I/O request is processed by the device (apart from the trivial -+ * I/O pattern where a new request is dispatched only after the -+ * previous one has been completed). This makes it hard to evaluate -+ * the real rate at which the I/O requests of each bfq_queue are -+ * served. In fact, for an I/O scheduler like BFQ, serving a -+ * bfq_queue means just dispatching its requests during its service -+ * slot (i.e., until the budget of the queue is exhausted, or the -+ * queue remains idle, or, finally, a timeout fires). But, during the -+ * service slot of a bfq_queue, around 100 ms at most, the device may -+ * be even still processing requests of bfq_queues served in previous -+ * service slots. On the opposite end, the requests of the in-service -+ * bfq_queue may be completed after the service slot of the queue -+ * finishes. -+ * -+ * Anyway, unless more sophisticated solutions are used -+ * (where possible), the sum of the sizes of the requests dispatched -+ * during the service slot of a bfq_queue is probably the only -+ * approximation available for the service received by the bfq_queue -+ * during its service slot. And this sum is the quantity used in this -+ * function to evaluate the I/O speed of a process. 
-+ */ -+static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ bool compensate, enum bfqq_expiration reason, -+ unsigned long *delta_ms) -+{ -+ ktime_t delta_ktime; -+ u32 delta_usecs; -+ bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */ -+ -+ if (!bfq_bfqq_sync(bfqq)) -+ return false; -+ -+ if (compensate) -+ delta_ktime = bfqd->last_idling_start; -+ else -+ delta_ktime = ktime_get(); -+ delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start); -+ delta_usecs = ktime_to_us(delta_ktime); -+ -+ /* don't use too short time intervals */ -+ if (delta_usecs < 1000) { -+ if (blk_queue_nonrot(bfqd->queue)) -+ /* -+ * give same worst-case guarantees as idling -+ * for seeky -+ */ -+ *delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC; -+ else /* charge at least one seek */ -+ *delta_ms = bfq_slice_idle / NSEC_PER_MSEC; -+ -+ bfq_log(bfqd, "too short %u", delta_usecs); -+ -+ return slow; -+ } -+ -+ *delta_ms = delta_usecs / USEC_PER_MSEC; -+ -+ /* -+ * Use only long (> 20ms) intervals to filter out excessive -+ * spikes in service rate estimation. -+ */ -+ if (delta_usecs > 20000) { -+ /* -+ * Caveat for rotational devices: processes doing I/O -+ * in the slower disk zones tend to be slow(er) even -+ * if not seeky. In this respect, the estimated peak -+ * rate is likely to be an average over the disk -+ * surface. Accordingly, to not be too harsh with -+ * unlucky processes, a process is deemed slow only if -+ * its rate has been lower than half of the estimated -+ * peak rate. -+ */ -+ slow = bfqq->entity.service < bfqd->bfq_max_budget / 2; -+ bfq_log(bfqd, "relative rate %d/%d", -+ bfqq->entity.service, bfqd->bfq_max_budget); -+ } -+ -+ bfq_log_bfqq(bfqd, bfqq, "slow %d", slow); -+ -+ return slow; -+} -+ -+/* -+ * To be deemed as soft real-time, an application must meet two -+ * requirements. 
First, the application must not require an average -+ * bandwidth higher than the approximate bandwidth required to playback or -+ * record a compressed high-definition video. -+ * The next function is invoked on the completion of the last request of a -+ * batch, to compute the next-start time instant, soft_rt_next_start, such -+ * that, if the next request of the application does not arrive before -+ * soft_rt_next_start, then the above requirement on the bandwidth is met. -+ * -+ * The second requirement is that the request pattern of the application is -+ * isochronous, i.e., that, after issuing a request or a batch of requests, -+ * the application stops issuing new requests until all its pending requests -+ * have been completed. After that, the application may issue a new batch, -+ * and so on. -+ * For this reason the next function is invoked to compute -+ * soft_rt_next_start only for applications that meet this requirement, -+ * whereas soft_rt_next_start is set to infinity for applications that do -+ * not. -+ * -+ * Unfortunately, even a greedy (i.e., I/O-bound) application may -+ * happen to meet, occasionally or systematically, both the above -+ * bandwidth and isochrony requirements. This may happen at least in -+ * the following circumstances. First, if the CPU load is high. The -+ * application may stop issuing requests while the CPUs are busy -+ * serving other processes, then restart, then stop again for a while, -+ * and so on. The other circumstances are related to the storage -+ * device: the storage device is highly loaded or reaches a low-enough -+ * throughput with the I/O of the application (e.g., because the I/O -+ * is random and/or the device is slow). In all these cases, the -+ * I/O of the application may be simply slowed down enough to meet -+ * the bandwidth and isochrony requirements. 
To reduce the probability -+ * that greedy applications are deemed as soft real-time in these -+ * corner cases, a further rule is used in the computation of -+ * soft_rt_next_start: the return value of this function is forced to -+ * be higher than the maximum between the following two quantities. -+ * -+ * (a) Current time plus: (1) the maximum time for which the arrival -+ * of a request is waited for when a sync queue becomes idle, -+ * namely bfqd->bfq_slice_idle, and (2) a few extra jiffies. We -+ * postpone for a moment the reason for adding a few extra -+ * jiffies; we get back to it after next item (b). Lower-bounding -+ * the return value of this function with the current time plus -+ * bfqd->bfq_slice_idle tends to filter out greedy applications, -+ * because the latter issue their next request as soon as possible -+ * after the last one has been completed. In contrast, a soft -+ * real-time application spends some time processing data, after a -+ * batch of its requests has been completed. -+ * -+ * (b) Current value of bfqq->soft_rt_next_start. As pointed out -+ * above, greedy applications may happen to meet both the -+ * bandwidth and isochrony requirements under heavy CPU or -+ * storage-device load. In more detail, in these scenarios, these -+ * applications happen, only for limited time periods, to do I/O -+ * slowly enough to meet all the requirements described so far, -+ * including the filtering in above item (a). These slow-speed -+ * time intervals are usually interspersed between other time -+ * intervals during which these applications do I/O at a very high -+ * speed. Fortunately, exactly because of the high speed of the -+ * I/O in the high-speed intervals, the values returned by this -+ * function happen to be so high, near the end of any such -+ * high-speed interval, to be likely to fall *after* the end of -+ * the low-speed time interval that follows. 
These high values are -+ * stored in bfqq->soft_rt_next_start after each invocation of -+ * this function. As a consequence, if the last value of -+ * bfqq->soft_rt_next_start is constantly used to lower-bound the -+ * next value that this function may return, then, from the very -+ * beginning of a low-speed interval, bfqq->soft_rt_next_start is -+ * likely to be constantly kept so high that any I/O request -+ * issued during the low-speed interval is considered as arriving -+ * to soon for the application to be deemed as soft -+ * real-time. Then, in the high-speed interval that follows, the -+ * application will not be deemed as soft real-time, just because -+ * it will do I/O at a high speed. And so on. -+ * -+ * Getting back to the filtering in item (a), in the following two -+ * cases this filtering might be easily passed by a greedy -+ * application, if the reference quantity was just -+ * bfqd->bfq_slice_idle: -+ * 1) HZ is so low that the duration of a jiffy is comparable to or -+ * higher than bfqd->bfq_slice_idle. This happens, e.g., on slow -+ * devices with HZ=100. The time granularity may be so coarse -+ * that the approximation, in jiffies, of bfqd->bfq_slice_idle -+ * is rather lower than the exact value. -+ * 2) jiffies, instead of increasing at a constant rate, may stop increasing -+ * for a while, then suddenly 'jump' by several units to recover the lost -+ * increments. This seems to happen, e.g., inside virtual machines. -+ * To address this issue, in the filtering in (a) we do not use as a -+ * reference time interval just bfqd->bfq_slice_idle, but -+ * bfqd->bfq_slice_idle plus a few jiffies. In particular, we add the -+ * minimum number of jiffies for which the filter seems to be quite -+ * precise also in embedded systems and KVM/QEMU virtual machines. 
-+ */ -+static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ bfq_log_bfqq(bfqd, bfqq, -+"service_blkg %lu soft_rate %u sects/sec interval %u", -+ bfqq->service_from_backlogged, -+ bfqd->bfq_wr_max_softrt_rate, -+ jiffies_to_msecs(HZ * bfqq->service_from_backlogged / -+ bfqd->bfq_wr_max_softrt_rate)); -+ -+ return max3(bfqq->soft_rt_next_start, -+ bfqq->last_idle_bklogged + -+ HZ * bfqq->service_from_backlogged / -+ bfqd->bfq_wr_max_softrt_rate, -+ jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); -+} -+ -+static bool bfq_bfqq_injectable(struct bfq_queue *bfqq) -+{ -+ return BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 && -+ blk_queue_nonrot(bfqq->bfqd->queue) && -+ bfqq->bfqd->hw_tag; -+} -+ -+/** -+ * bfq_bfqq_expire - expire a queue. -+ * @bfqd: device owning the queue. -+ * @bfqq: the queue to expire. -+ * @compensate: if true, compensate for the time spent idling. -+ * @reason: the reason causing the expiration. -+ * -+ * If the process associated with bfqq does slow I/O (e.g., because it -+ * issues random requests), we charge bfqq with the time it has been -+ * in service instead of the service it has received (see -+ * bfq_bfqq_charge_time for details on how this goal is achieved). As -+ * a consequence, bfqq will typically get higher timestamps upon -+ * reactivation, and hence it will be rescheduled as if it had -+ * received more service than what it has actually received. In the -+ * end, bfqq receives less service in proportion to how slowly its -+ * associated process consumes its budgets (and hence how seriously it -+ * tends to lower the throughput). In addition, this time-charging -+ * strategy guarantees time fairness among slow processes. In -+ * contrast, if the process associated with bfqq is not slow, we -+ * charge bfqq exactly with the service it has received. 
-+ * -+ * Charging time to the first type of queues and the exact service to -+ * the other has the effect of using the WF2Q+ policy to schedule the -+ * former on a timeslice basis, without violating service domain -+ * guarantees among the latter. -+ */ -+static void bfq_bfqq_expire(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ bool compensate, -+ enum bfqq_expiration reason) -+{ -+ bool slow; -+ unsigned long delta = 0; -+ struct bfq_entity *entity = &bfqq->entity; -+ int ref; -+ -+ BUG_ON(bfqq != bfqd->in_service_queue); -+ -+ /* -+ * Check whether the process is slow (see bfq_bfqq_is_slow). -+ */ -+ slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); -+ -+ /* -+ * As above explained, charge slow (typically seeky) and -+ * timed-out queues with the time and not the service -+ * received, to favor sequential workloads. -+ * -+ * Processes doing I/O in the slower disk zones will tend to -+ * be slow(er) even if not seeky. Therefore, since the -+ * estimated peak rate is actually an average over the disk -+ * surface, these processes may timeout just for bad luck. To -+ * avoid punishing them, do not charge time to processes that -+ * succeeded in consuming at least 2/3 of their budget. This -+ * allows BFQ to preserve enough elasticity to still perform -+ * bandwidth, and not time, distribution with little unlucky -+ * or quasi-sequential processes. 
-+ */ -+ if (bfqq->wr_coeff == 1 && -+ (slow || -+ (reason == BFQ_BFQQ_BUDGET_TIMEOUT && -+ bfq_bfqq_budget_left(bfqq) >= entity->budget / 3))) -+ bfq_bfqq_charge_time(bfqd, bfqq, delta); -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ -+ if (reason == BFQ_BFQQ_TOO_IDLE && -+ entity->service <= 2 * entity->budget / 10) -+ bfq_clear_bfqq_IO_bound(bfqq); -+ -+ if (bfqd->low_latency && bfqq->wr_coeff == 1) -+ bfqq->last_wr_start_finish = jiffies; -+ -+ if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 && -+ RB_EMPTY_ROOT(&bfqq->sort_list)) { -+ /* -+ * If we get here, and there are no outstanding -+ * requests, then the request pattern is isochronous -+ * (see the comments on the function -+ * bfq_bfqq_softrt_next_start()). Thus we can compute -+ * soft_rt_next_start. And we do it, unless bfqq is in -+ * interactive weight raising. We do not do it in the -+ * latter subcase, for the following reason. bfqq may -+ * be conveying the I/O needed to load a soft -+ * real-time application. Such an application will -+ * actually exhibit a soft real-time I/O pattern after -+ * it finally starts doing its job. But, if -+ * soft_rt_next_start is computed here for an -+ * interactive bfqq, and bfqq had received a lot of -+ * service before remaining with no outstanding -+ * request (likely to happen on a fast device), then -+ * soft_rt_next_start would be assigned such a high -+ * value that, for a very long time, bfqq would be -+ * prevented from being possibly considered as soft -+ * real time. -+ * -+ * If, instead, the queue still has outstanding -+ * requests, then we have to wait for the completion -+ * of all the outstanding requests to discover whether -+ * the request pattern is actually isochronous. 
-+ */ -+ BUG_ON(bfq_tot_busy_queues(bfqd) < 1); -+ if (bfqq->dispatched == 0 && -+ bfqq->wr_coeff != bfqd->bfq_wr_coeff) { -+ bfqq->soft_rt_next_start = -+ bfq_bfqq_softrt_next_start(bfqd, bfqq); -+ bfq_log_bfqq(bfqd, bfqq, "new soft_rt_next %lu", -+ bfqq->soft_rt_next_start); -+ } else if (bfqq->dispatched > 0) { -+ /* -+ * Schedule an update of soft_rt_next_start to when -+ * the task may be discovered to be isochronous. -+ */ -+ bfq_mark_bfqq_softrt_update(bfqq); -+ } -+ } -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "expire (%s, slow %d, num_disp %d, short %d, weight %d, serv %d/%d)", -+ reason_name[reason], slow, bfqq->dispatched, -+ bfq_bfqq_has_short_ttime(bfqq), entity->weight, -+ entity->service, entity->budget); -+ -+ /* -+ * Increase, decrease or leave budget unchanged according to -+ * reason. -+ */ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); -+ BUG_ON(bfqq->next_rq == NULL && -+ bfqq->entity.budget < bfqq->entity.service); -+ ref = bfqq->ref; -+ __bfq_bfqq_expire(bfqd, bfqq); -+ -+ if (ref == 1) /* bfqq is gone, no more actions on it */ -+ return; -+ -+ BUG_ON(ref > 1 && -+ !bfq_bfqq_busy(bfqq) && reason == BFQ_BFQQ_BUDGET_EXHAUSTED && -+ !bfq_class_idle(bfqq)); -+ -+ bfqq->injected_service = 0; -+ -+ /* mark bfqq as waiting a request only if a bic still points to it */ -+ if (!bfq_bfqq_busy(bfqq) && -+ reason != BFQ_BFQQ_BUDGET_TIMEOUT && -+ reason != BFQ_BFQQ_BUDGET_EXHAUSTED) { -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); -+ BUG_ON(bfqq->next_rq); -+ bfq_mark_bfqq_non_blocking_wait_rq(bfqq); -+ /* -+ * Not setting service to 0, because, if the next rq -+ * arrives in time, the queue will go on receiving -+ * service with this same budget (as if it never expired) -+ */ -+ } else { -+ entity->service = 0; -+ bfq_log_bfqq(bfqd, bfqq, "resetting service"); -+ } -+ -+ /* -+ * Reset the received-service counter for every parent entity. 
-+ * Differently from what happens with bfqq->entity.service, -+ * the resetting of this counter never needs to be postponed -+ * for parent entities. In fact, in case bfqq may have a -+ * chance to go on being served using the last, partially -+ * consumed budget, bfqq->entity.service needs to be kept, -+ * because if bfqq then actually goes on being served using -+ * the same budget, the last value of bfqq->entity.service is -+ * needed to properly decrement bfqq->entity.budget by the -+ * portion already consumed. In contrast, it is not necessary -+ * to keep entity->service for parent entities too, because -+ * the bubble up of the new value of bfqq->entity.budget will -+ * make sure that the budgets of parent entities are correct, -+ * even in case bfqq and thus parent entities go on receiving -+ * service with the same budget. -+ */ -+ entity = entity->parent; -+ for_each_entity(entity) -+ entity->service = 0; -+} -+ -+/* -+ * Budget timeout is not implemented through a dedicated timer, but -+ * just checked on request arrivals and completions, as well as on -+ * idle timer expirations. -+ */ -+static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) -+{ -+ return time_is_before_eq_jiffies(bfqq->budget_timeout); -+} -+ -+/* -+ * If we expire a queue that is actively waiting (i.e., with the -+ * device idled) for the arrival of a new request, then we may incur -+ * the timestamp misalignment problem described in the body of the -+ * function __bfq_activate_entity. Hence we return true only if this -+ * condition does not hold, or if the queue is slow enough to deserve -+ * only to be kicked off for preserving a high throughput. 
-+ */ -+static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) -+{ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "wait_request %d left %d timeout %d", -+ bfq_bfqq_wait_request(bfqq), -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, -+ bfq_bfqq_budget_timeout(bfqq)); -+ -+ return (!bfq_bfqq_wait_request(bfqq) || -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3) -+ && -+ bfq_bfqq_budget_timeout(bfqq); -+} -+ -+static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ bool rot_without_queueing = -+ !blk_queue_nonrot(bfqd->queue) && !bfqd->hw_tag, -+ bfqq_sequential_and_IO_bound, -+ idling_boosts_thr; -+ -+ bfqq_sequential_and_IO_bound = !BFQQ_SEEKY(bfqq) && -+ bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_has_short_ttime(bfqq); -+ /* -+ * The next variable takes into account the cases where idling -+ * boosts the throughput. -+ * -+ * The value of the variable is computed considering, first, that -+ * idling is virtually always beneficial for the throughput if: -+ * (a) the device is not NCQ-capable and rotational, or -+ * (b) regardless of the presence of NCQ, the device is rotational and -+ * the request pattern for bfqq is I/O-bound and sequential, or -+ * (c) regardless of whether it is rotational, the device is -+ * not NCQ-capable and the request pattern for bfqq is -+ * I/O-bound and sequential. -+ * -+ * Secondly, and in contrast to the above item (b), idling an -+ * NCQ-capable flash-based device would not boost the -+ * throughput even with sequential I/O; rather it would lower -+ * the throughput in proportion to how fast the device -+ * is. Accordingly, the next variable is true if any of the -+ * above conditions (a), (b) or (c) is true, and, in -+ * particular, happens to be false if bfqd is an NCQ-capable -+ * flash-based device. 
-+ */ -+ idling_boosts_thr = rot_without_queueing || -+ ((!blk_queue_nonrot(bfqd->queue) || !bfqd->hw_tag) && -+ bfqq_sequential_and_IO_bound); -+ -+ bfq_log_bfqq(bfqd, bfqq, "idling_boosts_thr %d", idling_boosts_thr); -+ -+ /* -+ * The return value of this function is equal to that of -+ * idling_boosts_thr, unless a special case holds. In this -+ * special case, described below, idling may cause problems to -+ * weight-raised queues. -+ * -+ * When the request pool is saturated (e.g., in the presence -+ * of write hogs), if the processes associated with -+ * non-weight-raised queues ask for requests at a lower rate, -+ * then processes associated with weight-raised queues have a -+ * higher probability to get a request from the pool -+ * immediately (or at least soon) when they need one. Thus -+ * they have a higher probability to actually get a fraction -+ * of the device throughput proportional to their high -+ * weight. This is especially true with NCQ-capable drives, -+ * which enqueue several requests in advance, and further -+ * reorder internally-queued requests. -+ * -+ * For this reason, we force to false the return value if -+ * there are weight-raised busy queues. In this case, and if -+ * bfqq is not weight-raised, this guarantees that the device -+ * is not idled for bfqq (if, instead, bfqq is weight-raised, -+ * then idling will be guaranteed by another variable, see -+ * below). Combined with the timestamping rules of BFQ (see -+ * [1] for details), this behavior causes bfqq, and hence any -+ * sync non-weight-raised queue, to get a lower number of -+ * requests served, and thus to ask for a lower number of -+ * requests from the request pool, before the busy -+ * weight-raised queues get served again. This often mitigates -+ * starvation problems in the presence of heavy write -+ * workloads and NCQ, thereby guaranteeing a higher -+ * application and system responsiveness in these hostile -+ * scenarios. 
-+ */ -+ return idling_boosts_thr && -+ bfqd->wr_busy_queues == 0; -+} -+ -+/* -+ * There is a case where idling must be performed not for -+ * throughput concerns, but to preserve service guarantees. -+ * -+ * To introduce this case, we can note that allowing the drive -+ * to enqueue more than one request at a time, and hence -+ * delegating de facto final scheduling decisions to the -+ * drive's internal scheduler, entails loss of control on the -+ * actual request service order. In particular, the critical -+ * situation is when requests from different processes happen -+ * to be present, at the same time, in the internal queue(s) -+ * of the drive. In such a situation, the drive, by deciding -+ * the service order of the internally-queued requests, does -+ * determine also the actual throughput distribution among -+ * these processes. But the drive typically has no notion or -+ * concern about per-process throughput distribution, and -+ * makes its decisions only on a per-request basis. Therefore, -+ * the service distribution enforced by the drive's internal -+ * scheduler is likely to coincide with the desired -+ * device-throughput distribution only in a completely -+ * symmetric scenario where: -+ * (i) each of these processes must get the same throughput as -+ * the others; -+ * (ii) the I/O of each process has the same properties, in -+ * terms of locality (sequential or random), direction -+ * (reads or writes), request sizes, greediness -+ * (from I/O-bound to sporadic), and so on. -+ * In fact, in such a scenario, the drive tends to treat -+ * the requests of each of these processes in about the same -+ * way as the requests of the others, and thus to provide -+ * each of these processes with about the same throughput -+ * (which is exactly the desired throughput distribution). 
In -+ * contrast, in any asymmetric scenario, device idling is -+ * certainly needed to guarantee that bfqq receives its -+ * assigned fraction of the device throughput (see [1] for -+ * details). -+ * The problem is that idling may significantly reduce -+ * throughput with certain combinations of types of I/O and -+ * devices. An important example is sync random I/O, on flash -+ * storage with command queueing. So, unless bfqq falls in the -+ * above cases where idling also boosts throughput, it would -+ * be important to check conditions (i) and (ii) accurately, -+ * so as to avoid idling when not strictly needed for service -+ * guarantees. -+ * -+ * Unfortunately, it is extremely difficult to thoroughly -+ * check condition (ii). And, in case there are active groups, -+ * it becomes very difficult to check condition (i) too. In -+ * fact, if there are active groups, then, for condition (i) -+ * to become false, it is enough that an active group contains -+ * more active processes or sub-groups than some other active -+ * group. More precisely, for condition (i) to become false because of -+ * such a group, it is not even necessary that the group is -+ * (still) active: it is sufficient that, even if the group -+ * has become inactive, some of its descendant processes still -+ * have some request already dispatched but still waiting for -+ * completion. In fact, requests have still to be guaranteed -+ * their share of the throughput even after being -+ * dispatched. In this respect, it is easy to show that, if a -+ * group frequently becomes inactive while still having -+ * in-flight requests, and if, when this happens, the group is -+ * not considered in the calculation of whether the scenario -+ * is asymmetric, then the group may fail to be guaranteed its -+ * fair share of the throughput (basically because idling may -+ * not be performed for the descendant processes of the group, -+ * but it had to be).
We address this issue with the -+ * following bi-modal behavior, implemented in the function -+ * bfq_symmetric_scenario(). -+ * -+ * If there are groups with requests waiting for completion -+ * (as commented above, some of these groups may even be -+ * already inactive), then the scenario is tagged as -+ * asymmetric, conservatively, without checking any of the -+ * conditions (i) and (ii). So the device is idled for bfqq. -+ * This behavior matches also the fact that groups are created -+ * exactly if controlling I/O is a primary concern (to -+ * preserve bandwidth and latency guarantees). -+ * -+ * On the opposite end, if there are no groups with requests -+ * waiting for completion, then only condition (i) is actually -+ * controlled, i.e., provided that condition (i) holds, idling -+ * is not performed, regardless of whether condition (ii) -+ * holds. In other words, only if condition (i) does not hold, -+ * then idling is allowed, and the device tends to be -+ * prevented from queueing many requests, possibly of several -+ * processes. Since there are no groups with requests waiting -+ * for completion, then, to control condition (i) it is enough -+ * to check just whether all the queues with requests waiting -+ * for completion also have the same weight. -+ * -+ * Not checking condition (ii) evidently exposes bfqq to the -+ * risk of getting less throughput than its fair share. -+ * However, for queues with the same weight, a further -+ * mechanism, preemption, mitigates or even eliminates this -+ * problem. And it does so without consequences on overall -+ * throughput. This mechanism and its benefits are explained -+ * in the next three paragraphs. -+ * -+ * Even if a queue, say Q, is expired when it remains idle, Q -+ * can still preempt the new in-service queue if the next -+ * request of Q arrives soon (see the comments on -+ * bfq_bfqq_update_budg_for_activation). 
If all queues and -+ * groups have the same weight, this form of preemption, -+ * combined with the hole-recovery heuristic described in the -+ * comments on function bfq_bfqq_update_budg_for_activation, -+ * is enough to preserve a correct bandwidth distribution in -+ * the mid term, even without idling. In fact, even if not -+ * idling allows the internal queues of the device to contain -+ * many requests, and thus to reorder requests, we can rather -+ * safely assume that the internal scheduler still preserves a -+ * minimum of mid-term fairness. -+ * -+ * More precisely, this preemption-based, idleless approach -+ * provides fairness in terms of IOPS, and not sectors per -+ * second. This can be seen with a simple example. Suppose -+ * that there are two queues with the same weight, but that -+ * the first queue receives requests of 8 sectors, while the -+ * second queue receives requests of 1024 sectors. In -+ * addition, suppose that each of the two queues contains at -+ * most one request at a time, which implies that each queue -+ * always remains idle after it is served. Finally, after -+ * remaining idle, each queue receives very quickly a new -+ * request. It follows that the two queues are served -+ * alternately, preempting each other if needed. This -+ * implies that, although both queues have the same weight, -+ * the queue with large requests receives a service that is -+ * 1024/8 times as high as the service received by the other -+ * queue. -+ * -+ * The motivation for using preemption instead of idling (for -+ * queues with the same weight) is that, by not idling, -+ * service guarantees are preserved (completely or at least in -+ * part) without sacrificing throughput in the least. And, if -+ * there is no active group, then the primary expectation for -+ * this device is probably a high throughput.
-+ * -+ * We are now left only with explaining the additional -+ * compound condition that is checked below for deciding -+ * whether the scenario is asymmetric. To explain this -+ * compound condition, we need to add that the function -+ * bfq_symmetric_scenario checks the weights of only -+ * non-weight-raised queues, for efficiency reasons (see -+ * comments on bfq_weights_tree_add()). Then the fact that -+ * bfqq is weight-raised is checked explicitly here. More -+ * precisely, the compound condition below takes into account -+ * also the fact that, even if bfqq is being weight-raised, -+ * the scenario is still symmetric if all queues with requests -+ * waiting for completion happen to be -+ * weight-raised. Actually, we should be even more precise -+ * here, and differentiate between interactive weight raising -+ * and soft real-time weight raising. -+ * -+ * As a side note, it is worth considering that the above -+ * device-idling countermeasures may however fail in the -+ * following unlucky scenario: if idling is (correctly) -+ * disabled in a time period during which all symmetry -+ * sub-conditions hold, and hence the device is allowed to -+ * enqueue many requests, but at some later point in time some -+ * sub-condition stops holding, then it may become impossible -+ * to let requests be served in the desired order until all -+ * the requests already queued in the device have been served.
-+ */ -+static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ bool asymmetric_scenario = (bfqq->wr_coeff > 1 && -+ bfqd->wr_busy_queues < -+ bfq_tot_busy_queues(bfqd)) || -+ !bfq_symmetric_scenario(bfqd); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "wr_coeff %d wr_busy %d busy %d asymmetric %d", -+ bfqq->wr_coeff, -+ bfqd->wr_busy_queues, -+ bfq_tot_busy_queues(bfqd), -+ asymmetric_scenario); -+ -+ return asymmetric_scenario; -+} -+ -+/* -+ * For a queue that becomes empty, device idling is allowed only if -+ * this function returns true for that queue. As a consequence, since -+ * device idling plays a critical role for both throughput boosting -+ * and service guarantees, the return value of this function plays a -+ * critical role as well. -+ * -+ * In a nutshell, this function returns true only if idling is -+ * beneficial for throughput or, even if detrimental for throughput, -+ * idling is however necessary to preserve service guarantees (low -+ * latency, desired throughput distribution, ...). In particular, on -+ * NCQ-capable devices, this function tries to return false, so as to -+ * help keep the drives' internal queues full, whenever this helps the -+ * device boost the throughput without causing any service-guarantee -+ * issue. -+ * -+ * Most of the issues taken into account to get the return value of -+ * this function are not trivial. We discuss these issues in the two -+ * functions providing the main pieces of information needed by this -+ * function. -+ */ -+static bool bfq_better_to_idle(struct bfq_queue *bfqq) -+{ -+ struct bfq_data *bfqd = bfqq->bfqd; -+ bool idling_boosts_thr_with_no_issue, idling_needed_for_service_guar; -+ -+ if (unlikely(bfqd->strict_guarantees)) -+ return true; -+ -+ /* -+ * Idling is performed only if slice_idle > 0. 
In addition, we -+ * do not idle if -+ * (a) bfqq is async -+ * (b) bfqq is in the idle io prio class: in this case we do -+ * not idle because we want to minimize the bandwidth that -+ * queues in this class can steal to higher-priority queues -+ */ -+ if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) || -+ bfq_class_idle(bfqq)) -+ return false; -+ -+ idling_boosts_thr_with_no_issue = -+ idling_boosts_thr_without_issues(bfqd, bfqq); -+ -+ idling_needed_for_service_guar = -+ idling_needed_for_service_guarantees(bfqd, bfqq); -+ -+ /* -+ * We have now the two components we need to compute the -+ * return value of the function, which is true only if idling -+ * either boosts the throughput (without issues), or is -+ * necessary to preserve service guarantees. -+ */ -+ bfq_log_bfqq(bfqd, bfqq, -+ "wr_busy %d boosts %d IO-bound %d guar %d", -+ bfqd->wr_busy_queues, -+ idling_boosts_thr_with_no_issue, -+ bfq_bfqq_IO_bound(bfqq), -+ idling_needed_for_service_guar); -+ -+ return idling_boosts_thr_with_no_issue || -+ idling_needed_for_service_guar; -+} -+ -+/* -+ * If the in-service queue is empty but the function bfq_better_to_idle -+ * returns true, then: -+ * 1) the queue must remain in service and cannot be expired, and -+ * 2) the device must be idled to wait for the possible arrival of a new -+ * request for the queue. -+ * See the comments on the function bfq_better_to_idle for the reasons -+ * why performing device idling is the best choice to boost the throughput -+ * and preserve service guarantees when bfq_better_to_idle itself -+ * returns true. 
-+ */ -+static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) -+{ -+ return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_better_to_idle(bfqq); -+} -+ -+static struct bfq_queue *bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq; -+ -+ /* -+ * A linear search; but, with a high probability, very few -+ * steps are needed to find a candidate queue, i.e., a queue -+ * with enough budget left for its next request. In fact: -+ * - BFQ dynamically updates the budget of every queue so as -+ * to accommodate the expected backlog of the queue; -+ * - if a queue gets all its requests dispatched as injected -+ * service, then the queue is removed from the active list -+ * (and re-added only if it gets new requests, but with -+ * enough budget for its new backlog). -+ */ -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) -+ if (!RB_EMPTY_ROOT(&bfqq->sort_list) && -+ bfq_serv_to_charge(bfqq->next_rq, bfqq) <= -+ bfq_bfqq_budget_left(bfqq)) { -+ bfq_log_bfqq(bfqd, bfqq, "returned this queue"); -+ return bfqq; -+ } -+ -+ bfq_log(bfqd, "no queue found"); -+ return NULL; -+} -+ -+/* -+ * Select a queue for service. If we have a current queue in service, -+ * check whether to continue servicing it, or retrieve and set a new one. -+ */ -+static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq; -+ struct request *next_rq; -+ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT; -+ -+ bfqq = bfqd->in_service_queue; -+ if (!bfqq) -+ goto new_queue; -+ -+ bfq_log_bfqq(bfqd, bfqq, "already in-service queue"); -+ -+ /* -+ * Do not expire bfqq for budget timeout if bfqq may be about -+ * to enjoy device idling. The reason why, in this case, we -+ * prevent bfqq from expiring is the same as in the comments -+ * on the case where bfq_bfqq_must_idle() returns true, in -+ * bfq_completed_request().
-+ */ -+ if (bfq_may_expire_for_budg_timeout(bfqq) && -+ !bfq_bfqq_must_idle(bfqq)) -+ goto expire; -+ -+check_queue: -+ /* -+ * This loop is rarely executed more than once. Even when it -+ * happens, it is much more convenient to re-execute this loop -+ * than to return NULL and trigger a new dispatch to get a -+ * request served. -+ */ -+ next_rq = bfqq->next_rq; -+ /* -+ * If bfqq has requests queued and it has enough budget left to -+ * serve them, keep the queue, otherwise expire it. -+ */ -+ if (next_rq) { -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ -+ if (bfq_serv_to_charge(next_rq, bfqq) > -+ bfq_bfqq_budget_left(bfqq)) { -+ /* -+ * Expire the queue for budget exhaustion, -+ * which makes sure that the next budget is -+ * enough to serve the next request, even if -+ * it comes from the fifo expired path. -+ */ -+ reason = BFQ_BFQQ_BUDGET_EXHAUSTED; -+ goto expire; -+ } else { -+ /* -+ * The idle timer may be pending because we may -+ * not disable disk idling even when a new request -+ * arrives. -+ */ -+ if (bfq_bfqq_wait_request(bfqq)) { -+ BUG_ON(!hrtimer_active(&bfqd->idle_slice_timer)); -+ /* -+ * If we get here: 1) at least a new request -+ * has arrived but we have not disabled the -+ * timer because the request was too small, -+ * 2) then the block layer has unplugged -+ * the device, causing the dispatch to be -+ * invoked. -+ * -+ * Since the device is unplugged, now the -+ * requests are probably large enough to -+ * provide a reasonable throughput. -+ * So we disable idling. -+ */ -+ bfq_clear_bfqq_wait_request(bfqq); -+ hrtimer_try_to_cancel(&bfqd->idle_slice_timer); -+ bfqg_stats_update_idle_time(bfqq_group(bfqq)); -+ } -+ goto keep_queue; -+ } -+ } -+ -+ /* -+ * No requests pending. However, if the in-service queue is idling -+ * for a new request, or has requests waiting for a completion and -+ * may idle after their completion, then keep it anyway. -+ * -+ * Yet, to boost throughput, inject service from other queues if -+ * possible. 
-+ */ -+ if (hrtimer_active(&bfqd->idle_slice_timer) || -+ (bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) { -+ if (bfq_bfqq_injectable(bfqq) && -+ bfqq->injected_service * bfqq->inject_coeff < -+ bfqq->entity.service * 10) { -+ bfq_log_bfqq(bfqd, bfqq, "looking for queue for injection"); -+ bfqq = bfq_choose_bfqq_for_injection(bfqd); -+ } else { -+ if (BFQQ_SEEKY(bfqq)) -+ bfq_log_bfqq(bfqd, bfqq, -+ "injection saturated %d * %d >= %d * 10", -+ bfqq->injected_service, bfqq->inject_coeff, -+ bfqq->entity.service); -+ bfqq = NULL; -+ } -+ goto keep_queue; -+ } -+ -+ reason = BFQ_BFQQ_NO_MORE_REQUESTS; -+expire: -+ bfq_bfqq_expire(bfqd, bfqq, false, reason); -+new_queue: -+ bfqq = bfq_set_in_service_queue(bfqd); -+ if (bfqq) { -+ bfq_log_bfqq(bfqd, bfqq, "checking new queue"); -+ goto check_queue; -+ } -+keep_queue: -+ if (bfqq) -+ bfq_log_bfqq(bfqd, bfqq, "returned this queue"); -+ else -+ bfq_log(bfqd, "no queue returned"); -+ -+ return bfqq; -+} -+ -+static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */ -+ BUG_ON(bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && -+ time_is_after_jiffies(bfqq->last_wr_start_finish)); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "raising period dur %u/%u msec, old coeff %u, w %d(%d)", -+ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), -+ jiffies_to_msecs(bfqq->wr_cur_max_time), -+ bfqq->wr_coeff, -+ bfqq->entity.weight, bfqq->entity.orig_weight); -+ -+ BUG_ON(bfqq != bfqd->in_service_queue && entity->weight != -+ entity->orig_weight * bfqq->wr_coeff); -+ if (entity->prio_changed) -+ bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change"); -+ -+ /* -+ * If the queue was activated in a burst, or too much -+ * time has elapsed from the beginning of this -+ * weight-raising period, then end weight raising. 
-+ */ -+ if (bfq_bfqq_in_large_burst(bfqq)) -+ bfq_bfqq_end_wr(bfqq); -+ else if (time_is_before_jiffies(bfqq->last_wr_start_finish + -+ bfqq->wr_cur_max_time)) { -+ if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time || -+ time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt + -+ bfq_wr_duration(bfqd))) -+ bfq_bfqq_end_wr(bfqq); -+ else { -+ switch_back_to_interactive_wr(bfqq, bfqd); -+ BUG_ON(time_is_after_jiffies( -+ bfqq->last_wr_start_finish)); -+ bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqd, bfqq, -+ "back to interactive wr"); -+ } -+ } -+ if (bfqq->wr_coeff > 1 && -+ bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time && -+ bfqq->service_from_wr > max_service_from_wr) { -+ /* see comments on max_service_from_wr */ -+ bfq_bfqq_end_wr(bfqq); -+ bfq_log_bfqq(bfqd, bfqq, -+ "too much service"); -+ } -+ } -+ /* -+ * To improve latency (for this or other queues), immediately -+ * update weight both if it must be raised and if it must be -+ * lowered. Since, entity may be on some active tree here, and -+ * might have a pending change of its ioprio class, invoke -+ * next function with the last parameter unset (see the -+ * comments on the function). -+ */ -+ if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1)) -+ __bfq_entity_update_weight_prio(bfq_entity_service_tree(entity), -+ entity, false); -+} -+ -+/* -+ * Dispatch one request from bfqq, moving it to the request queue -+ * dispatch list. 
-+ */ -+static int bfq_dispatch_request(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ int dispatched = 0; -+ struct request *rq = bfqq->next_rq; -+ unsigned long service_to_charge; -+ -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ BUG_ON(!rq); -+ service_to_charge = bfq_serv_to_charge(rq, bfqq); -+ -+ BUG_ON(service_to_charge > bfq_bfqq_budget_left(bfqq)); -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ -+ bfq_bfqq_served(bfqq, service_to_charge); -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ -+ bfq_dispatch_insert(bfqd->queue, rq); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "dispatched %u sec req (%llu), budg left %d, new disp_nr %d", -+ blk_rq_sectors(rq), -+ (unsigned long long) blk_rq_pos(rq), -+ bfq_bfqq_budget_left(bfqq), -+ bfqq->dispatched); -+ -+ dispatched++; -+ -+ if (bfqq != bfqd->in_service_queue) { -+ if (likely(bfqd->in_service_queue)) { -+ bfqd->in_service_queue->injected_service += -+ bfq_serv_to_charge(rq, bfqq); -+ bfq_log_bfqq(bfqd, bfqd->in_service_queue, -+ "injected_service increased to %d", -+ bfqd->in_service_queue->injected_service); -+ } -+ return dispatched; -+ } -+ -+ /* -+ * If weight raising has to terminate for bfqq, then next -+ * function causes an immediate update of bfqq's weight, -+ * without waiting for next activation. As a consequence, on -+ * expiration, bfqq will be timestamped as if has never been -+ * weight-raised during this service slot, even if it has -+ * received part or even most of the service as a -+ * weight-raised queue. This inflates bfqq's timestamps, which -+ * is beneficial, as bfqq is then more willing to leave the -+ * device immediately to possible other weight-raised queues. 
-+ */ -+ bfq_update_wr_data(bfqd, bfqq); -+ -+ if (!bfqd->in_service_bic) { -+ atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); -+ bfqd->in_service_bic = RQ_BIC(rq); -+ BUG_ON(!bfqd->in_service_bic); -+ } -+ -+ if (bfq_tot_busy_queues(bfqd) > 1 && bfq_class_idle(bfqq)) -+ goto expire; -+ -+ return dispatched; -+ -+expire: -+ bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_BUDGET_EXHAUSTED); -+ return dispatched; -+} -+ -+static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq) -+{ -+ int dispatched = 0; -+ -+ while (bfqq->next_rq) { -+ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq); -+ dispatched++; -+ } -+ -+ BUG_ON(!list_empty(&bfqq->fifo)); -+ return dispatched; -+} -+ -+/* -+ * Drain our current requests. -+ * Used for barriers and when switching io schedulers on-the-fly. -+ */ -+static int bfq_forced_dispatch(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq, *n; -+ struct bfq_service_tree *st; -+ int dispatched = 0; -+ -+ bfqq = bfqd->in_service_queue; -+ if (bfqq) -+ __bfq_bfqq_expire(bfqd, bfqq); -+ -+ /* -+ * Loop through classes, and be careful to leave the scheduler -+ * in a consistent state, as feedback mechanisms and vtime -+ * updates cannot be disabled during the process. -+ */ -+ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) { -+ st = bfq_entity_service_tree(&bfqq->entity); -+ -+ dispatched += __bfq_forced_dispatch_bfqq(bfqq); -+ -+ bfqq->max_budget = bfq_max_budget(bfqd); -+ bfq_forget_idle(st); -+ } -+ -+ BUG_ON(bfq_tot_busy_queues(bfqd) != 0); -+ -+ return dispatched; -+} -+ -+static int bfq_dispatch_requests(struct request_queue *q, int force) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct bfq_queue *bfqq; -+ -+ bfq_log(bfqd, "%d busy queues", bfq_tot_busy_queues(bfqd)); -+ -+ if (bfq_tot_busy_queues(bfqd) == 0) -+ return 0; -+ -+ if (unlikely(force)) -+ return bfq_forced_dispatch(bfqd); -+ -+ /* -+ * Force device to serve one request at a time if -+ * strict_guarantees is true. 
Forcing this service scheme is -+ * currently the ONLY way to guarantee that the request -+ * service order enforced by the scheduler is respected by a -+ * queueing device. Otherwise the device is free even to make -+ * some unlucky request wait for as long as the device -+ * wishes. -+ * -+ * Of course, serving one request at at time may cause loss of -+ * throughput. -+ */ -+ if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0) -+ return 0; -+ -+ bfqq = bfq_select_queue(bfqd); -+ if (!bfqq) -+ return 0; -+ -+ BUG_ON(bfqq == bfqd->in_service_queue && -+ bfqq->entity.budget < bfqq->entity.service); -+ -+ BUG_ON(bfqq == bfqd->in_service_queue && -+ bfq_bfqq_wait_request(bfqq)); -+ -+ if (!bfq_dispatch_request(bfqd, bfqq)) -+ return 0; -+ -+ bfq_log_bfqq(bfqd, bfqq, "%s request", -+ bfq_bfqq_sync(bfqq) ? "sync" : "async"); -+ -+ BUG_ON(bfqq->next_rq == NULL && -+ bfqq->entity.budget < bfqq->entity.service); -+ return 1; -+} -+ -+/* -+ * Task holds one reference to the queue, dropped when task exits. Each rq -+ * in-flight on this queue also holds a reference, dropped when rq is freed. -+ * -+ * Queue lock must be held here. Recall not to use bfqq after calling -+ * this function on it. -+ */ -+static void bfq_put_queue(struct bfq_queue *bfqq) -+{ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ struct bfq_group *bfqg = bfqq_group(bfqq); -+#endif -+ -+ BUG_ON(bfqq->ref <= 0); -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "%p %d", bfqq, bfqq->ref); -+ bfqq->ref--; -+ if (bfqq->ref) -+ return; -+ -+ BUG_ON(rb_first(&bfqq->sort_list)); -+ BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); -+ BUG_ON(bfqq->entity.tree); -+ BUG_ON(bfq_bfqq_busy(bfqq)); -+ -+ if (!hlist_unhashed(&bfqq->burst_list_node)) { -+ hlist_del_init(&bfqq->burst_list_node); -+ /* -+ * Decrement also burst size after the removal, if the -+ * process associated with bfqq is exiting, and thus -+ * does not contribute to the burst any longer. 
This -+ * decrement helps filter out false positives of large -+ * bursts, when some short-lived process (often due to -+ * the execution of commands by some service) happens -+ * to start and exit while a complex application is -+ * starting, and thus spawning several processes that -+ * do I/O (and that *must not* be treated as a large -+ * burst, see comments on bfq_handle_burst). -+ * -+ * In particular, the decrement is performed only if: -+ * 1) bfqq is not a merged queue, because, if it is, -+ * then this free of bfqq is not triggered by the exit -+ * of the process bfqq is associated with, but exactly -+ * by the fact that bfqq has just been merged. -+ * 2) burst_size is greater than 0, to handle -+ * unbalanced decrements. Unbalanced decrements may -+ * happen in te following case: bfqq is inserted into -+ * the current burst list--without incrementing -+ * bust_size--because of a split, but the current -+ * burst list is not the burst list bfqq belonged to -+ * (see comments on the case of a split in -+ * bfq_set_request). -+ */ -+ if (bfqq->bic && bfqq->bfqd->burst_size > 0) -+ bfqq->bfqd->burst_size--; -+ } -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "%p freed", bfqq); -+ -+ kmem_cache_free(bfq_pool, bfqq); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ bfqg_put(bfqg); -+#endif -+} -+ -+static void bfq_put_cooperator(struct bfq_queue *bfqq) -+{ -+ struct bfq_queue *__bfqq, *next; -+ -+ /* -+ * If this queue was scheduled to merge with another queue, be -+ * sure to drop the reference taken on that queue (and others in -+ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs. 
-+ */ -+ __bfqq = bfqq->new_bfqq; -+ while (__bfqq) { -+ if (__bfqq == bfqq) -+ break; -+ next = __bfqq->new_bfqq; -+ bfq_put_queue(__bfqq); -+ __bfqq = next; -+ } -+} -+ -+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ if (bfqq == bfqd->in_service_queue) { -+ __bfq_bfqq_expire(bfqd, bfqq); -+ bfq_schedule_dispatch(bfqd); -+ } -+ -+ bfq_log_bfqq(bfqd, bfqq, "%p, %d", bfqq, bfqq->ref); -+ -+ bfq_put_cooperator(bfqq); -+ -+ bfq_put_queue(bfqq); /* release process reference */ -+} -+ -+static void bfq_init_icq(struct io_cq *icq) -+{ -+ icq_to_bic(icq)->ttime.last_end_request = ktime_get_ns() - (1ULL<<32); -+} -+ -+static void bfq_exit_icq(struct io_cq *icq) -+{ -+ struct bfq_io_cq *bic = icq_to_bic(icq); -+ struct bfq_data *bfqd = bic_to_bfqd(bic); -+ -+ if (bic_to_bfqq(bic, false)) { -+ bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, false)); -+ bic_set_bfqq(bic, NULL, false); -+ } -+ -+ if (bic_to_bfqq(bic, true)) { -+ /* -+ * If the bic is using a shared queue, put the reference -+ * taken on the io_context when the bic started using a -+ * shared bfq_queue. -+ */ -+ if (bfq_bfqq_coop(bic_to_bfqq(bic, true))) -+ put_io_context(icq->ioc); -+ bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, true)); -+ bic_set_bfqq(bic, NULL, true); -+ } -+} -+ -+/* -+ * Update the entity prio values; note that the new values will not -+ * be used until the next (re)activation. -+ */ -+static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, -+ struct bfq_io_cq *bic) -+{ -+ struct task_struct *tsk = current; -+ int ioprio_class; -+ -+ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); -+ switch (ioprio_class) { -+ default: -+ dev_err(bfqq->bfqd->queue->backing_dev_info->dev, -+ "bfq: bad prio class %d\n", ioprio_class); -+ case IOPRIO_CLASS_NONE: -+ /* -+ * No prio set, inherit CPU scheduling settings. 
-+ */ -+ bfqq->new_ioprio = task_nice_ioprio(tsk); -+ bfqq->new_ioprio_class = task_nice_ioclass(tsk); -+ break; -+ case IOPRIO_CLASS_RT: -+ bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); -+ bfqq->new_ioprio_class = IOPRIO_CLASS_RT; -+ break; -+ case IOPRIO_CLASS_BE: -+ bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); -+ bfqq->new_ioprio_class = IOPRIO_CLASS_BE; -+ break; -+ case IOPRIO_CLASS_IDLE: -+ bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE; -+ bfqq->new_ioprio = 7; -+ break; -+ } -+ -+ if (bfqq->new_ioprio >= IOPRIO_BE_NR) { -+ pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n", -+ bfqq->new_ioprio); -+ BUG(); -+ } -+ -+ bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); -+ bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "bic_class %d prio %d class %d", -+ ioprio_class, bfqq->new_ioprio, bfqq->new_ioprio_class); -+} -+ -+static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) -+{ -+ struct bfq_data *bfqd = bic_to_bfqd(bic); -+ struct bfq_queue *bfqq; -+ unsigned long uninitialized_var(flags); -+ int ioprio = bic->icq.ioc->ioprio; -+ -+ /* -+ * This condition may trigger on a newly created bic, be sure to -+ * drop the lock before returning. 
-+ */ -+ if (unlikely(!bfqd) || likely(bic->ioprio == ioprio)) -+ return; -+ -+ bic->ioprio = ioprio; -+ -+ bfqq = bic_to_bfqq(bic, false); -+ if (bfqq) { -+ /* release process reference on this queue */ -+ bfq_put_queue(bfqq); -+ bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); -+ bic_set_bfqq(bic, bfqq, false); -+ bfq_log_bfqq(bfqd, bfqq, -+ "bfqq %p %d", -+ bfqq, bfqq->ref); -+ } -+ -+ bfqq = bic_to_bfqq(bic, true); -+ if (bfqq) -+ bfq_set_next_ioprio_data(bfqq, bic); -+} -+ -+static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct bfq_io_cq *bic, pid_t pid, int is_sync) -+{ -+ RB_CLEAR_NODE(&bfqq->entity.rb_node); -+ INIT_LIST_HEAD(&bfqq->fifo); -+ INIT_HLIST_NODE(&bfqq->burst_list_node); -+ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); -+ -+ bfqq->ref = 0; -+ bfqq->bfqd = bfqd; -+ -+ if (bic) -+ bfq_set_next_ioprio_data(bfqq, bic); -+ -+ if (is_sync) { -+ /* -+ * No need to mark as has_short_ttime if in -+ * idle_class, because no device idling is performed -+ * for queues in idle class -+ */ -+ if (!bfq_class_idle(bfqq)) -+ /* tentatively mark as has_short_ttime */ -+ bfq_mark_bfqq_has_short_ttime(bfqq); -+ bfq_mark_bfqq_sync(bfqq); -+ bfq_mark_bfqq_just_created(bfqq); -+ /* -+ * Aggressively inject a lot of service: up to 90%. -+ * This coefficient remains constant during bfqq life, -+ * but this behavior might be changed, after enough -+ * testing and tuning. 
-+ */ -+ bfqq->inject_coeff = 1; -+ } else -+ bfq_clear_bfqq_sync(bfqq); -+ bfq_mark_bfqq_IO_bound(bfqq); -+ -+ /* Tentative initial value to trade off between thr and lat */ -+ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3; -+ bfqq->pid = pid; -+ -+ bfqq->wr_coeff = 1; -+ bfqq->last_wr_start_finish = jiffies; -+ bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now(); -+ bfqq->budget_timeout = bfq_smallest_from_now(); -+ bfqq->split_time = bfq_smallest_from_now(); -+ -+ /* -+ * To not forget the possibly high bandwidth consumed by a -+ * process/queue in the recent past, -+ * bfq_bfqq_softrt_next_start() returns a value at least equal -+ * to the current value of bfqq->soft_rt_next_start (see -+ * comments on bfq_bfqq_softrt_next_start). Set -+ * soft_rt_next_start to now, to mean that bfqq has consumed -+ * no bandwidth so far. -+ */ -+ bfqq->soft_rt_next_start = jiffies; -+ -+ /* first request is almost certainly seeky */ -+ bfqq->seek_history = 1; -+} -+ -+static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, -+ struct bfq_group *bfqg, -+ int ioprio_class, int ioprio) -+{ -+ switch (ioprio_class) { -+ case IOPRIO_CLASS_RT: -+ return &bfqg->async_bfqq[0][ioprio]; -+ case IOPRIO_CLASS_NONE: -+ ioprio = IOPRIO_NORM; -+ /* fall through */ -+ case IOPRIO_CLASS_BE: -+ return &bfqg->async_bfqq[1][ioprio]; -+ case IOPRIO_CLASS_IDLE: -+ return &bfqg->async_idle_bfqq; -+ default: -+ BUG(); -+ } -+} -+ -+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, -+ struct bio *bio, bool is_sync, -+ struct bfq_io_cq *bic) -+{ -+ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); -+ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); -+ struct bfq_queue **async_bfqq = NULL; -+ struct bfq_queue *bfqq; -+ struct bfq_group *bfqg; -+ -+ rcu_read_lock(); -+ -+ bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio)); -+ if (!bfqg) { -+ bfqq = &bfqd->oom_bfqq; -+ goto out; -+ } -+ -+ if (!is_sync) { -+ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, 
ioprio_class, -+ ioprio); -+ bfqq = *async_bfqq; -+ if (bfqq) -+ goto out; -+ } -+ -+ bfqq = kmem_cache_alloc_node(bfq_pool, -+ GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN, -+ bfqd->queue->node); -+ -+ if (bfqq) { -+ bfq_init_bfqq(bfqd, bfqq, bic, current->pid, -+ is_sync); -+ bfq_init_entity(&bfqq->entity, bfqg); -+ bfq_log_bfqq(bfqd, bfqq, "allocated"); -+ } else { -+ bfqq = &bfqd->oom_bfqq; -+ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); -+ goto out; -+ } -+ -+ /* -+ * Pin the queue now that it's allocated, scheduler exit will -+ * prune it. -+ */ -+ if (async_bfqq) { -+ bfqq->ref++; /* -+ * Extra group reference, w.r.t. sync -+ * queue. This extra reference is removed -+ * only if bfqq->bfqg disappears, to -+ * guarantee that this queue is not freed -+ * until its group goes away. -+ */ -+ bfq_log_bfqq(bfqd, bfqq, "bfqq not in async: %p, %d", -+ bfqq, bfqq->ref); -+ *async_bfqq = bfqq; -+ } -+ -+out: -+ bfqq->ref++; /* get a process reference to this queue */ -+ bfq_log_bfqq(bfqd, bfqq, "at end: %p, %d", bfqq, bfqq->ref); -+ rcu_read_unlock(); -+ return bfqq; -+} -+ -+static void bfq_update_io_thinktime(struct bfq_data *bfqd, -+ struct bfq_io_cq *bic) -+{ -+ struct bfq_ttime *ttime = &bic->ttime; -+ u64 elapsed = ktime_get_ns() - bic->ttime.last_end_request; -+ -+ elapsed = min_t(u64, elapsed, 2 * bfqd->bfq_slice_idle); -+ -+ ttime->ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; -+ ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8); -+ ttime->ttime_mean = div64_ul(ttime->ttime_total + 128, -+ ttime->ttime_samples); -+} -+ -+static void -+bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct request *rq) -+{ -+ bfqq->seek_history <<= 1; -+ bfqq->seek_history |= BFQ_RQ_SEEKY(bfqd, bfqq->last_request_pos, rq); -+} -+ -+static void bfq_update_has_short_ttime(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ struct bfq_io_cq *bic) -+{ -+ bool has_short_ttime = true; -+ -+ /* -+ * No need to update has_short_ttime if 
bfqq is async or in -+ * idle io prio class, or if bfq_slice_idle is zero, because -+ * no device idling is performed for bfqq in this case. -+ */ -+ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq) || -+ bfqd->bfq_slice_idle == 0) -+ return; -+ -+ /* Idle window just restored, statistics are meaningless. */ -+ if (time_is_after_eq_jiffies(bfqq->split_time + -+ bfqd->bfq_wr_min_idle_time)) -+ return; -+ -+ /* Think time is infinite if no process is linked to -+ * bfqq. Otherwise check average think time to -+ * decide whether to mark as has_short_ttime -+ */ -+ if (atomic_read(&bic->icq.ioc->active_ref) == 0 || -+ (bfq_sample_valid(bic->ttime.ttime_samples) && -+ bic->ttime.ttime_mean > bfqd->bfq_slice_idle)) -+ has_short_ttime = false; -+ -+ bfq_log_bfqq(bfqd, bfqq, "has_short_ttime %d", -+ has_short_ttime); -+ -+ if (has_short_ttime) -+ bfq_mark_bfqq_has_short_ttime(bfqq); -+ else -+ bfq_clear_bfqq_has_short_ttime(bfqq); -+} -+ -+/* -+ * Called when a new fs request (rq) is added to bfqq. Check if there's -+ * something we should do about it. -+ */ -+static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct request *rq) -+{ -+ struct bfq_io_cq *bic = RQ_BIC(rq); -+ -+ if (rq->cmd_flags & REQ_META) -+ bfqq->meta_pending++; -+ -+ bfq_update_io_thinktime(bfqd, bic); -+ bfq_update_has_short_ttime(bfqd, bfqq, bic); -+ bfq_update_io_seektime(bfqd, bfqq, rq); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "has_short_ttime=%d (seeky %d)", -+ bfq_bfqq_has_short_ttime(bfqq), BFQQ_SEEKY(bfqq)); -+ -+ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); -+ -+ if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) { -+ bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 && -+ blk_rq_sectors(rq) < 32; -+ bool budget_timeout = bfq_bfqq_budget_timeout(bfqq); -+ -+ /* -+ * There is just this request queued: if -+ * - the request is small, and -+ * - we are idling to boost throughput, and -+ * - the queue is not to be expired, -+ * then just exit. 
-+ * -+ * In this way, if the device is being idled to wait -+ * for a new request from the in-service queue, we -+ * avoid unplugging the device and committing the -+ * device to serve just a small request. In contrast -+ * we wait for the block layer to decide when to -+ * unplug the device: hopefully, new requests will be -+ * merged to this one quickly, then the device will be -+ * unplugged and larger requests will be dispatched. -+ */ -+ if (small_req && idling_boosts_thr_without_issues(bfqd, bfqq) && -+ !budget_timeout) -+ return; -+ -+ /* -+ * A large enough request arrived, or idling is being -+ * performed to preserve service guarantees, or -+ * finally the queue is to be expired: in all these -+ * cases disk idling is to be stopped, so clear -+ * wait_request flag and reset timer. -+ */ -+ bfq_clear_bfqq_wait_request(bfqq); -+ hrtimer_try_to_cancel(&bfqd->idle_slice_timer); -+ bfqg_stats_update_idle_time(bfqq_group(bfqq)); -+ -+ /* -+ * The queue is not empty, because a new request just -+ * arrived. Hence we can safely expire the queue, in -+ * case of budget timeout, without risking that the -+ * timestamps of the queue are not updated correctly. -+ * See [1] for more details. -+ */ -+ if (budget_timeout) -+ bfq_bfqq_expire(bfqd, bfqq, false, -+ BFQ_BFQQ_BUDGET_TIMEOUT); -+ -+ /* -+ * Let the request rip immediately, or let a new queue be -+ * selected if bfqq has just been expired. -+ */ -+ __blk_run_queue(bfqd->queue); -+ } -+} -+ -+static void bfq_insert_request(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; -+ -+ assert_spin_locked(bfqd->queue->queue_lock); -+ -+ /* -+ * An unplug may trigger a requeue of a request from the device -+ * driver: make sure we are in process context while trying to -+ * merge two bfq_queues. 
-+ */ -+ if (!in_interrupt()) { -+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true); -+ if (new_bfqq) { -+ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq) -+ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1); -+ /* -+ * Release the request's reference to the old bfqq -+ * and make sure one is taken to the shared queue. -+ */ -+ new_bfqq->allocated[rq_data_dir(rq)]++; -+ bfqq->allocated[rq_data_dir(rq)]--; -+ new_bfqq->ref++; -+ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) -+ bfq_merge_bfqqs(bfqd, RQ_BIC(rq), -+ bfqq, new_bfqq); -+ -+ bfq_clear_bfqq_just_created(bfqq); -+ /* -+ * rq is about to be enqueued into new_bfqq, -+ * release rq reference on bfqq -+ */ -+ bfq_put_queue(bfqq); -+ rq->elv.priv[1] = new_bfqq; -+ bfqq = new_bfqq; -+ } -+ } -+ -+ bfq_add_request(rq); -+ -+ rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; -+ list_add_tail(&rq->queuelist, &bfqq->fifo); -+ -+ bfq_rq_enqueued(bfqd, bfqq, rq); -+} -+ -+static void bfq_update_hw_tag(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq = bfqd->in_service_queue; -+ -+ bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver, -+ bfqd->rq_in_driver); -+ -+ if (bfqd->hw_tag == 1) -+ return; -+ -+ /* -+ * This sample is valid if the number of outstanding requests -+ * is large enough to allow a queueing behavior. Note that the -+ * sum is not exact, as it's not taking into account deactivated -+ * requests. -+ */ -+ if (bfqd->rq_in_driver + bfqd->queued <= BFQ_HW_QUEUE_THRESHOLD) -+ return; -+ -+ /* -+ * If active queue hasn't enough requests and can idle, bfq might not -+ * dispatch sufficient requests to hardware. 
Don't zero hw_tag in this -+ * case -+ */ -+ if (bfqq && bfq_bfqq_has_short_ttime(bfqq) && -+ bfqq->dispatched + bfqq->queued[0] + bfqq->queued[1] < -+ BFQ_HW_QUEUE_THRESHOLD && bfqd->rq_in_driver < BFQ_HW_QUEUE_THRESHOLD) -+ return; -+ -+ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) -+ return; -+ -+ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD; -+ bfqd->max_rq_in_driver = 0; -+ bfqd->hw_tag_samples = 0; -+} -+ -+static void bfq_completed_request(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ struct bfq_data *bfqd = bfqq->bfqd; -+ u64 now_ns; -+ u32 delta_us; -+ -+ bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left", -+ blk_rq_sectors(rq)); -+ -+ assert_spin_locked(bfqd->queue->queue_lock); -+ bfq_update_hw_tag(bfqd); -+ -+ BUG_ON(!bfqd->rq_in_driver); -+ BUG_ON(!bfqq->dispatched); -+ bfqd->rq_in_driver--; -+ bfqq->dispatched--; -+ bfqg_stats_update_completion(bfqq_group(bfqq), -+ rq->start_time_ns, -+ rq->io_start_time_ns, -+ rq->cmd_flags); -+ -+ if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); -+ /* -+ * Set budget_timeout (which we overload to store the -+ * time at which the queue remains with no backlog and -+ * no outstanding request; used by the weight-raising -+ * mechanism). -+ */ -+ bfqq->budget_timeout = jiffies; -+ -+ bfq_weights_tree_remove(bfqd, bfqq); -+ } -+ -+ now_ns = ktime_get_ns(); -+ -+ RQ_BIC(rq)->ttime.last_end_request = now_ns; -+ -+ /* -+ * Using us instead of ns, to get a reasonable precision in -+ * computing rate in next check. -+ */ -+ delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC); -+ -+ bfq_log(bfqd, "delta %uus/%luus max_size %u rate %llu/%llu", -+ delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size, -+ delta_us > 0 ? 
-+ (USEC_PER_SEC* -+ (u64)((bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us)) -+ >>BFQ_RATE_SHIFT : -+ (USEC_PER_SEC* -+ (u64)(bfqd->last_rq_max_size<<BFQ_RATE_SHIFT))>>BFQ_RATE_SHIFT, -+ (USEC_PER_SEC*(u64)(1UL<<(BFQ_RATE_SHIFT-10)))>>BFQ_RATE_SHIFT); -+ -+ /* -+ * If the request took rather long to complete, and, according -+ * to the maximum request size recorded, this completion latency -+ * implies that the request was certainly served at a very low -+ * rate (less than 1M sectors/sec), then the whole observation -+ * interval that lasts up to this time instant cannot be a -+ * valid time interval for computing a new peak rate. Invoke -+ * bfq_update_rate_reset to have the following three steps -+ * taken: -+ * - close the observation interval at the last (previous) -+ * request dispatch or completion -+ * - compute rate, if possible, for that observation interval -+ * - reset to zero samples, which will trigger a proper -+ * re-initialization of the observation interval on next -+ * dispatch -+ */ -+ if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC && -+ (bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us < -+ 1UL<<(BFQ_RATE_SHIFT - 10)) -+ bfq_update_rate_reset(bfqd, NULL); -+ bfqd->last_completion = now_ns; -+ -+ /* -+ * If we are waiting to discover whether the request pattern -+ * of the task associated with the queue is actually -+ * isochronous, and both requisites for this condition to hold -+ * are now satisfied, then compute soft_rt_next_start (see the -+ * comments on the function bfq_bfqq_softrt_next_start()). We -+ * do not compute soft_rt_next_start if bfqq is in interactive -+ * weight raising (see the comments in bfq_bfqq_expire() for -+ * an explanation). We schedule this delayed update when bfqq -+ * expires, if it still has in-flight requests. 
-+ */ -+ if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 && -+ RB_EMPTY_ROOT(&bfqq->sort_list) && -+ bfqq->wr_coeff != bfqd->bfq_wr_coeff) -+ bfqq->soft_rt_next_start = -+ bfq_bfqq_softrt_next_start(bfqd, bfqq); -+ -+ /* -+ * If this is the in-service queue, check if it needs to be expired, -+ * or if we want to idle in case it has no pending requests. -+ */ -+ if (bfqd->in_service_queue == bfqq) { -+ if (bfq_bfqq_must_idle(bfqq)) { -+ if (bfqq->dispatched == 0) -+ bfq_arm_slice_timer(bfqd); -+ /* -+ * If we get here, we do not expire bfqq, even -+ * if bfqq was in budget timeout or had no -+ * more requests (as controlled in the next -+ * conditional instructions). The reason for -+ * not expiring bfqq is as follows. -+ * -+ * Here bfqq->dispatched > 0 holds, but -+ * bfq_bfqq_must_idle() returned true. This -+ * implies that, even if no request arrives -+ * for bfqq before bfqq->dispatched reaches 0, -+ * bfqq will, however, not be expired on the -+ * completion event that causes bfqq->dispatch -+ * to reach zero. In contrast, on this event, -+ * bfqq will start enjoying device idling -+ * (I/O-dispatch plugging). -+ * -+ * But, if we expired bfqq here, bfqq would -+ * not have the chance to enjoy device idling -+ * when bfqq->dispatched finally reaches -+ * zero. This would expose bfqq to violation -+ * of its reserved service guarantees. 
-+ */ -+ goto out; -+ } else if (bfq_may_expire_for_budg_timeout(bfqq)) -+ bfq_bfqq_expire(bfqd, bfqq, false, -+ BFQ_BFQQ_BUDGET_TIMEOUT); -+ else if (RB_EMPTY_ROOT(&bfqq->sort_list) && -+ (bfqq->dispatched == 0 || -+ !bfq_better_to_idle(bfqq))) -+ bfq_bfqq_expire(bfqd, bfqq, false, -+ BFQ_BFQQ_NO_MORE_REQUESTS); -+ } -+ -+ if (!bfqd->rq_in_driver) -+ bfq_schedule_dispatch(bfqd); -+ -+out: -+ return; -+} -+ -+static int __bfq_may_queue(struct bfq_queue *bfqq) -+{ -+ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) { -+ bfq_clear_bfqq_must_alloc(bfqq); -+ return ELV_MQUEUE_MUST; -+ } -+ -+ return ELV_MQUEUE_MAY; -+} -+ -+static int bfq_may_queue(struct request_queue *q, unsigned int op) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct task_struct *tsk = current; -+ struct bfq_io_cq *bic; -+ struct bfq_queue *bfqq; -+ -+ /* -+ * Don't force setup of a queue from here, as a call to may_queue -+ * does not necessarily imply that a request actually will be -+ * queued. So just lookup a possibly existing queue, or return -+ * 'may queue' if that fails. -+ */ -+ bic = bfq_bic_lookup(bfqd, tsk->io_context); -+ if (!bic) -+ return ELV_MQUEUE_MAY; -+ -+ bfqq = bic_to_bfqq(bic, op_is_sync(op)); -+ if (bfqq) -+ return __bfq_may_queue(bfqq); -+ -+ return ELV_MQUEUE_MAY; -+} -+ -+/* -+ * Queue lock held here. -+ */ -+static void bfq_put_request(struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ -+ if (bfqq) { -+ const int rw = rq_data_dir(rq); -+ -+ BUG_ON(!bfqq->allocated[rw]); -+ bfqq->allocated[rw]--; -+ -+ rq->elv.priv[0] = NULL; -+ rq->elv.priv[1] = NULL; -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "%p, %d", -+ bfqq, bfqq->ref); -+ bfq_put_queue(bfqq); -+ } -+} -+ -+/* -+ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this -+ * was the last process referring to that bfqq. 
-+ */ -+static struct bfq_queue * -+bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) -+{ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); -+ -+ put_io_context(bic->icq.ioc); -+ -+ if (bfqq_process_refs(bfqq) == 1) { -+ bfqq->pid = current->pid; -+ bfq_clear_bfqq_coop(bfqq); -+ bfq_clear_bfqq_split_coop(bfqq); -+ return bfqq; -+ } -+ -+ bic_set_bfqq(bic, NULL, 1); -+ -+ bfq_put_cooperator(bfqq); -+ -+ bfq_put_queue(bfqq); -+ return NULL; -+} -+ -+/* -+ * Allocate bfq data structures associated with this request. -+ */ -+static int bfq_set_request(struct request_queue *q, struct request *rq, -+ struct bio *bio, gfp_t gfp_mask) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq); -+ const int rw = rq_data_dir(rq); -+ const int is_sync = rq_is_sync(rq); -+ struct bfq_queue *bfqq; -+ unsigned long flags; -+ bool bfqq_already_existing = false, split = false; -+ -+ spin_lock_irqsave(q->queue_lock, flags); -+ -+ if (!bic) -+ goto queue_fail; -+ -+ bfq_check_ioprio_change(bic, bio); -+ -+ bfq_bic_update_cgroup(bic, bio); -+ -+new_queue: -+ bfqq = bic_to_bfqq(bic, is_sync); -+ if (!bfqq || bfqq == &bfqd->oom_bfqq) { -+ if (bfqq) -+ bfq_put_queue(bfqq); -+ bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); -+ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); -+ -+ bic_set_bfqq(bic, bfqq, is_sync); -+ if (split && is_sync) { -+ bfq_log_bfqq(bfqd, bfqq, -+ "was_in_list %d " -+ "was_in_large_burst %d " -+ "large burst in progress %d", -+ bic->was_in_burst_list, -+ bic->saved_in_large_burst, -+ bfqd->large_burst); -+ -+ if ((bic->was_in_burst_list && bfqd->large_burst) || -+ bic->saved_in_large_burst) { -+ bfq_log_bfqq(bfqd, bfqq, -+ "marking in " -+ "large burst"); -+ bfq_mark_bfqq_in_large_burst(bfqq); -+ } else { -+ bfq_log_bfqq(bfqd, bfqq, -+ "clearing in " -+ "large burst"); -+ bfq_clear_bfqq_in_large_burst(bfqq); -+ if (bic->was_in_burst_list) -+ /* -+ * If bfqq was in the current -+ * burst list before 
being -+ * merged, then we have to add -+ * it back. And we do not need -+ * to increase burst_size, as -+ * we did not decrement -+ * burst_size when we removed -+ * bfqq from the burst list as -+ * a consequence of a merge -+ * (see comments in -+ * bfq_put_queue). In this -+ * respect, it would be rather -+ * costly to know whether the -+ * current burst list is still -+ * the same burst list from -+ * which bfqq was removed on -+ * the merge. To avoid this -+ * cost, if bfqq was in a -+ * burst list, then we add -+ * bfqq to the current burst -+ * list without any further -+ * check. This can cause -+ * inappropriate insertions, -+ * but rarely enough to not -+ * harm the detection of large -+ * bursts significantly. -+ */ -+ hlist_add_head(&bfqq->burst_list_node, -+ &bfqd->burst_list); -+ } -+ bfqq->split_time = jiffies; -+ } -+ } else { -+ /* If the queue was seeky for too long, break it apart. */ -+ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { -+ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); -+ -+ /* Update bic before losing reference to bfqq */ -+ if (bfq_bfqq_in_large_burst(bfqq)) -+ bic->saved_in_large_burst = true; -+ -+ bfqq = bfq_split_bfqq(bic, bfqq); -+ split = true; -+ if (!bfqq) -+ goto new_queue; -+ else -+ bfqq_already_existing = true; -+ } -+ } -+ -+ bfqq->allocated[rw]++; -+ bfqq->ref++; -+ bfq_log_bfqq(bfqd, bfqq, "bfqq %p, %d", bfqq, bfqq->ref); -+ -+ rq->elv.priv[0] = bic; -+ rq->elv.priv[1] = bfqq; -+ -+ /* -+ * If a bfq_queue has only one process reference, it is owned -+ * by only one bfq_io_cq: we can set the bic field of the -+ * bfq_queue to the address of that structure. Also, if the -+ * queue has just been split, mark a flag so that the -+ * information is available to the other scheduler hooks. 
-+ */ -+ if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { -+ bfqq->bic = bic; -+ if (split) { -+ /* -+ * If the queue has just been split from a shared -+ * queue, restore the idle window and the possible -+ * weight raising period. -+ */ -+ bfq_bfqq_resume_state(bfqq, bfqd, bic, -+ bfqq_already_existing); -+ } -+ } -+ -+ if (unlikely(bfq_bfqq_just_created(bfqq))) -+ bfq_handle_burst(bfqd, bfqq); -+ -+ spin_unlock_irqrestore(q->queue_lock, flags); -+ -+ return 0; -+ -+queue_fail: -+ bfq_schedule_dispatch(bfqd); -+ spin_unlock_irqrestore(q->queue_lock, flags); -+ -+ return 1; -+} -+ -+static void bfq_kick_queue(struct work_struct *work) -+{ -+ struct bfq_data *bfqd = -+ container_of(work, struct bfq_data, unplug_work); -+ struct request_queue *q = bfqd->queue; -+ -+ spin_lock_irq(q->queue_lock); -+ __blk_run_queue(q); -+ spin_unlock_irq(q->queue_lock); -+} -+ -+/* -+ * Handler of the expiration of the timer running if the in-service queue -+ * is idling inside its time slice. -+ */ -+static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) -+{ -+ struct bfq_data *bfqd = container_of(timer, struct bfq_data, -+ idle_slice_timer); -+ struct bfq_queue *bfqq; -+ unsigned long flags; -+ enum bfqq_expiration reason; -+ -+ spin_lock_irqsave(bfqd->queue->queue_lock, flags); -+ -+ bfqq = bfqd->in_service_queue; -+ /* -+ * Theoretical race here: the in-service queue can be NULL or -+ * different from the queue that was idling if the timer handler -+ * spins on the queue_lock and a new request arrives for the -+ * current queue and there is a full dispatch cycle that changes -+ * the in-service queue. This can hardly happen, but in the worst -+ * case we just expire a queue too early. 
-+ */ -+ if (bfqq) { -+ bfq_log_bfqq(bfqd, bfqq, "expired"); -+ bfq_clear_bfqq_wait_request(bfqq); -+ -+ if (bfq_bfqq_budget_timeout(bfqq)) -+ /* -+ * Also here the queue can be safely expired -+ * for budget timeout without wasting -+ * guarantees -+ */ -+ reason = BFQ_BFQQ_BUDGET_TIMEOUT; -+ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) -+ /* -+ * The queue may not be empty upon timer expiration, -+ * because we may not disable the timer when the -+ * first request of the in-service queue arrives -+ * during disk idling. -+ */ -+ reason = BFQ_BFQQ_TOO_IDLE; -+ else -+ goto schedule_dispatch; -+ -+ bfq_bfqq_expire(bfqd, bfqq, true, reason); -+ } -+ -+schedule_dispatch: -+ bfq_schedule_dispatch(bfqd); -+ -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); -+ return HRTIMER_NORESTART; -+} -+ -+static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) -+{ -+ hrtimer_cancel(&bfqd->idle_slice_timer); -+ cancel_work_sync(&bfqd->unplug_work); -+} -+ -+static void __bfq_put_async_bfqq(struct bfq_data *bfqd, -+ struct bfq_queue **bfqq_ptr) -+{ -+ struct bfq_group *root_group = bfqd->root_group; -+ struct bfq_queue *bfqq = *bfqq_ptr; -+ -+ bfq_log(bfqd, "%p", bfqq); -+ if (bfqq) { -+ bfq_bfqq_move(bfqd, bfqq, root_group); -+ bfq_log_bfqq(bfqd, bfqq, "putting %p, %d", -+ bfqq, bfqq->ref); -+ bfq_put_queue(bfqq); -+ *bfqq_ptr = NULL; -+ } -+} -+ -+/* -+ * Release all the bfqg references to its async queues. If we are -+ * deallocating the group these queues may still contain requests, so -+ * we reparent them to the root cgroup (i.e., the only one that will -+ * exist for sure until all the requests on a device are gone). 
-+ */ -+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) -+{ -+ int i, j; -+ -+ for (i = 0; i < 2; i++) -+ for (j = 0; j < IOPRIO_BE_NR; j++) -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); -+ -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); -+} -+ -+static void bfq_exit_queue(struct elevator_queue *e) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ struct request_queue *q = bfqd->queue; -+ struct bfq_queue *bfqq, *n; -+ -+ bfq_shutdown_timer_wq(bfqd); -+ -+ spin_lock_irq(q->queue_lock); -+ -+ BUG_ON(bfqd->in_service_queue); -+ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) -+ bfq_deactivate_bfqq(bfqd, bfqq, false, false); -+ -+ spin_unlock_irq(q->queue_lock); -+ -+ bfq_shutdown_timer_wq(bfqd); -+ -+ BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ /* release oom-queue reference to root group */ -+ bfqg_put(bfqd->root_group); -+ -+ blkcg_deactivate_policy(q, &blkcg_policy_bfq); -+#else -+ bfq_put_async_queues(bfqd, bfqd->root_group); -+ kfree(bfqd->root_group); -+#endif -+ -+ kfree(bfqd); -+} -+ -+static void bfq_init_root_group(struct bfq_group *root_group, -+ struct bfq_data *bfqd) -+{ -+ int i; -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ root_group->entity.parent = NULL; -+ root_group->my_entity = NULL; -+ root_group->bfqd = bfqd; -+#endif -+ root_group->rq_pos_tree = RB_ROOT; -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) -+ root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; -+ root_group->sched_data.bfq_class_idle_last_service = jiffies; -+} -+ -+static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) -+{ -+ struct bfq_data *bfqd; -+ struct elevator_queue *eq; -+ -+ eq = elevator_alloc(q, e); -+ if (!eq) -+ return -ENOMEM; -+ -+ bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node); -+ if (!bfqd) { -+ kobject_put(&eq->kobj); -+ return -ENOMEM; -+ } -+ eq->elevator_data = bfqd; -+ -+ /* -+ * Our fallback bfqq if 
bfq_find_alloc_queue() runs into OOM issues. -+ * Grab a permanent reference to it, so that the normal code flow -+ * will not attempt to free it. -+ */ -+ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); -+ bfqd->oom_bfqq.ref++; -+ bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; -+ bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE; -+ bfqd->oom_bfqq.entity.new_weight = -+ bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio); -+ -+ /* oom_bfqq does not participate to bursts */ -+ bfq_clear_bfqq_just_created(&bfqd->oom_bfqq); -+ /* -+ * Trigger weight initialization, according to ioprio, at the -+ * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio -+ * class won't be changed any more. -+ */ -+ bfqd->oom_bfqq.entity.prio_changed = 1; -+ -+ bfqd->queue = q; -+ -+ spin_lock_irq(q->queue_lock); -+ q->elevator = eq; -+ spin_unlock_irq(q->queue_lock); -+ -+ bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node); -+ if (!bfqd->root_group) -+ goto out_free; -+ bfq_init_root_group(bfqd->root_group, bfqd); -+ bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); -+ -+ hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC, -+ HRTIMER_MODE_REL); -+ bfqd->idle_slice_timer.function = bfq_idle_slice_timer; -+ -+ bfqd->queue_weights_tree = RB_ROOT; -+ bfqd->num_groups_with_pending_reqs = 0; -+ -+ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue); -+ -+ INIT_LIST_HEAD(&bfqd->active_list); -+ INIT_LIST_HEAD(&bfqd->idle_list); -+ INIT_HLIST_HEAD(&bfqd->burst_list); -+ -+ bfqd->hw_tag = -1; -+ -+ bfqd->bfq_max_budget = bfq_default_max_budget; -+ -+ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0]; -+ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1]; -+ bfqd->bfq_back_max = bfq_back_max; -+ bfqd->bfq_back_penalty = bfq_back_penalty; -+ bfqd->bfq_slice_idle = bfq_slice_idle; -+ bfqd->bfq_timeout = bfq_timeout; -+ -+ bfqd->bfq_requests_within_timer = 120; -+ -+ bfqd->bfq_large_burst_thresh = 8; -+ bfqd->bfq_burst_interval = msecs_to_jiffies(180); -+ -+ bfqd->low_latency = 
true; -+ -+ /* -+ * Trade-off between responsiveness and fairness. -+ */ -+ bfqd->bfq_wr_coeff = 30; -+ bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300); -+ bfqd->bfq_wr_max_time = 0; -+ bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000); -+ bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500); -+ bfqd->bfq_wr_max_softrt_rate = 7000; /* -+ * Approximate rate required -+ * to playback or record a -+ * high-definition compressed -+ * video. -+ */ -+ bfqd->wr_busy_queues = 0; -+ -+ /* -+ * Begin by assuming, optimistically, that the device peak -+ * rate is equal to 2/3 of the highest reference rate. -+ */ -+ bfqd->rate_dur_prod = ref_rate[blk_queue_nonrot(bfqd->queue)] * -+ ref_wr_duration[blk_queue_nonrot(bfqd->queue)]; -+ bfqd->peak_rate = ref_rate[blk_queue_nonrot(bfqd->queue)] * 2 / 3; -+ -+ return 0; -+ -+out_free: -+ kfree(bfqd); -+ kobject_put(&eq->kobj); -+ return -ENOMEM; -+} -+ -+static void bfq_registered_queue(struct request_queue *q) -+{ -+ wbt_disable_default(q); -+} -+ -+static void bfq_slab_kill(void) -+{ -+ kmem_cache_destroy(bfq_pool); -+} -+ -+static int __init bfq_slab_setup(void) -+{ -+ bfq_pool = KMEM_CACHE(bfq_queue, 0); -+ if (!bfq_pool) -+ return -ENOMEM; -+ return 0; -+} -+ -+static ssize_t bfq_var_show(unsigned int var, char *page) -+{ -+ return sprintf(page, "%u\n", var); -+} -+ -+static ssize_t bfq_var_store(unsigned long *var, const char *page, -+ size_t count) -+{ -+ unsigned long new_val; -+ int ret = kstrtoul(page, 10, &new_val); -+ -+ if (ret == 0) -+ *var = new_val; -+ -+ return count; -+} -+ -+static ssize_t bfq_wr_max_time_show(struct elevator_queue *e, char *page) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ -+ return sprintf(page, "%d\n", bfqd->bfq_wr_max_time > 0 ? 
-+ jiffies_to_msecs(bfqd->bfq_wr_max_time) : -+ jiffies_to_msecs(bfq_wr_duration(bfqd))); -+} -+ -+static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) -+{ -+ struct bfq_queue *bfqq; -+ struct bfq_data *bfqd = e->elevator_data; -+ ssize_t num_char = 0; -+ -+ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n", -+ bfqd->queued); -+ -+ spin_lock_irq(bfqd->queue->queue_lock); -+ -+ num_char += sprintf(page + num_char, "Active:\n"); -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) { -+ num_char += sprintf(page + num_char, -+ "pid%d: weight %hu, nr_queued %d %d, ", -+ bfqq->pid, -+ bfqq->entity.weight, -+ bfqq->queued[0], -+ bfqq->queued[1]); -+ num_char += sprintf(page + num_char, -+ "dur %d/%u\n", -+ jiffies_to_msecs( -+ jiffies - -+ bfqq->last_wr_start_finish), -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } -+ -+ num_char += sprintf(page + num_char, "Idle:\n"); -+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) { -+ num_char += sprintf(page + num_char, -+ "pid%d: weight %hu, dur %d/%u\n", -+ bfqq->pid, -+ bfqq->entity.weight, -+ jiffies_to_msecs(jiffies - -+ bfqq->last_wr_start_finish), -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } -+ -+ spin_unlock_irq(bfqd->queue->queue_lock); -+ -+ return num_char; -+} -+ -+#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ -+static ssize_t __FUNC(struct elevator_queue *e, char *page) \ -+{ \ -+ struct bfq_data *bfqd = e->elevator_data; \ -+ u64 __data = __VAR; \ -+ if (__CONV == 1) \ -+ __data = jiffies_to_msecs(__data); \ -+ else if (__CONV == 2) \ -+ __data = div_u64(__data, NSEC_PER_MSEC); \ -+ return bfq_var_show(__data, (page)); \ -+} -+SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2); -+SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2); -+SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); -+SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); -+SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2); 
-+SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); -+SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1); -+SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0); -+SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); -+SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0); -+SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1); -+SHOW_FUNCTION(bfq_wr_min_idle_time_show, bfqd->bfq_wr_min_idle_time, 1); -+SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async, -+ 1); -+SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0); -+#undef SHOW_FUNCTION -+ -+#define USEC_SHOW_FUNCTION(__FUNC, __VAR) \ -+static ssize_t __FUNC(struct elevator_queue *e, char *page) \ -+{ \ -+ struct bfq_data *bfqd = e->elevator_data; \ -+ u64 __data = __VAR; \ -+ __data = div_u64(__data, NSEC_PER_USEC); \ -+ return bfq_var_show(__data, (page)); \ -+} -+USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle); -+#undef USEC_SHOW_FUNCTION -+ -+#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ -+static ssize_t \ -+__FUNC(struct elevator_queue *e, const char *page, size_t count) \ -+{ \ -+ struct bfq_data *bfqd = e->elevator_data; \ -+ unsigned long uninitialized_var(__data); \ -+ int ret = bfq_var_store(&__data, (page), count); \ -+ if (__data < (MIN)) \ -+ __data = (MIN); \ -+ else if (__data > (MAX)) \ -+ __data = (MAX); \ -+ if (__CONV == 1) \ -+ *(__PTR) = msecs_to_jiffies(__data); \ -+ else if (__CONV == 2) \ -+ *(__PTR) = (u64)__data * NSEC_PER_MSEC; \ -+ else \ -+ *(__PTR) = __data; \ -+ return ret; \ -+} -+STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, -+ INT_MAX, 2); -+STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, -+ INT_MAX, 2); -+STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); -+STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, -+ INT_MAX, 0); 
-+STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2); -+STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0); -+STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1); -+STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX, -+ 1); -+STORE_FUNCTION(bfq_wr_min_idle_time_store, &bfqd->bfq_wr_min_idle_time, 0, -+ INT_MAX, 1); -+STORE_FUNCTION(bfq_wr_min_inter_arr_async_store, -+ &bfqd->bfq_wr_min_inter_arr_async, 0, INT_MAX, 1); -+STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0, -+ INT_MAX, 0); -+#undef STORE_FUNCTION -+ -+#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ -+static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\ -+{ \ -+ struct bfq_data *bfqd = e->elevator_data; \ -+ unsigned long uninitialized_var(__data); \ -+ int ret = bfq_var_store(&__data, (page), count); \ -+ if (__data < (MIN)) \ -+ __data = (MIN); \ -+ else if (__data > (MAX)) \ -+ __data = (MAX); \ -+ *(__PTR) = (u64)__data * NSEC_PER_USEC; \ -+ return ret; \ -+} -+USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0, -+ UINT_MAX); -+#undef USEC_STORE_FUNCTION -+ -+/* do nothing for the moment */ -+static ssize_t bfq_weights_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ return count; -+} -+ -+static ssize_t bfq_max_budget_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ unsigned long uninitialized_var(__data); -+ int ret = bfq_var_store(&__data, (page), count); -+ -+ if (__data == 0) -+ bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); -+ else { -+ if (__data > INT_MAX) -+ __data = INT_MAX; -+ bfqd->bfq_max_budget = __data; -+ } -+ -+ bfqd->bfq_user_max_budget = __data; -+ -+ return ret; -+} -+ -+/* -+ * Leaving this name to preserve name compatibility with cfq -+ * parameters, but this timeout is used for both sync and async. 
-+ */ -+static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ unsigned long uninitialized_var(__data); -+ int ret = bfq_var_store(&__data, (page), count); -+ -+ if (__data < 1) -+ __data = 1; -+ else if (__data > INT_MAX) -+ __data = INT_MAX; -+ -+ bfqd->bfq_timeout = msecs_to_jiffies(__data); -+ if (bfqd->bfq_user_max_budget == 0) -+ bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); -+ -+ return ret; -+} -+ -+static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ unsigned long uninitialized_var(__data); -+ int ret = bfq_var_store(&__data, (page), count); -+ -+ if (__data > 1) -+ __data = 1; -+ if (!bfqd->strict_guarantees && __data == 1 -+ && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC) -+ bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC; -+ -+ bfqd->strict_guarantees = __data; -+ -+ return ret; -+} -+ -+static ssize_t bfq_low_latency_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ unsigned long uninitialized_var(__data); -+ int ret = bfq_var_store(&__data, (page), count); -+ -+ if (__data > 1) -+ __data = 1; -+ if (__data == 0 && bfqd->low_latency != 0) -+ bfq_end_wr(bfqd); -+ bfqd->low_latency = __data; -+ -+ return ret; -+} -+ -+#define BFQ_ATTR(name) \ -+ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store) -+ -+static struct elv_fs_entry bfq_attrs[] = { -+ BFQ_ATTR(fifo_expire_sync), -+ BFQ_ATTR(fifo_expire_async), -+ BFQ_ATTR(back_seek_max), -+ BFQ_ATTR(back_seek_penalty), -+ BFQ_ATTR(slice_idle), -+ BFQ_ATTR(slice_idle_us), -+ BFQ_ATTR(max_budget), -+ BFQ_ATTR(timeout_sync), -+ BFQ_ATTR(strict_guarantees), -+ BFQ_ATTR(low_latency), -+ BFQ_ATTR(wr_coeff), -+ BFQ_ATTR(wr_max_time), -+ BFQ_ATTR(wr_rt_max_time), -+ BFQ_ATTR(wr_min_idle_time), -+ BFQ_ATTR(wr_min_inter_arr_async), -+ 
BFQ_ATTR(wr_max_softrt_rate), -+ BFQ_ATTR(weights), -+ __ATTR_NULL -+}; -+ -+static struct elevator_type iosched_bfq = { -+ .ops.sq = { -+ .elevator_merge_fn = bfq_merge, -+ .elevator_merged_fn = bfq_merged_request, -+ .elevator_merge_req_fn = bfq_merged_requests, -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ .elevator_bio_merged_fn = bfq_bio_merged, -+#endif -+ .elevator_allow_bio_merge_fn = bfq_allow_bio_merge, -+ .elevator_allow_rq_merge_fn = bfq_allow_rq_merge, -+ .elevator_dispatch_fn = bfq_dispatch_requests, -+ .elevator_add_req_fn = bfq_insert_request, -+ .elevator_activate_req_fn = bfq_activate_request, -+ .elevator_deactivate_req_fn = bfq_deactivate_request, -+ .elevator_completed_req_fn = bfq_completed_request, -+ .elevator_former_req_fn = elv_rb_former_request, -+ .elevator_latter_req_fn = elv_rb_latter_request, -+ .elevator_init_icq_fn = bfq_init_icq, -+ .elevator_exit_icq_fn = bfq_exit_icq, -+ .elevator_set_req_fn = bfq_set_request, -+ .elevator_put_req_fn = bfq_put_request, -+ .elevator_may_queue_fn = bfq_may_queue, -+ .elevator_init_fn = bfq_init_queue, -+ .elevator_exit_fn = bfq_exit_queue, -+ .elevator_registered_fn = bfq_registered_queue, -+ }, -+ .icq_size = sizeof(struct bfq_io_cq), -+ .icq_align = __alignof__(struct bfq_io_cq), -+ .elevator_attrs = bfq_attrs, -+ .elevator_name = "bfq-sq", -+ .elevator_owner = THIS_MODULE, -+}; -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+static struct blkcg_policy blkcg_policy_bfq = { -+ .dfl_cftypes = bfq_blkg_files, -+ .legacy_cftypes = bfq_blkcg_legacy_files, -+ -+ .cpd_alloc_fn = bfq_cpd_alloc, -+ .cpd_init_fn = bfq_cpd_init, -+ .cpd_bind_fn = bfq_cpd_init, -+ .cpd_free_fn = bfq_cpd_free, -+ -+ .pd_alloc_fn = bfq_pd_alloc, -+ .pd_init_fn = bfq_pd_init, -+ .pd_offline_fn = bfq_pd_offline, -+ .pd_free_fn = bfq_pd_free, -+ .pd_reset_stats_fn = bfq_pd_reset_stats, -+}; -+#endif -+ -+static int __init bfq_init(void) -+{ -+ int ret; -+ char msg[60] = "BFQ I/O-scheduler: v9"; -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ ret = 
blkcg_policy_register(&blkcg_policy_bfq); -+ if (ret) -+ return ret; -+#endif -+ -+ ret = -ENOMEM; -+ if (bfq_slab_setup()) -+ goto err_pol_unreg; -+ -+ /* -+ * Times to load large popular applications for the typical -+ * systems installed on the reference devices (see the -+ * comments before the definition of the next -+ * array). Actually, we use slightly lower values, as the -+ * estimated peak rate tends to be smaller than the actual -+ * peak rate. The reason for this last fact is that estimates -+ * are computed over much shorter time intervals than the long -+ * intervals typically used for benchmarking. Why? First, to -+ * adapt more quickly to variations. Second, because an I/O -+ * scheduler cannot rely on a peak-rate-evaluation workload to -+ * be run for a long time. -+ */ -+ ref_wr_duration[0] = msecs_to_jiffies(7000); /* actually 8 sec */ -+ ref_wr_duration[1] = msecs_to_jiffies(2500); /* actually 3 sec */ -+ -+ ret = elv_register(&iosched_bfq); -+ if (ret) -+ goto slab_kill; -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ strcat(msg, " (with cgroups support)"); -+#endif -+ pr_info("%s", msg); -+ -+ return 0; -+ -+slab_kill: -+ bfq_slab_kill(); -+err_pol_unreg: -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ blkcg_policy_unregister(&blkcg_policy_bfq); -+#endif -+ return ret; -+} -+ -+static void __exit bfq_exit(void) -+{ -+ elv_unregister(&iosched_bfq); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ blkcg_policy_unregister(&blkcg_policy_bfq); -+#endif -+ bfq_slab_kill(); -+} -+ -+module_init(bfq_init); -+module_exit(bfq_exit); -+ -+MODULE_AUTHOR("Arianna Avanzini, Fabio Checconi, Paolo Valente"); -+MODULE_LICENSE("GPL"); -diff --git a/block/bfq.h b/block/bfq.h -new file mode 100644 -index 000000000000..0177fc7205d7 ---- /dev/null -+++ b/block/bfq.h -@@ -0,0 +1,1074 @@ -+/* -+ * BFQ v9: data structures and common functions prototypes. 
-+ * -+ * Based on ideas and code from CFQ: -+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> -+ * -+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> -+ * Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org> -+ */ -+ -+#ifndef _BFQ_H -+#define _BFQ_H -+ -+#include <linux/hrtimer.h> -+#include <linux/blk-cgroup.h> -+ -+/* -+ * Define an alternative macro to compile cgroups support. This is one -+ * of the steps needed to let bfq-mq share the files bfq-sched.c and -+ * bfq-cgroup.c with bfq-sq. For bfq-mq, the macro -+ * BFQ_GROUP_IOSCHED_ENABLED will be defined as a function of whether -+ * the configuration option CONFIG_BFQ_MQ_GROUP_IOSCHED, and not -+ * CONFIG_BFQ_GROUP_IOSCHED, is defined. -+ */ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#define BFQ_GROUP_IOSCHED_ENABLED -+#endif -+ -+#define BFQ_IOPRIO_CLASSES 3 -+#define BFQ_CL_IDLE_TIMEOUT (HZ/5) -+ -+#define BFQ_MIN_WEIGHT 1 -+#define BFQ_MAX_WEIGHT 1000 -+#define BFQ_WEIGHT_CONVERSION_COEFF 10 -+ -+#define BFQ_DEFAULT_QUEUE_IOPRIO 4 -+ -+#define BFQ_WEIGHT_LEGACY_DFL 100 -+#define BFQ_DEFAULT_GRP_IOPRIO 0 -+#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE -+ -+/* -+ * Soft real-time applications are extremely more latency sensitive -+ * than interactive ones. Over-raise the weight of the former to -+ * privilege them against the latter. -+ */ -+#define BFQ_SOFTRT_WEIGHT_FACTOR 100 -+ -+struct bfq_entity; -+ -+/** -+ * struct bfq_service_tree - per ioprio_class service tree. -+ * -+ * Each service tree represents a B-WF2Q+ scheduler on its own. Each -+ * ioprio_class has its own independent scheduler, and so its own -+ * bfq_service_tree. All the fields are protected by the queue lock -+ * of the containing bfqd. 
-+ */ -+struct bfq_service_tree { -+ /* tree for active entities (i.e., those backlogged) */ -+ struct rb_root active; -+ /* tree for idle entities (i.e., not backlogged, with V <= F_i)*/ -+ struct rb_root idle; -+ -+ struct bfq_entity *first_idle; /* idle entity with minimum F_i */ -+ struct bfq_entity *last_idle; /* idle entity with maximum F_i */ -+ -+ u64 vtime; /* scheduler virtual time */ -+ /* scheduler weight sum; active and idle entities contribute to it */ -+ unsigned long wsum; -+}; -+ -+/** -+ * struct bfq_sched_data - multi-class scheduler. -+ * -+ * bfq_sched_data is the basic scheduler queue. It supports three -+ * ioprio_classes, and can be used either as a toplevel queue or as an -+ * intermediate queue in a hierarchical setup. -+ * -+ * The supported ioprio_classes are the same as in CFQ, in descending -+ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. -+ * Requests from higher priority queues are served before all the -+ * requests from lower priority queues; among requests of the same -+ * queue requests are served according to B-WF2Q+. -+ * -+ * The schedule is implemented by the service trees, plus the field -+ * @next_in_service, which points to the entity on the active trees -+ * that will be served next, if 1) no changes in the schedule occurs -+ * before the current in-service entity is expired, 2) the in-service -+ * queue becomes idle when it expires, and 3) if the entity pointed by -+ * in_service_entity is not a queue, then the in-service child entity -+ * of the entity pointed by in_service_entity becomes idle on -+ * expiration. This peculiar definition allows for the following -+ * optimization, not yet exploited: while a given entity is still in -+ * service, we already know which is the best candidate for next -+ * service among the other active entitities in the same parent -+ * entity. We can then quickly compare the timestamps of the -+ * in-service entity with those of such best candidate. 
-+ * -+ * All the fields are protected by the queue lock of the containing -+ * bfqd. -+ */ -+struct bfq_sched_data { -+ struct bfq_entity *in_service_entity; /* entity in service */ -+ /* head-of-the-line entity in the scheduler (see comments above) */ -+ struct bfq_entity *next_in_service; -+ /* array of service trees, one per ioprio_class */ -+ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; -+ /* last time CLASS_IDLE was served */ -+ unsigned long bfq_class_idle_last_service; -+ -+}; -+ -+/** -+ * struct bfq_weight_counter - counter of the number of all active queues -+ * with a given weight. -+ */ -+struct bfq_weight_counter { -+ unsigned int weight; /* weight of the queues this counter refers to */ -+ unsigned int num_active; /* nr of active queues with this weight */ -+ /* -+ * Weights tree member (see bfq_data's @queue_weights_tree) -+ */ -+ struct rb_node weights_node; -+}; -+ -+/** -+ * struct bfq_entity - schedulable entity. -+ * -+ * A bfq_entity is used to represent either a bfq_queue (leaf node in the -+ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each -+ * entity belongs to the sched_data of the parent group in the cgroup -+ * hierarchy. Non-leaf entities have also their own sched_data, stored -+ * in @my_sched_data. -+ * -+ * Each entity stores independently its priority values; this would -+ * allow different weights on different devices, but this -+ * functionality is not exported to userspace by now. Priorities and -+ * weights are updated lazily, first storing the new values into the -+ * new_* fields, then setting the @prio_changed flag. As soon as -+ * there is a transition in the entity state that allows the priority -+ * update to take place the effective and the requested priority -+ * values are synchronized. -+ * -+ * Unless cgroups are used, the weight value is calculated from the -+ * ioprio to export the same interface as CFQ. 
When dealing with -+ * ``well-behaved'' queues (i.e., queues that do not spend too much -+ * time to consume their budget and have true sequential behavior, and -+ * when there are no external factors breaking anticipation) the -+ * relative weights at each level of the cgroups hierarchy should be -+ * guaranteed. All the fields are protected by the queue lock of the -+ * containing bfqd. -+ */ -+struct bfq_entity { -+ struct rb_node rb_node; /* service_tree member */ -+ -+ /* -+ * Flag, true if the entity is on a tree (either the active or -+ * the idle one of its service_tree) or is in service. -+ */ -+ bool on_st; -+ -+ u64 finish; /* B-WF2Q+ finish timestamp (aka F_i) */ -+ u64 start; /* B-WF2Q+ start timestamp (aka S_i) */ -+ -+ /* tree the entity is enqueued into; %NULL if not on a tree */ -+ struct rb_root *tree; -+ -+ /* -+ * minimum start time of the (active) subtree rooted at this -+ * entity; used for O(log N) lookups into active trees -+ */ -+ u64 min_start; -+ -+ /* amount of service received during the last service slot */ -+ int service; -+ -+ /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */ -+ int budget; -+ -+ unsigned int weight; /* weight of the queue */ -+ unsigned int new_weight; /* next weight if a change is in progress */ -+ -+ /* original weight, used to implement weight boosting */ -+ unsigned int orig_weight; -+ -+ /* parent entity, for hierarchical scheduling */ -+ struct bfq_entity *parent; -+ -+ /* -+ * For non-leaf nodes in the hierarchy, the associated -+ * scheduler queue, %NULL on leaf nodes. 
-+ */ -+ struct bfq_sched_data *my_sched_data; -+ /* the scheduler queue this entity belongs to */ -+ struct bfq_sched_data *sched_data; -+ -+ /* flag, set to request a weight, ioprio or ioprio_class change */ -+ int prio_changed; -+ -+ /* flag, set if the entity is counted in groups_with_pending_reqs */ -+ bool in_groups_with_pending_reqs; -+}; -+ -+struct bfq_group; -+ -+/** -+ * struct bfq_queue - leaf schedulable entity. -+ * -+ * A bfq_queue is a leaf request queue; it can be associated with an -+ * io_context or more, if it is async or shared between cooperating -+ * processes. @cgroup holds a reference to the cgroup, to be sure that it -+ * does not disappear while a bfqq still references it (mostly to avoid -+ * races between request issuing and task migration followed by cgroup -+ * destruction). -+ * All the fields are protected by the queue lock of the containing bfqd. -+ */ -+struct bfq_queue { -+ /* reference counter */ -+ int ref; -+ /* parent bfq_data */ -+ struct bfq_data *bfqd; -+ -+ /* current ioprio and ioprio class */ -+ unsigned short ioprio, ioprio_class; -+ /* next ioprio and ioprio class if a change is in progress */ -+ unsigned short new_ioprio, new_ioprio_class; -+ -+ /* -+ * Shared bfq_queue if queue is cooperating with one or more -+ * other queues. 
-+ */ -+ struct bfq_queue *new_bfqq; -+ /* request-position tree member (see bfq_group's @rq_pos_tree) */ -+ struct rb_node pos_node; -+ /* request-position tree root (see bfq_group's @rq_pos_tree) */ -+ struct rb_root *pos_root; -+ -+ /* sorted list of pending requests */ -+ struct rb_root sort_list; -+ /* if fifo isn't expired, next request to serve */ -+ struct request *next_rq; -+ /* number of sync and async requests queued */ -+ int queued[2]; -+ /* number of sync and async requests currently allocated */ -+ int allocated[2]; -+ /* number of pending metadata requests */ -+ int meta_pending; -+ /* fifo list of requests in sort_list */ -+ struct list_head fifo; -+ -+ /* entity representing this queue in the scheduler */ -+ struct bfq_entity entity; -+ -+ /* pointer to the weight counter associated with this queue */ -+ struct bfq_weight_counter *weight_counter; -+ -+ /* maximum budget allowed from the feedback mechanism */ -+ int max_budget; -+ /* budget expiration (in jiffies) */ -+ unsigned long budget_timeout; -+ -+ /* number of requests on the dispatch list or inside driver */ -+ int dispatched; -+ -+ unsigned int flags; /* status flags.*/ -+ -+ /* node for active/idle bfqq list inside parent bfqd */ -+ struct list_head bfqq_list; -+ -+ /* bit vector: a 1 for each seeky requests in history */ -+ u32 seek_history; -+ -+ /* node for the device's burst list */ -+ struct hlist_node burst_list_node; -+ -+ /* position of the last request enqueued */ -+ sector_t last_request_pos; -+ -+ /* Number of consecutive pairs of request completion and -+ * arrival, such that the queue becomes idle after the -+ * completion, but the next request arrives within an idle -+ * time slice; used only if the queue's IO_bound flag has been -+ * cleared. 
-+ */ -+ unsigned int requests_within_timer; -+ -+ /* pid of the process owning the queue, used for logging purposes */ -+ pid_t pid; -+ -+ /* -+ * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL -+ * if the queue is shared. -+ */ -+ struct bfq_io_cq *bic; -+ -+ /* current maximum weight-raising time for this queue */ -+ unsigned long wr_cur_max_time; -+ /* -+ * Minimum time instant such that, only if a new request is -+ * enqueued after this time instant in an idle @bfq_queue with -+ * no outstanding requests, then the task associated with the -+ * queue it is deemed as soft real-time (see the comments on -+ * the function bfq_bfqq_softrt_next_start()) -+ */ -+ unsigned long soft_rt_next_start; -+ /* -+ * Start time of the current weight-raising period if -+ * the @bfq-queue is being weight-raised, otherwise -+ * finish time of the last weight-raising period. -+ */ -+ unsigned long last_wr_start_finish; -+ /* factor by which the weight of this queue is multiplied */ -+ unsigned int wr_coeff; -+ /* -+ * Time of the last transition of the @bfq_queue from idle to -+ * backlogged. -+ */ -+ unsigned long last_idle_bklogged; -+ /* -+ * Cumulative service received from the @bfq_queue since the -+ * last transition from idle to backlogged. -+ */ -+ unsigned long service_from_backlogged; -+ /* -+ * Cumulative service received from the @bfq_queue since its -+ * last transition to weight-raised state. -+ */ -+ unsigned long service_from_wr; -+ /* -+ * Value of wr start time when switching to soft rt -+ */ -+ unsigned long wr_start_at_switch_to_srt; -+ -+ unsigned long split_time; /* time of last split */ -+ -+ unsigned long first_IO_time; /* time of first I/O for this queue */ -+ -+ /* max service rate measured so far */ -+ u32 max_service_rate; -+ /* -+ * Ratio between the service received by bfqq while it is in -+ * service, and the cumulative service (of requests of other -+ * queues) that may be injected while bfqq is empty but still -+ * in service. 
To increase precision, the coefficient is -+ * measured in tenths of unit. Here are some example of (1) -+ * ratios, (2) resulting percentages of service injected -+ * w.r.t. to the total service dispatched while bfqq is in -+ * service, and (3) corresponding values of the coefficient: -+ * 1 (50%) -> 10 -+ * 2 (33%) -> 20 -+ * 10 (9%) -> 100 -+ * 9.9 (9%) -> 99 -+ * 1.5 (40%) -> 15 -+ * 0.5 (66%) -> 5 -+ * 0.1 (90%) -> 1 -+ * -+ * So, if the coefficient is lower than 10, then -+ * injected service is more than bfqq service. -+ */ -+ unsigned int inject_coeff; -+ /* amount of service injected in current service slot */ -+ unsigned int injected_service; -+}; -+ -+/** -+ * struct bfq_ttime - per process thinktime stats. -+ */ -+struct bfq_ttime { -+ u64 last_end_request; /* completion time of last request */ -+ -+ u64 ttime_total; /* total process thinktime */ -+ unsigned long ttime_samples; /* number of thinktime samples */ -+ u64 ttime_mean; /* average process thinktime */ -+ -+}; -+ -+/** -+ * struct bfq_io_cq - per (request_queue, io_context) structure. -+ */ -+struct bfq_io_cq { -+ /* associated io_cq structure */ -+ struct io_cq icq; /* must be the first member */ -+ /* array of two process queues, the sync and the async */ -+ struct bfq_queue *bfqq[2]; -+ /* associated @bfq_ttime struct */ -+ struct bfq_ttime ttime; -+ /* per (request_queue, blkcg) ioprio */ -+ int ioprio; -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ uint64_t blkcg_serial_nr; /* the current blkcg serial */ -+#endif -+ -+ /* -+ * Snapshot of the has_short_time flag before merging; taken -+ * to remember its value while the queue is merged, so as to -+ * be able to restore it in case of split. -+ */ -+ bool saved_has_short_ttime; -+ /* -+ * Same purpose as the previous two fields for the I/O bound -+ * classification of a queue. 
-+ */ -+ bool saved_IO_bound; -+ -+ /* -+ * Same purpose as the previous fields for the value of the -+ * field keeping the queue's belonging to a large burst -+ */ -+ bool saved_in_large_burst; -+ /* -+ * True if the queue belonged to a burst list before its merge -+ * with another cooperating queue. -+ */ -+ bool was_in_burst_list; -+ -+ /* -+ * Similar to previous fields: save wr information. -+ */ -+ unsigned long saved_wr_coeff; -+ unsigned long saved_last_wr_start_finish; -+ unsigned long saved_wr_start_at_switch_to_srt; -+ unsigned int saved_wr_cur_max_time; -+}; -+ -+/** -+ * struct bfq_data - per-device data structure. -+ * -+ * All the fields are protected by the @queue lock. -+ */ -+struct bfq_data { -+ /* request queue for the device */ -+ struct request_queue *queue; -+ -+ /* root bfq_group for the device */ -+ struct bfq_group *root_group; -+ -+ /* -+ * rbtree of weight counters of @bfq_queues, sorted by -+ * weight. Used to keep track of whether all @bfq_queues have -+ * the same weight. The tree contains one counter for each -+ * distinct weight associated to some active and not -+ * weight-raised @bfq_queue (see the comments to the functions -+ * bfq_weights_tree_[add|remove] for further details). -+ */ -+ struct rb_root queue_weights_tree; -+ -+ /* -+ * Number of groups with at least one descendant process that -+ * has at least one request waiting for completion. Note that -+ * this accounts for also requests already dispatched, but not -+ * yet completed. Therefore this number of groups may differ -+ * (be larger) than the number of active groups, as a group is -+ * considered active only if its corresponding entity has -+ * descendant queues with at least one request queued. This -+ * number is used to decide whether a scenario is symmetric. -+ * For a detailed explanation see comments on the computation -+ * of the variable asymmetric_scenario in the function -+ * bfq_better_to_idle(). 
-+ * -+ * However, it is hard to compute this number exactly, for -+ * groups with multiple descendant processes. Consider a group -+ * that is inactive, i.e., that has no descendant process with -+ * pending I/O inside BFQ queues. Then suppose that -+ * num_groups_with_pending_reqs is still accounting for this -+ * group, because the group has descendant processes with some -+ * I/O request still in flight. num_groups_with_pending_reqs -+ * should be decremented when the in-flight request of the -+ * last descendant process is finally completed (assuming that -+ * nothing else has changed for the group in the meantime, in -+ * terms of composition of the group and active/inactive state of child -+ * groups and processes). To accomplish this, an additional -+ * pending-request counter must be added to entities, and must -+ * be updated correctly. To avoid this additional field and operations, -+ * we resort to the following tradeoff between simplicity and -+ * accuracy: for an inactive group that is still counted in -+ * num_groups_with_pending_reqs, we decrement -+ * num_groups_with_pending_reqs when the first descendant -+ * process of the group remains with no request waiting for -+ * completion. -+ * -+ * Even this simpler decrement strategy requires a little -+ * carefulness: to avoid multiple decrements, we flag a group, -+ * more precisely an entity representing a group, as still -+ * counted in num_groups_with_pending_reqs when it becomes -+ * inactive. Then, when the first descendant queue of the -+ * entity remains with no request waiting for completion, -+ * num_groups_with_pending_reqs is decremented, and this flag -+ * is reset. After this flag is reset for the entity, -+ * num_groups_with_pending_reqs won't be decremented any -+ * longer in case a new descendant queue of the entity remains -+ * with no request waiting for completion. 
-+ */ -+ unsigned int num_groups_with_pending_reqs; -+ -+ /* -+ * Per-class (RT, BE, IDLE) number of bfq_queues containing -+ * requests (including the queue in service, even if it is -+ * idling). -+ */ -+ unsigned int busy_queues[3]; -+ /* number of weight-raised busy @bfq_queues */ -+ int wr_busy_queues; -+ /* number of queued requests */ -+ int queued; -+ /* number of requests dispatched and waiting for completion */ -+ int rq_in_driver; -+ -+ /* -+ * Maximum number of requests in driver in the last -+ * @hw_tag_samples completed requests. -+ */ -+ int max_rq_in_driver; -+ /* number of samples used to calculate hw_tag */ -+ int hw_tag_samples; -+ /* flag set to one if the driver is showing a queueing behavior */ -+ int hw_tag; -+ -+ /* number of budgets assigned */ -+ int budgets_assigned; -+ -+ /* -+ * Timer set when idling (waiting) for the next request from -+ * the queue in service. -+ */ -+ struct hrtimer idle_slice_timer; -+ /* delayed work to restart dispatching on the request queue */ -+ struct work_struct unplug_work; -+ -+ /* bfq_queue in service */ -+ struct bfq_queue *in_service_queue; -+ /* bfq_io_cq (bic) associated with the @in_service_queue */ -+ struct bfq_io_cq *in_service_bic; -+ -+ /* on-disk position of the last served request */ -+ sector_t last_position; -+ -+ /* position of the last served request for the in-service queue */ -+ sector_t in_serv_last_pos; -+ -+ /* time of last request completion (ns) */ -+ u64 last_completion; -+ -+ /* time of first rq dispatch in current observation interval (ns) */ -+ u64 first_dispatch; -+ /* time of last rq dispatch in current observation interval (ns) */ -+ u64 last_dispatch; -+ -+ /* beginning of the last budget */ -+ ktime_t last_budget_start; -+ /* beginning of the last idle slice */ -+ ktime_t last_idling_start; -+ -+ /* number of samples in current observation interval */ -+ int peak_rate_samples; -+ /* num of samples of seq dispatches in current observation interval */ -+ u32 
sequential_samples; -+ /* total num of sectors transferred in current observation interval */ -+ u64 tot_sectors_dispatched; -+ /* max rq size seen during current observation interval (sectors) */ -+ u32 last_rq_max_size; -+ /* time elapsed from first dispatch in current observ. interval (us) */ -+ u64 delta_from_first; -+ /* -+ * Current estimate of the device peak rate, measured in -+ * [(sectors/usec) / 2^BFQ_RATE_SHIFT]. The left-shift by -+ * BFQ_RATE_SHIFT is performed to increase precision in -+ * fixed-point calculations. -+ */ -+ u32 peak_rate; -+ -+ /* maximum budget allotted to a bfq_queue before rescheduling */ -+ int bfq_max_budget; -+ -+ /* list of all the bfq_queues active on the device */ -+ struct list_head active_list; -+ /* list of all the bfq_queues idle on the device */ -+ struct list_head idle_list; -+ -+ /* -+ * Timeout for async/sync requests; when it fires, requests -+ * are served in fifo order. -+ */ -+ u64 bfq_fifo_expire[2]; -+ /* weight of backward seeks wrt forward ones */ -+ unsigned int bfq_back_penalty; -+ /* maximum allowed backward seek */ -+ unsigned int bfq_back_max; -+ /* maximum idling time */ -+ u32 bfq_slice_idle; -+ -+ /* user-configured max budget value (0 for auto-tuning) */ -+ int bfq_user_max_budget; -+ /* -+ * Timeout for bfq_queues to consume their budget; used to -+ * prevent seeky queues from imposing long latencies to -+ * sequential or quasi-sequential ones (this also implies that -+ * seeky queues cannot receive guarantees in the service -+ * domain; after a timeout they are charged for the time they -+ * have been in service, to preserve fairness among them, but -+ * without service-domain guarantees). -+ */ -+ unsigned int bfq_timeout; -+ -+ /* -+ * Number of consecutive requests that must be issued within -+ * the idle time slice to set again idling to a queue which -+ * was marked as non-I/O-bound (see the definition of the -+ * IO_bound flag for further details). 
-+ */ -+ unsigned int bfq_requests_within_timer; -+ -+ /* -+ * Force device idling whenever needed to provide accurate -+ * service guarantees, without caring about throughput -+ * issues. CAVEAT: this may even increase latencies, in case -+ * of useless idling for processes that did stop doing I/O. -+ */ -+ bool strict_guarantees; -+ -+ /* -+ * Last time at which a queue entered the current burst of -+ * queues being activated shortly after each other; for more -+ * details about this and the following parameters related to -+ * a burst of activations, see the comments on the function -+ * bfq_handle_burst. -+ */ -+ unsigned long last_ins_in_burst; -+ /* -+ * Reference time interval used to decide whether a queue has -+ * been activated shortly after @last_ins_in_burst. -+ */ -+ unsigned long bfq_burst_interval; -+ /* number of queues in the current burst of queue activations */ -+ int burst_size; -+ -+ /* common parent entity for the queues in the burst */ -+ struct bfq_entity *burst_parent_entity; -+ /* Maximum burst size above which the current queue-activation -+ * burst is deemed as 'large'. -+ */ -+ unsigned long bfq_large_burst_thresh; -+ /* true if a large queue-activation burst is in progress */ -+ bool large_burst; -+ /* -+ * Head of the burst list (as for the above fields, more -+ * details in the comments on the function bfq_handle_burst). -+ */ -+ struct hlist_head burst_list; -+ -+ /* if set to true, low-latency heuristics are enabled */ -+ bool low_latency; -+ /* -+ * Maximum factor by which the weight of a weight-raised queue -+ * is multiplied. -+ */ -+ unsigned int bfq_wr_coeff; -+ /* maximum duration of a weight-raising period (jiffies) */ -+ unsigned int bfq_wr_max_time; -+ -+ /* Maximum weight-raising duration for soft real-time processes */ -+ unsigned int bfq_wr_rt_max_time; -+ /* -+ * Minimum idle period after which weight-raising may be -+ * reactivated for a queue (in jiffies). 
-+ */ -+ unsigned int bfq_wr_min_idle_time; -+ /* -+ * Minimum period between request arrivals after which -+ * weight-raising may be reactivated for an already busy async -+ * queue (in jiffies). -+ */ -+ unsigned long bfq_wr_min_inter_arr_async; -+ -+ /* Max service-rate for a soft real-time queue, in sectors/sec */ -+ unsigned int bfq_wr_max_softrt_rate; -+ /* -+ * Cached value of the product ref_rate*ref_wr_duration, used -+ * for computing the maximum duration of weight raising -+ * automatically. -+ */ -+ u64 rate_dur_prod; -+ -+ /* fallback dummy bfqq for extreme OOM conditions */ -+ struct bfq_queue oom_bfqq; -+}; -+ -+enum bfqq_state_flags { -+ BFQ_BFQQ_FLAG_just_created = 0, /* queue just allocated */ -+ BFQ_BFQQ_FLAG_busy, /* has requests or is in service */ -+ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ -+ BFQ_BFQQ_FLAG_non_blocking_wait_rq, /* -+ * waiting for a request -+ * without idling the device -+ */ -+ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ -+ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ -+ BFQ_BFQQ_FLAG_has_short_ttime, /* queue has a short think time */ -+ BFQ_BFQQ_FLAG_sync, /* synchronous queue */ -+ BFQ_BFQQ_FLAG_IO_bound, /* -+ * bfqq has timed-out at least once -+ * having consumed at most 2/10 of -+ * its budget -+ */ -+ BFQ_BFQQ_FLAG_in_large_burst, /* -+ * bfqq activated in a large burst, -+ * see comments to bfq_handle_burst. 
-+ */ -+ BFQ_BFQQ_FLAG_softrt_update, /* -+ * may need softrt-next-start -+ * update -+ */ -+ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ -+ BFQ_BFQQ_FLAG_split_coop /* shared bfqq will be split */ -+}; -+ -+#define BFQ_BFQQ_FNS(name) \ -+static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ -+{ \ -+ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \ -+} \ -+static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ -+{ \ -+ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \ -+} \ -+static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ -+{ \ -+ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ -+} -+ -+BFQ_BFQQ_FNS(just_created); -+BFQ_BFQQ_FNS(busy); -+BFQ_BFQQ_FNS(wait_request); -+BFQ_BFQQ_FNS(non_blocking_wait_rq); -+BFQ_BFQQ_FNS(must_alloc); -+BFQ_BFQQ_FNS(fifo_expire); -+BFQ_BFQQ_FNS(has_short_ttime); -+BFQ_BFQQ_FNS(sync); -+BFQ_BFQQ_FNS(IO_bound); -+BFQ_BFQQ_FNS(in_large_burst); -+BFQ_BFQQ_FNS(coop); -+BFQ_BFQQ_FNS(split_coop); -+BFQ_BFQQ_FNS(softrt_update); -+#undef BFQ_BFQQ_FNS -+ -+/* Logging facilities. */ -+#ifdef CONFIG_BFQ_REDIRECT_TO_CONSOLE -+ -+static const char *checked_dev_name(const struct device *dev) -+{ -+ static const char nodev[] = "nodev"; -+ -+ if (dev) -+ return dev_name(dev); -+ -+ return nodev; -+} -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); -+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ -+ char __pbuf[128]; \ -+ \ -+ assert_spin_locked((bfqd)->queue->queue_lock); \ -+ blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ -+ pr_crit("%s bfq%d%c %s [%s] " fmt "\n", \ -+ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -+ (bfqq)->pid, \ -+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ __pbuf, __func__, ##args); \ -+} while (0) -+ -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) 
do { \ -+ char __pbuf[128]; \ -+ \ -+ blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ -+ pr_crit("%s %s [%s] " fmt "\n", \ -+ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -+ __pbuf, __func__, ##args); \ -+} while (0) -+ -+#else /* BFQ_GROUP_IOSCHED_ENABLED */ -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ -+ pr_crit("%s bfq%d%c [%s] " fmt "\n", \ -+ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -+ (bfqq)->pid, bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ __func__, ##args) -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) -+ -+#endif /* BFQ_GROUP_IOSCHED_ENABLED */ -+ -+#define bfq_log(bfqd, fmt, args...) \ -+ pr_crit("%s bfq [%s] " fmt "\n", \ -+ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -+ __func__, ##args) -+ -+#else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ -+ -+#if !defined(CONFIG_BLK_DEV_IO_TRACE) -+ -+/* Avoid possible "unused-variable" warning. See commit message. */ -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) ((void) (bfqq)) -+ -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) ((void) (bfqg)) -+ -+#define bfq_log(bfqd, fmt, args...) do {} while (0) -+ -+#else /* CONFIG_BLK_DEV_IO_TRACE */ -+ -+#include <linux/blktrace_api.h> -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); -+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ -+ char __pbuf[128]; \ -+ \ -+ assert_spin_locked((bfqd)->queue->queue_lock); \ -+ blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ -+ blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s [%s] " fmt, \ -+ (bfqq)->pid, \ -+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ __pbuf, __func__, ##args); \ -+} while (0) -+ -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) 
do { \ -+ char __pbuf[128]; \ -+ \ -+ blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ -+ blk_add_trace_msg((bfqd)->queue, "%s [%s] " fmt, __pbuf, \ -+ __func__, ##args); \ -+} while (0) -+ -+#else /* BFQ_GROUP_IOSCHED_ENABLED */ -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ -+ blk_add_trace_msg((bfqd)->queue, "bfq%d%c [%s] " fmt, (bfqq)->pid, \ -+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ __func__, ##args) -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) -+ -+#endif /* BFQ_GROUP_IOSCHED_ENABLED */ -+ -+#define bfq_log(bfqd, fmt, args...) \ -+ blk_add_trace_msg((bfqd)->queue, "bfq [%s] " fmt, __func__, ##args) -+ -+#endif /* CONFIG_BLK_DEV_IO_TRACE */ -+#endif /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ -+ -+/* Expiration reasons. */ -+enum bfqq_expiration { -+ BFQ_BFQQ_TOO_IDLE = 0, /* -+ * queue has been idling for -+ * too long -+ */ -+ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ -+ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ -+ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ -+ BFQ_BFQQ_PREEMPTED /* preemption in progress */ -+}; -+ -+ -+struct bfqg_stats { -+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) -+ /* number of ios merged */ -+ struct blkg_rwstat merged; -+ /* total time spent on device in ns, may not be accurate w/ queueing */ -+ struct blkg_rwstat service_time; -+ /* total time spent waiting in scheduler queue in ns */ -+ struct blkg_rwstat wait_time; -+ /* number of IOs queued up */ -+ struct blkg_rwstat queued; -+ /* total disk time and nr sectors dispatched by this group */ -+ struct blkg_stat time; -+ /* sum of number of ios queued across all samples */ -+ struct blkg_stat avg_queue_size_sum; -+ /* count of samples taken for average */ -+ struct blkg_stat avg_queue_size_samples; -+ /* how many times this group has been removed from service tree */ -+ struct blkg_stat dequeue; -+ /* total time spent waiting for it to be assigned a timeslice. 
*/ -+ struct blkg_stat group_wait_time; -+ /* time spent idling for this blkcg_gq */ -+ struct blkg_stat idle_time; -+ /* total time with empty current active q with other requests queued */ -+ struct blkg_stat empty_time; -+ /* fields after this shouldn't be cleared on stat reset */ -+ uint64_t start_group_wait_time; -+ uint64_t start_idle_time; -+ uint64_t start_empty_time; -+ uint16_t flags; -+#endif /* BFQ_GROUP_IOSCHED_ENABLED && CONFIG_DEBUG_BLK_CGROUP */ -+}; -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+/* -+ * struct bfq_group_data - per-blkcg storage for the blkio subsystem. -+ * -+ * @ps: @blkcg_policy_storage that this structure inherits -+ * @weight: weight of the bfq_group -+ */ -+struct bfq_group_data { -+ /* must be the first member */ -+ struct blkcg_policy_data pd; -+ -+ unsigned int weight; -+}; -+ -+/** -+ * struct bfq_group - per (device, cgroup) data structure. -+ * @entity: schedulable entity to insert into the parent group sched_data. -+ * @sched_data: own sched_data, to contain child entities (they may be -+ * both bfq_queues and bfq_groups). -+ * @bfqd: the bfq_data for the device this group acts upon. -+ * @async_bfqq: array of async queues for all the tasks belonging to -+ * the group, one queue per ioprio value per ioprio_class, -+ * except for the idle class that has only one queue. -+ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). -+ * @my_entity: pointer to @entity, %NULL for the toplevel group; used -+ * to avoid too many special cases during group creation/ -+ * migration. -+ * @active_entities: number of active entities belonging to the group; -+ * unused for the root group. Used to know whether there -+ * are groups with more than one active @bfq_entity -+ * (see the comments to the function -+ * bfq_bfqq_may_idle()). -+ * @rq_pos_tree: rbtree sorted by next_request position, used when -+ * determining if two or more queues have interleaving -+ * requests (see bfq_find_close_cooperator()). 
-+ * -+ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup -+ * there is a set of bfq_groups, each one collecting the lower-level -+ * entities belonging to the group that are acting on the same device. -+ * -+ * Locking works as follows: -+ * o @bfqd is protected by the queue lock, RCU is used to access it -+ * from the readers. -+ * o All the other fields are protected by the @bfqd queue lock. -+ */ -+struct bfq_group { -+ /* must be the first member */ -+ struct blkg_policy_data pd; -+ -+ struct bfq_entity entity; -+ struct bfq_sched_data sched_data; -+ -+ void *bfqd; -+ -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; -+ struct bfq_queue *async_idle_bfqq; -+ -+ struct bfq_entity *my_entity; -+ -+ int active_entities; -+ -+ struct rb_root rq_pos_tree; -+ -+ struct bfqg_stats stats; -+}; -+ -+#else -+struct bfq_group { -+ struct bfq_sched_data sched_data; -+ -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; -+ struct bfq_queue *async_idle_bfqq; -+ -+ struct rb_root rq_pos_tree; -+}; -+#endif -+ -+static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); -+ -+static unsigned int bfq_class_idx(struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ return bfqq ? 
bfqq->ioprio_class - 1 : -+ BFQ_DEFAULT_GRP_CLASS - 1; -+} -+ -+static unsigned int bfq_tot_busy_queues(struct bfq_data *bfqd) -+{ -+ return bfqd->busy_queues[0] + bfqd->busy_queues[1] + -+ bfqd->busy_queues[2]; -+} -+ -+static struct bfq_service_tree * -+bfq_entity_service_tree(struct bfq_entity *entity) -+{ -+ struct bfq_sched_data *sched_data = entity->sched_data; -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ unsigned int idx = bfq_class_idx(entity); -+ -+ BUG_ON(idx >= BFQ_IOPRIO_CLASSES); -+ BUG_ON(sched_data == NULL); -+ -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "%p %d", -+ sched_data->service_tree + idx, idx); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "%p %d", -+ sched_data->service_tree + idx, idx); -+ } -+#endif -+ return sched_data->service_tree + idx; -+} -+ -+static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) -+{ -+ return bic->bfqq[is_sync]; -+} -+ -+static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, -+ bool is_sync) -+{ -+ bic->bfqq[is_sync] = bfqq; -+} -+ -+static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) -+{ -+ return bic->icq.q->elevator->elevator_data; -+} -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ -+static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *group_entity = bfqq->entity.parent; -+ -+ if (!group_entity) -+ group_entity = &bfqq->bfqd->root_group->entity; -+ -+ return container_of(group_entity, struct bfq_group, entity); -+} -+ -+#else -+ -+static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) -+{ -+ return bfqq->bfqd->root_group; -+} -+ -+#endif -+ -+static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); -+static void bfq_put_queue(struct bfq_queue *bfqq); -+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); -+static struct 
bfq_queue *bfq_get_queue(struct bfq_data *bfqd, -+ struct bio *bio, bool is_sync, -+ struct bfq_io_cq *bic); -+static void bfq_end_wr_async_queues(struct bfq_data *bfqd, -+ struct bfq_group *bfqg); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); -+#endif -+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); -+ -+#endif /* _BFQ_H */ -diff --git a/block/blk-mq.c b/block/blk-mq.c -index e3c39ea8e17b..7a57368841f6 100644 ---- a/block/blk-mq.c -+++ b/block/blk-mq.c -@@ -2878,6 +2878,8 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) - } - if (ret) - break; -+ if (q->elevator && q->elevator->type->ops.mq.depth_updated) -+ q->elevator->type->ops.mq.depth_updated(hctx); - } - - if (!ret) -diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h -index 6980014357d4..8c4568ea6884 100644 ---- a/include/linux/blkdev.h -+++ b/include/linux/blkdev.h -@@ -54,7 +54,7 @@ struct blk_stat_callback; - * Maximum number of blkcg policies allowed to be registered concurrently. - * Defined here to simplify include dependency. 
- */ --#define BLKCG_MAX_POLS 5 -+#define BLKCG_MAX_POLS 7 - - typedef void (rq_end_io_fn)(struct request *, blk_status_t); - -@@ -127,6 +127,10 @@ typedef __u32 __bitwise req_flags_t; - #define RQF_MQ_POLL_SLEPT ((__force req_flags_t)(1 << 20)) - /* ->timeout has been called, don't expire again */ - #define RQF_TIMED_OUT ((__force req_flags_t)(1 << 21)) -+/* DEBUG: rq in bfq-mq dispatch list */ -+#define RQF_DISP_LIST ((__force req_flags_t)(1 << 22)) -+/* DEBUG: rq had get_rq_private executed on it */ -+#define RQF_GOT ((__force req_flags_t)(1 << 23)) - - /* flags that prevent us from merging requests: */ - #define RQF_NOMERGE_FLAGS \ -diff --git a/include/linux/elevator.h b/include/linux/elevator.h -index a02deea30185..a2bf4a6b9316 100644 ---- a/include/linux/elevator.h -+++ b/include/linux/elevator.h -@@ -99,6 +99,7 @@ struct elevator_mq_ops { - void (*exit_sched)(struct elevator_queue *); - int (*init_hctx)(struct blk_mq_hw_ctx *, unsigned int); - void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int); -+ void (*depth_updated)(struct blk_mq_hw_ctx *); - - bool (*allow_merge)(struct request_queue *, struct request *, struct bio *); - bool (*bio_merge)(struct blk_mq_hw_ctx *, struct bio *); diff --git a/sys-kernel/linux-image-redcore-lts/linux-image-redcore-lts-4.14.95.ebuild b/sys-kernel/linux-image-redcore-lts/linux-image-redcore-lts-4.14.95-r1.ebuild index 318e5775..a2808cc6 100644 --- a/sys-kernel/linux-image-redcore-lts/linux-image-redcore-lts-4.14.95.ebuild +++ b/sys-kernel/linux-image-redcore-lts/linux-image-redcore-lts-4.14.95-r1.ebuild @@ -5,7 +5,7 @@ EAPI=6 inherit eutils -EXTRAVERSION="redcore-lts" +EXTRAVERSION="redcore-lts-r1" KV_FULL="${PV}-${EXTRAVERSION}" KV_MAJOR="4.14" @@ -15,7 +15,7 @@ SRC_URI="https://cdn.kernel.org/pub/linux/kernel/v4.x/linux-${PV}.tar.xz" KEYWORDS="amd64" LICENSE="GPL-2" -SLOT="${PV}" +SLOT="${PVR}" IUSE="+cryptsetup +dmraid +dracut +dkms +mdadm" RESTRICT="binchecks strip mirror" @@ -28,7 +28,7 @@ DEPEND=" cryptsetup? 
( sys-fs/cryptsetup ) dmraid? ( sys-fs/dmraid ) dracut? ( >=sys-kernel/dracut-0.44-r8 ) - dkms? ( sys-kernel/dkms ~sys-kernel/linux-sources-redcore-lts-${PV} ) + dkms? ( sys-kernel/dkms ~sys-kernel/linux-sources-redcore-lts-${PVR} ) mdadm? ( sys-fs/mdadm ) >=sys-kernel/linux-firmware-20180314" RDEPEND="${DEPEND}" @@ -58,8 +58,6 @@ PATCHES=( "${FILESDIR}"/"${KV_MAJOR}"-0015-MuQSS.c-needs-irq_regs.h-to-use-get_irq_regs.patch "${FILESDIR}"/"${KV_MAJOR}"-0016-unfuck-MuQSS-on-linux-4_14_15+.patch "${FILESDIR}"/"${KV_MAJOR}"-0017-unfuck-MuQSS-on-linux-4_14_75+.patch - "${FILESDIR}"/"${KV_MAJOR}"-0001-BFQ-v8r12-20171108.patch - "${FILESDIR}"/"${KV_MAJOR}"-0002-BFQ-v8r12-20180404.patch ) S="${WORKDIR}"/linux-"${PV}" @@ -76,7 +74,7 @@ src_prepare() { default emake mrproper sed -ri "s|^(EXTRAVERSION =).*|\1 -${EXTRAVERSION}|" Makefile - cp "${FILESDIR}"/"${KV_MAJOR}"-"${EXTRAVERSION}"-amd64.config .config + cp "${FILESDIR}"/"${KV_MAJOR}"-amd64.config .config rm -rf $(find . -type f|grep -F \.orig) } diff --git a/sys-kernel/linux-image-redcore-lts/linux-image-redcore-lts-4.19.20.ebuild b/sys-kernel/linux-image-redcore-lts/linux-image-redcore-lts-4.19.20-r1.ebuild index 1d2bace9..cc0d4eb1 100644 --- a/sys-kernel/linux-image-redcore-lts/linux-image-redcore-lts-4.19.20.ebuild +++ b/sys-kernel/linux-image-redcore-lts/linux-image-redcore-lts-4.19.20-r1.ebuild @@ -5,7 +5,7 @@ EAPI=6 inherit eutils -EXTRAVERSION="redcore-lts" +EXTRAVERSION="redcore-lts-r1" KV_FULL="${PV}-${EXTRAVERSION}" KV_MAJOR="4.19" @@ -15,7 +15,7 @@ SRC_URI="https://cdn.kernel.org/pub/linux/kernel/v4.x/linux-${PV}.tar.xz" KEYWORDS="amd64" LICENSE="GPL-2" -SLOT="${PV}" +SLOT="${PVR}" IUSE="+cryptsetup +dmraid +dracut +dkms +mdadm" RESTRICT="binchecks strip mirror" @@ -28,7 +28,7 @@ DEPEND=" cryptsetup? ( sys-fs/cryptsetup ) dmraid? ( sys-fs/dmraid ) dracut? ( >=sys-kernel/dracut-0.44-r8 ) - dkms? ( sys-kernel/dkms ~sys-kernel/linux-sources-redcore-lts-${PV} ) + dkms? 
( sys-kernel/dkms ~sys-kernel/linux-sources-redcore-lts-${PVR} ) mdadm? ( sys-fs/mdadm ) >=sys-kernel/linux-firmware-20180314" RDEPEND="${DEPEND}" @@ -44,7 +44,6 @@ PATCHES=( "${FILESDIR}"/"${KV_MAJOR}"-revert-patches-causing-instant-reboot.patch "${FILESDIR}"/"${KV_MAJOR}"-linux-hardened.patch "${FILESDIR}"/"${KV_MAJOR}"-uksm-linux-hardened.patch - "${FILESDIR}"/"${KV_MAJOR}"-bfq-sq-mq-v9r1-2K190204-rc1.patch "${FILESDIR}"/"${KV_MAJOR}"-0001-MultiQueue-Skiplist-Scheduler-version-v0.180-linux-hardened.patch "${FILESDIR}"/"${KV_MAJOR}"-0002-Fix-Werror-build-failure-in-tools.patch "${FILESDIR}"/"${KV_MAJOR}"-0003-Make-preemptible-kernel-default.patch @@ -76,7 +75,7 @@ src_prepare() { default emake mrproper sed -ri "s|^(EXTRAVERSION =).*|\1 -${EXTRAVERSION}|" Makefile - cp "${FILESDIR}"/"${KV_MAJOR}"-"${EXTRAVERSION}"-amd64.config .config + cp "${FILESDIR}"/"${KV_MAJOR}"-amd64.config .config rm -rf $(find . -type f|grep -F \.orig) } diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.14-0001-BFQ-v8r12-20171108.patch b/sys-kernel/linux-sources-redcore-lts/files/4.14-0001-BFQ-v8r12-20171108.patch deleted file mode 100644 index db7d064b..00000000 --- a/sys-kernel/linux-sources-redcore-lts/files/4.14-0001-BFQ-v8r12-20171108.patch +++ /dev/null @@ -1,25199 +0,0 @@ -From c21f53f17430230dab50df29b8ea1b71f99d09d6 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@unimore.it> -Date: Tue, 7 Apr 2015 13:39:12 +0200 -Subject: [PATCH 01/51] Add BFQ-v8r12 - -This commit is the result of the following operations. - -1. The squash of all the commits between "block: cgroups, kconfig, -build bits for BFQ-v7r11-4.5.0" and BFQ-v8r12 in the branch -bfq-mq-v8-v4.11 - -2. The renaming of two files (block/bfq-cgroup.c -> -block/bfq-cgroup-included.c and block/bfq-iosched.c -> -block/bfq-sq-iosched.c) and of one option (CONFIG_BFQ_GROUP_IOSCHED -> -CONFIG_BFQ_SQ_GROUP_IOSCHED), to avoid name clashes. These name -clashes are due to the presence of bfq in mainline from 4.12. 
- -3. The modification of block/Makefile and block/Kconfig.iosched to -comply with the above renaming. - -Signed-off-by: Mauro Andreolini <mauro.andreolini@unimore.it> -Signed-off-by: Arianna Avanzini <avanzini@google.com> -Signed-off-by: Linus Walleij <linus.walleij@linaro.org> -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - Makefile | 2 +- - block/Kconfig.iosched | 31 + - block/bfq-cgroup-included.c | 1190 ++++++++++ - block/bfq-ioc.c | 36 + - block/bfq-sched.c | 2002 ++++++++++++++++ - block/bfq-sq-iosched.c | 5379 +++++++++++++++++++++++++++++++++++++++++++ - block/bfq.h | 948 ++++++++ - include/linux/blkdev.h | 2 +- - 9 files changed, 9589 insertions(+), 2 deletions(-) - create mode 100644 block/bfq-cgroup-included.c - create mode 100644 block/bfq-ioc.c - create mode 100644 block/bfq-sched.c - create mode 100644 block/bfq-sq-iosched.c - create mode 100644 block/bfq.h - -diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched -index a4a8914bf7a4..9e3f4c2f7390 100644 ---- a/block/Kconfig.iosched -+++ b/block/Kconfig.iosched -@@ -40,6 +40,26 @@ config CFQ_GROUP_IOSCHED - ---help--- - Enable group IO scheduling in CFQ. - -+config IOSCHED_BFQ_SQ -+ tristate "BFQ-SQ I/O scheduler" -+ default n -+ ---help--- -+ The BFQ-SQ I/O scheduler (for legacy blk: SQ stands for -+ SingleQueue) distributes bandwidth among all processes -+ according to their weights, regardless of the device -+ parameters and with any workload. It also guarantees a low -+ latency to interactive and soft real-time applications. -+ Details in Documentation/block/bfq-iosched.txt -+ -+config BFQ_SQ_GROUP_IOSCHED -+ bool "BFQ-SQ hierarchical scheduling support" -+ depends on IOSCHED_BFQ_SQ && BLK_CGROUP -+ default n -+ ---help--- -+ -+ Enable hierarchical scheduling in BFQ-SQ, using the blkio -+ (cgroups-v1) or io (cgroups-v2) controller. 
-+ - choice - - prompt "Default I/O scheduler" -@@ -54,6 +74,16 @@ choice - config DEFAULT_CFQ - bool "CFQ" if IOSCHED_CFQ=y - -+ config DEFAULT_BFQ_SQ -+ bool "BFQ-SQ" if IOSCHED_BFQ_SQ=y -+ help -+ Selects BFQ-SQ as the default I/O scheduler which will be -+ used by default for all block devices. -+ The BFQ-SQ I/O scheduler aims at distributing the bandwidth -+ as desired, independently of the disk parameters and with -+ any workload. It also tries to guarantee low latency to -+ interactive and soft real-time applications. -+ - config DEFAULT_NOOP - bool "No-op" - -@@ -63,6 +93,7 @@ config DEFAULT_IOSCHED - string - default "deadline" if DEFAULT_DEADLINE - default "cfq" if DEFAULT_CFQ -+ default "bfq-sq" if DEFAULT_BFQ_SQ - default "noop" if DEFAULT_NOOP - - config MQ_IOSCHED_DEADLINE -diff --git a/block/Makefile b/block/Makefile -index 6a56303b9925..59026b425791 100644 ---- a/block/Makefile -+++ b/block/Makefile -@@ -24,6 +24,7 @@ obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o - obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o - bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o - obj-$(CONFIG_IOSCHED_BFQ) += bfq.o -+obj-$(CONFIG_IOSCHED_BFQ_SQ) += bfq-sq-iosched.o - - obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o - obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o -diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c -new file mode 100644 -index 000000000000..af7c216a3540 ---- /dev/null -+++ b/block/bfq-cgroup-included.c -@@ -0,0 +1,1190 @@ -+/* -+ * BFQ: CGROUPS support. -+ * -+ * Based on ideas and code from CFQ: -+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> -+ * -+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> -+ * Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2016 Paolo Valente <paolo.valente@linaro.org> -+ * -+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ -+ * file. 
-+ */ -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ -+/* bfqg stats flags */ -+enum bfqg_stats_flags { -+ BFQG_stats_waiting = 0, -+ BFQG_stats_idling, -+ BFQG_stats_empty, -+}; -+ -+#define BFQG_FLAG_FNS(name) \ -+static void bfqg_stats_mark_##name(struct bfqg_stats *stats) \ -+{ \ -+ stats->flags |= (1 << BFQG_stats_##name); \ -+} \ -+static void bfqg_stats_clear_##name(struct bfqg_stats *stats) \ -+{ \ -+ stats->flags &= ~(1 << BFQG_stats_##name); \ -+} \ -+static int bfqg_stats_##name(struct bfqg_stats *stats) \ -+{ \ -+ return (stats->flags & (1 << BFQG_stats_##name)) != 0; \ -+} \ -+ -+BFQG_FLAG_FNS(waiting) -+BFQG_FLAG_FNS(idling) -+BFQG_FLAG_FNS(empty) -+#undef BFQG_FLAG_FNS -+ -+/* This should be called with the queue_lock held. */ -+static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) -+{ -+ unsigned long long now; -+ -+ if (!bfqg_stats_waiting(stats)) -+ return; -+ -+ now = sched_clock(); -+ if (time_after64(now, stats->start_group_wait_time)) -+ blkg_stat_add(&stats->group_wait_time, -+ now - stats->start_group_wait_time); -+ bfqg_stats_clear_waiting(stats); -+} -+ -+/* This should be called with the queue_lock held. */ -+static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, -+ struct bfq_group *curr_bfqg) -+{ -+ struct bfqg_stats *stats = &bfqg->stats; -+ -+ if (bfqg_stats_waiting(stats)) -+ return; -+ if (bfqg == curr_bfqg) -+ return; -+ stats->start_group_wait_time = sched_clock(); -+ bfqg_stats_mark_waiting(stats); -+} -+ -+/* This should be called with the queue_lock held. 
*/ -+static void bfqg_stats_end_empty_time(struct bfqg_stats *stats) -+{ -+ unsigned long long now; -+ -+ if (!bfqg_stats_empty(stats)) -+ return; -+ -+ now = sched_clock(); -+ if (time_after64(now, stats->start_empty_time)) -+ blkg_stat_add(&stats->empty_time, -+ now - stats->start_empty_time); -+ bfqg_stats_clear_empty(stats); -+} -+ -+static void bfqg_stats_update_dequeue(struct bfq_group *bfqg) -+{ -+ blkg_stat_add(&bfqg->stats.dequeue, 1); -+} -+ -+static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) -+{ -+ struct bfqg_stats *stats = &bfqg->stats; -+ -+ if (blkg_rwstat_total(&stats->queued)) -+ return; -+ -+ /* -+ * group is already marked empty. This can happen if bfqq got new -+ * request in parent group and moved to this group while being added -+ * to service tree. Just ignore the event and move on. -+ */ -+ if (bfqg_stats_empty(stats)) -+ return; -+ -+ stats->start_empty_time = sched_clock(); -+ bfqg_stats_mark_empty(stats); -+} -+ -+static void bfqg_stats_update_idle_time(struct bfq_group *bfqg) -+{ -+ struct bfqg_stats *stats = &bfqg->stats; -+ -+ if (bfqg_stats_idling(stats)) { -+ unsigned long long now = sched_clock(); -+ -+ if (time_after64(now, stats->start_idle_time)) -+ blkg_stat_add(&stats->idle_time, -+ now - stats->start_idle_time); -+ bfqg_stats_clear_idling(stats); -+ } -+} -+ -+static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) -+{ -+ struct bfqg_stats *stats = &bfqg->stats; -+ -+ stats->start_idle_time = sched_clock(); -+ bfqg_stats_mark_idling(stats); -+} -+ -+static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) -+{ -+ struct bfqg_stats *stats = &bfqg->stats; -+ -+ blkg_stat_add(&stats->avg_queue_size_sum, -+ blkg_rwstat_total(&stats->queued)); -+ blkg_stat_add(&stats->avg_queue_size_samples, 1); -+ bfqg_stats_update_group_wait_time(stats); -+} -+ -+static struct blkcg_policy blkcg_policy_bfq; -+ -+/* -+ * blk-cgroup policy-related handlers -+ * The following functions help in converting 
between blk-cgroup -+ * internal structures and BFQ-specific structures. -+ */ -+ -+static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd) -+{ -+ return pd ? container_of(pd, struct bfq_group, pd) : NULL; -+} -+ -+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg) -+{ -+ return pd_to_blkg(&bfqg->pd); -+} -+ -+static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg) -+{ -+ struct blkg_policy_data *pd = blkg_to_pd(blkg, &blkcg_policy_bfq); -+ -+ return pd_to_bfqg(pd); -+} -+ -+/* -+ * bfq_group handlers -+ * The following functions help in navigating the bfq_group hierarchy -+ * by allowing to find the parent of a bfq_group or the bfq_group -+ * associated to a bfq_queue. -+ */ -+ -+static struct bfq_group *bfqg_parent(struct bfq_group *bfqg) -+{ -+ struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent; -+ -+ return pblkg ? blkg_to_bfqg(pblkg) : NULL; -+} -+ -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *group_entity = bfqq->entity.parent; -+ -+ return group_entity ? container_of(group_entity, struct bfq_group, -+ entity) : -+ bfqq->bfqd->root_group; -+} -+ -+/* -+ * The following two functions handle get and put of a bfq_group by -+ * wrapping the related blk-cgroup hooks. 
-+ */ -+ -+static void bfqg_get(struct bfq_group *bfqg) -+{ -+ return blkg_get(bfqg_to_blkg(bfqg)); -+} -+ -+static void bfqg_put(struct bfq_group *bfqg) -+{ -+ return blkg_put(bfqg_to_blkg(bfqg)); -+} -+ -+static void bfqg_stats_update_io_add(struct bfq_group *bfqg, -+ struct bfq_queue *bfqq, -+ unsigned int op) -+{ -+ blkg_rwstat_add(&bfqg->stats.queued, op, 1); -+ bfqg_stats_end_empty_time(&bfqg->stats); -+ if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue)) -+ bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); -+} -+ -+static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) -+{ -+ blkg_rwstat_add(&bfqg->stats.queued, op, -1); -+} -+ -+static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) -+{ -+ blkg_rwstat_add(&bfqg->stats.merged, op, 1); -+} -+ -+static void bfqg_stats_update_completion(struct bfq_group *bfqg, -+ uint64_t start_time, uint64_t io_start_time, -+ unsigned int op) -+{ -+ struct bfqg_stats *stats = &bfqg->stats; -+ unsigned long long now = sched_clock(); -+ -+ if (time_after64(now, io_start_time)) -+ blkg_rwstat_add(&stats->service_time, op, -+ now - io_start_time); -+ if (time_after64(io_start_time, start_time)) -+ blkg_rwstat_add(&stats->wait_time, op, -+ io_start_time - start_time); -+} -+ -+/* @stats = 0 */ -+static void bfqg_stats_reset(struct bfqg_stats *stats) -+{ -+ /* queued stats shouldn't be cleared */ -+ blkg_rwstat_reset(&stats->merged); -+ blkg_rwstat_reset(&stats->service_time); -+ blkg_rwstat_reset(&stats->wait_time); -+ blkg_stat_reset(&stats->time); -+ blkg_stat_reset(&stats->avg_queue_size_sum); -+ blkg_stat_reset(&stats->avg_queue_size_samples); -+ blkg_stat_reset(&stats->dequeue); -+ blkg_stat_reset(&stats->group_wait_time); -+ blkg_stat_reset(&stats->idle_time); -+ blkg_stat_reset(&stats->empty_time); -+} -+ -+/* @to += @from */ -+static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from) -+{ -+ if (!to || !from) -+ return; -+ -+ 
/* queued stats shouldn't be cleared */ -+ blkg_rwstat_add_aux(&to->merged, &from->merged); -+ blkg_rwstat_add_aux(&to->service_time, &from->service_time); -+ blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); -+ blkg_stat_add_aux(&from->time, &from->time); -+ blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); -+ blkg_stat_add_aux(&to->avg_queue_size_samples, -+ &from->avg_queue_size_samples); -+ blkg_stat_add_aux(&to->dequeue, &from->dequeue); -+ blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time); -+ blkg_stat_add_aux(&to->idle_time, &from->idle_time); -+ blkg_stat_add_aux(&to->empty_time, &from->empty_time); -+} -+ -+/* -+ * Transfer @bfqg's stats to its parent's dead_stats so that the ancestors' -+ * recursive stats can still account for the amount used by this bfqg after -+ * it's gone. -+ */ -+static void bfqg_stats_xfer_dead(struct bfq_group *bfqg) -+{ -+ struct bfq_group *parent; -+ -+ if (!bfqg) /* root_group */ -+ return; -+ -+ parent = bfqg_parent(bfqg); -+ -+ lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock); -+ -+ if (unlikely(!parent)) -+ return; -+ -+ bfqg_stats_add_aux(&parent->stats, &bfqg->stats); -+ bfqg_stats_reset(&bfqg->stats); -+} -+ -+static void bfq_init_entity(struct bfq_entity *entity, -+ struct bfq_group *bfqg) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ entity->weight = entity->new_weight; -+ entity->orig_weight = entity->new_weight; -+ if (bfqq) { -+ bfqq->ioprio = bfqq->new_ioprio; -+ bfqq->ioprio_class = bfqq->new_ioprio_class; -+ bfqg_get(bfqg); -+ } -+ entity->parent = bfqg->my_entity; /* NULL for root group */ -+ entity->sched_data = &bfqg->sched_data; -+} -+ -+static void bfqg_stats_exit(struct bfqg_stats *stats) -+{ -+ blkg_rwstat_exit(&stats->merged); -+ blkg_rwstat_exit(&stats->service_time); -+ blkg_rwstat_exit(&stats->wait_time); -+ blkg_rwstat_exit(&stats->queued); -+ blkg_stat_exit(&stats->time); -+ blkg_stat_exit(&stats->avg_queue_size_sum); -+ 
blkg_stat_exit(&stats->avg_queue_size_samples); -+ blkg_stat_exit(&stats->dequeue); -+ blkg_stat_exit(&stats->group_wait_time); -+ blkg_stat_exit(&stats->idle_time); -+ blkg_stat_exit(&stats->empty_time); -+} -+ -+static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) -+{ -+ if (blkg_rwstat_init(&stats->merged, gfp) || -+ blkg_rwstat_init(&stats->service_time, gfp) || -+ blkg_rwstat_init(&stats->wait_time, gfp) || -+ blkg_rwstat_init(&stats->queued, gfp) || -+ blkg_stat_init(&stats->time, gfp) || -+ blkg_stat_init(&stats->avg_queue_size_sum, gfp) || -+ blkg_stat_init(&stats->avg_queue_size_samples, gfp) || -+ blkg_stat_init(&stats->dequeue, gfp) || -+ blkg_stat_init(&stats->group_wait_time, gfp) || -+ blkg_stat_init(&stats->idle_time, gfp) || -+ blkg_stat_init(&stats->empty_time, gfp)) { -+ bfqg_stats_exit(stats); -+ return -ENOMEM; -+ } -+ -+ return 0; -+} -+ -+static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd) -+{ -+ return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL; -+} -+ -+static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg) -+{ -+ return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq)); -+} -+ -+static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp) -+{ -+ struct bfq_group_data *bgd; -+ -+ bgd = kzalloc(sizeof(*bgd), gfp); -+ if (!bgd) -+ return NULL; -+ return &bgd->pd; -+} -+ -+static void bfq_cpd_init(struct blkcg_policy_data *cpd) -+{ -+ struct bfq_group_data *d = cpd_to_bfqgd(cpd); -+ -+ d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ? 
-+ CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL; -+} -+ -+static void bfq_cpd_free(struct blkcg_policy_data *cpd) -+{ -+ kfree(cpd_to_bfqgd(cpd)); -+} -+ -+static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) -+{ -+ struct bfq_group *bfqg; -+ -+ bfqg = kzalloc_node(sizeof(*bfqg), gfp, node); -+ if (!bfqg) -+ return NULL; -+ -+ if (bfqg_stats_init(&bfqg->stats, gfp)) { -+ kfree(bfqg); -+ return NULL; -+ } -+ -+ return &bfqg->pd; -+} -+ -+static void bfq_pd_init(struct blkg_policy_data *pd) -+{ -+ struct blkcg_gq *blkg; -+ struct bfq_group *bfqg; -+ struct bfq_data *bfqd; -+ struct bfq_entity *entity; -+ struct bfq_group_data *d; -+ -+ blkg = pd_to_blkg(pd); -+ BUG_ON(!blkg); -+ bfqg = blkg_to_bfqg(blkg); -+ bfqd = blkg->q->elevator->elevator_data; -+ entity = &bfqg->entity; -+ d = blkcg_to_bfqgd(blkg->blkcg); -+ -+ entity->orig_weight = entity->weight = entity->new_weight = d->weight; -+ entity->my_sched_data = &bfqg->sched_data; -+ bfqg->my_entity = entity; /* -+ * the root_group's will be set to NULL -+ * in bfq_init_queue() -+ */ -+ bfqg->bfqd = bfqd; -+ bfqg->active_entities = 0; -+ bfqg->rq_pos_tree = RB_ROOT; -+} -+ -+static void bfq_pd_free(struct blkg_policy_data *pd) -+{ -+ struct bfq_group *bfqg = pd_to_bfqg(pd); -+ -+ bfqg_stats_exit(&bfqg->stats); -+ return kfree(bfqg); -+} -+ -+static void bfq_pd_reset_stats(struct blkg_policy_data *pd) -+{ -+ struct bfq_group *bfqg = pd_to_bfqg(pd); -+ -+ bfqg_stats_reset(&bfqg->stats); -+} -+ -+static void bfq_group_set_parent(struct bfq_group *bfqg, -+ struct bfq_group *parent) -+{ -+ struct bfq_entity *entity; -+ -+ BUG_ON(!parent); -+ BUG_ON(!bfqg); -+ BUG_ON(bfqg == parent); -+ -+ entity = &bfqg->entity; -+ entity->parent = parent->my_entity; -+ entity->sched_data = &parent->sched_data; -+} -+ -+static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd, -+ struct blkcg *blkcg) -+{ -+ struct blkcg_gq *blkg; -+ -+ blkg = blkg_lookup(blkcg, bfqd->queue); -+ if (likely(blkg)) -+ return 
blkg_to_bfqg(blkg); -+ return NULL; -+} -+ -+static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, -+ struct blkcg *blkcg) -+{ -+ struct bfq_group *bfqg, *parent; -+ struct bfq_entity *entity; -+ -+ assert_spin_locked(bfqd->queue->queue_lock); -+ -+ bfqg = bfq_lookup_bfqg(bfqd, blkcg); -+ -+ if (unlikely(!bfqg)) -+ return NULL; -+ -+ /* -+ * Update chain of bfq_groups as we might be handling a leaf group -+ * which, along with some of its relatives, has not been hooked yet -+ * to the private hierarchy of BFQ. -+ */ -+ entity = &bfqg->entity; -+ for_each_entity(entity) { -+ bfqg = container_of(entity, struct bfq_group, entity); -+ BUG_ON(!bfqg); -+ if (bfqg != bfqd->root_group) { -+ parent = bfqg_parent(bfqg); -+ if (!parent) -+ parent = bfqd->root_group; -+ BUG_ON(!parent); -+ bfq_group_set_parent(bfqg, parent); -+ } -+ } -+ -+ return bfqg; -+} -+ -+static void bfq_pos_tree_add_move(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq); -+ -+static void bfq_bfqq_expire(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ bool compensate, -+ enum bfqq_expiration reason); -+ -+/** -+ * bfq_bfqq_move - migrate @bfqq to @bfqg. -+ * @bfqd: queue descriptor. -+ * @bfqq: the queue to move. -+ * @bfqg: the group to move to. -+ * -+ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating -+ * it on the new one. Avoid putting the entity on the old group idle tree. -+ * -+ * Must be called under the queue lock; the cgroup owning @bfqg must -+ * not disappear (by now this just means that we are called under -+ * rcu_read_lock()). 
-+ */ -+static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct bfq_group *bfqg) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ BUG_ON(!bfq_bfqq_busy(bfqq) && !RB_EMPTY_ROOT(&bfqq->sort_list)); -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list) && !entity->on_st); -+ BUG_ON(bfq_bfqq_busy(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) -+ && entity->on_st && -+ bfqq != bfqd->in_service_queue); -+ BUG_ON(!bfq_bfqq_busy(bfqq) && bfqq == bfqd->in_service_queue); -+ -+ /* If bfqq is empty, then bfq_bfqq_expire also invokes -+ * bfq_del_bfqq_busy, thereby removing bfqq and its entity -+ * from data structures related to current group. Otherwise we -+ * need to remove bfqq explicitly with bfq_deactivate_bfqq, as -+ * we do below. -+ */ -+ if (bfqq == bfqd->in_service_queue) -+ bfq_bfqq_expire(bfqd, bfqd->in_service_queue, -+ false, BFQ_BFQQ_PREEMPTED); -+ -+ BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq) -+ && &bfq_entity_service_tree(entity)->idle != -+ entity->tree); -+ -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq)); -+ -+ if (bfq_bfqq_busy(bfqq)) -+ bfq_deactivate_bfqq(bfqd, bfqq, false, false); -+ else if (entity->on_st) { -+ BUG_ON(&bfq_entity_service_tree(entity)->idle != -+ entity->tree); -+ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); -+ } -+ bfqg_put(bfqq_group(bfqq)); -+ -+ /* -+ * Here we use a reference to bfqg. We don't need a refcounter -+ * as the cgroup reference will not be dropped, so that its -+ * destroy() callback will not be invoked. 
-+ */ -+ entity->parent = bfqg->my_entity; -+ entity->sched_data = &bfqg->sched_data; -+ bfqg_get(bfqg); -+ -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq)); -+ if (bfq_bfqq_busy(bfqq)) { -+ bfq_pos_tree_add_move(bfqd, bfqq); -+ bfq_activate_bfqq(bfqd, bfqq); -+ } -+ -+ if (!bfqd->in_service_queue && !bfqd->rq_in_driver) -+ bfq_schedule_dispatch(bfqd); -+ BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq) -+ && &bfq_entity_service_tree(entity)->idle != -+ entity->tree); -+} -+ -+/** -+ * __bfq_bic_change_cgroup - move @bic to @cgroup. -+ * @bfqd: the queue descriptor. -+ * @bic: the bic to move. -+ * @blkcg: the blk-cgroup to move to. -+ * -+ * Move bic to blkcg, assuming that bfqd->queue is locked; the caller -+ * has to make sure that the reference to cgroup is valid across the call. -+ * -+ * NOTE: an alternative approach might have been to store the current -+ * cgroup in bfqq and getting a reference to it, reducing the lookup -+ * time here, at the price of slightly more complex code. 
-+ */ -+static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, -+ struct bfq_io_cq *bic, -+ struct blkcg *blkcg) -+{ -+ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0); -+ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1); -+ struct bfq_group *bfqg; -+ struct bfq_entity *entity; -+ -+ lockdep_assert_held(bfqd->queue->queue_lock); -+ -+ bfqg = bfq_find_set_group(bfqd, blkcg); -+ -+ if (unlikely(!bfqg)) -+ bfqg = bfqd->root_group; -+ -+ if (async_bfqq) { -+ entity = &async_bfqq->entity; -+ -+ if (entity->sched_data != &bfqg->sched_data) { -+ bic_set_bfqq(bic, NULL, 0); -+ bfq_log_bfqq(bfqd, async_bfqq, -+ "bic_change_group: %p %d", -+ async_bfqq, -+ async_bfqq->ref); -+ bfq_put_queue(async_bfqq); -+ } -+ } -+ -+ if (sync_bfqq) { -+ entity = &sync_bfqq->entity; -+ if (entity->sched_data != &bfqg->sched_data) -+ bfq_bfqq_move(bfqd, sync_bfqq, bfqg); -+ } -+ -+ return bfqg; -+} -+ -+static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) -+{ -+ struct bfq_data *bfqd = bic_to_bfqd(bic); -+ struct bfq_group *bfqg = NULL; -+ uint64_t serial_nr; -+ -+ rcu_read_lock(); -+ serial_nr = bio_blkcg(bio)->css.serial_nr; -+ -+ /* -+ * Check whether blkcg has changed. The condition may trigger -+ * spuriously on a newly created cic but there's no harm. -+ */ -+ if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr)) -+ goto out; -+ -+ bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio)); -+ bic->blkcg_serial_nr = serial_nr; -+out: -+ rcu_read_unlock(); -+} -+ -+/** -+ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st. -+ * @st: the service tree being flushed. -+ */ -+static void bfq_flush_idle_tree(struct bfq_service_tree *st) -+{ -+ struct bfq_entity *entity = st->first_idle; -+ -+ for (; entity ; entity = st->first_idle) -+ __bfq_deactivate_entity(entity, false); -+} -+ -+/** -+ * bfq_reparent_leaf_entity - move leaf entity to the root_group. -+ * @bfqd: the device data structure with the root group. 
-+ * @entity: the entity to move. -+ */ -+static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, -+ struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ BUG_ON(!bfqq); -+ bfq_bfqq_move(bfqd, bfqq, bfqd->root_group); -+} -+ -+/** -+ * bfq_reparent_active_entities - move to the root group all active -+ * entities. -+ * @bfqd: the device data structure with the root group. -+ * @bfqg: the group to move from. -+ * @st: the service tree with the entities. -+ * -+ * Needs queue_lock to be taken and reference to be valid over the call. -+ */ -+static void bfq_reparent_active_entities(struct bfq_data *bfqd, -+ struct bfq_group *bfqg, -+ struct bfq_service_tree *st) -+{ -+ struct rb_root *active = &st->active; -+ struct bfq_entity *entity = NULL; -+ -+ if (!RB_EMPTY_ROOT(&st->active)) -+ entity = bfq_entity_of(rb_first(active)); -+ -+ for (; entity ; entity = bfq_entity_of(rb_first(active))) -+ bfq_reparent_leaf_entity(bfqd, entity); -+ -+ if (bfqg->sched_data.in_service_entity) -+ bfq_reparent_leaf_entity(bfqd, -+ bfqg->sched_data.in_service_entity); -+} -+ -+/** -+ * bfq_pd_offline - deactivate the entity associated with @pd, -+ * and reparent its children entities. -+ * @pd: descriptor of the policy going offline. -+ * -+ * blkio already grabs the queue_lock for us, so no need to use -+ * RCU-based magic -+ */ -+static void bfq_pd_offline(struct blkg_policy_data *pd) -+{ -+ struct bfq_service_tree *st; -+ struct bfq_group *bfqg; -+ struct bfq_data *bfqd; -+ struct bfq_entity *entity; -+ int i; -+ -+ BUG_ON(!pd); -+ bfqg = pd_to_bfqg(pd); -+ BUG_ON(!bfqg); -+ bfqd = bfqg->bfqd; -+ BUG_ON(bfqd && !bfqd->root_group); -+ -+ entity = bfqg->my_entity; -+ -+ if (!entity) /* root group */ -+ return; -+ -+ /* -+ * Empty all service_trees belonging to this group before -+ * deactivating the group itself. 
-+ */ -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) { -+ BUG_ON(!bfqg->sched_data.service_tree); -+ st = bfqg->sched_data.service_tree + i; -+ /* -+ * The idle tree may still contain bfq_queues belonging -+ * to exited task because they never migrated to a different -+ * cgroup from the one being destroyed now. No one else -+ * can access them so it's safe to act without any lock. -+ */ -+ bfq_flush_idle_tree(st); -+ -+ /* -+ * It may happen that some queues are still active -+ * (busy) upon group destruction (if the corresponding -+ * processes have been forced to terminate). We move -+ * all the leaf entities corresponding to these queues -+ * to the root_group. -+ * Also, it may happen that the group has an entity -+ * in service, which is disconnected from the active -+ * tree: it must be moved, too. -+ * There is no need to put the sync queues, as the -+ * scheduler has taken no reference. -+ */ -+ bfq_reparent_active_entities(bfqd, bfqg, st); -+ BUG_ON(!RB_EMPTY_ROOT(&st->active)); -+ BUG_ON(!RB_EMPTY_ROOT(&st->idle)); -+ } -+ BUG_ON(bfqg->sched_data.next_in_service); -+ BUG_ON(bfqg->sched_data.in_service_entity); -+ -+ __bfq_deactivate_entity(entity, false); -+ bfq_put_async_queues(bfqd, bfqg); -+ -+ /* -+ * @blkg is going offline and will be ignored by -+ * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so -+ * that they don't get lost. If IOs complete after this point, the -+ * stats for them will be lost. Oh well... 
-+ */ -+ bfqg_stats_xfer_dead(bfqg); -+} -+ -+static void bfq_end_wr_async(struct bfq_data *bfqd) -+{ -+ struct blkcg_gq *blkg; -+ -+ list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) { -+ struct bfq_group *bfqg = blkg_to_bfqg(blkg); -+ BUG_ON(!bfqg); -+ -+ bfq_end_wr_async_queues(bfqd, bfqg); -+ } -+ bfq_end_wr_async_queues(bfqd, bfqd->root_group); -+} -+ -+static int bfq_io_show_weight(struct seq_file *sf, void *v) -+{ -+ struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); -+ struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); -+ unsigned int val = 0; -+ -+ if (bfqgd) -+ val = bfqgd->weight; -+ -+ seq_printf(sf, "%u\n", val); -+ -+ return 0; -+} -+ -+static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css, -+ struct cftype *cftype, -+ u64 val) -+{ -+ struct blkcg *blkcg = css_to_blkcg(css); -+ struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); -+ struct blkcg_gq *blkg; -+ int ret = -ERANGE; -+ -+ if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT) -+ return ret; -+ -+ ret = 0; -+ spin_lock_irq(&blkcg->lock); -+ bfqgd->weight = (unsigned short)val; -+ hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { -+ struct bfq_group *bfqg = blkg_to_bfqg(blkg); -+ -+ if (!bfqg) -+ continue; -+ /* -+ * Setting the prio_changed flag of the entity -+ * to 1 with new_weight == weight would re-set -+ * the value of the weight to its ioprio mapping. -+ * Set the flag only if necessary. -+ */ -+ if ((unsigned short)val != bfqg->entity.new_weight) { -+ bfqg->entity.new_weight = (unsigned short)val; -+ /* -+ * Make sure that the above new value has been -+ * stored in bfqg->entity.new_weight before -+ * setting the prio_changed flag. 
In fact, -+ * this flag may be read asynchronously (in -+ * critical sections protected by a different -+ * lock than that held here), and finding this -+ * flag set may cause the execution of the code -+ * for updating parameters whose value may -+ * depend also on bfqg->entity.new_weight (in -+ * __bfq_entity_update_weight_prio). -+ * This barrier makes sure that the new value -+ * of bfqg->entity.new_weight is correctly -+ * seen in that code. -+ */ -+ smp_wmb(); -+ bfqg->entity.prio_changed = 1; -+ } -+ } -+ spin_unlock_irq(&blkcg->lock); -+ -+ return ret; -+} -+ -+static ssize_t bfq_io_set_weight(struct kernfs_open_file *of, -+ char *buf, size_t nbytes, -+ loff_t off) -+{ -+ u64 weight; -+ /* First unsigned long found in the file is used */ -+ int ret = kstrtoull(strim(buf), 0, &weight); -+ -+ if (ret) -+ return ret; -+ -+ return bfq_io_set_weight_legacy(of_css(of), NULL, weight); -+} -+ -+static int bfqg_print_stat(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, -+ &blkcg_policy_bfq, seq_cft(sf)->private, false); -+ return 0; -+} -+ -+static int bfqg_print_rwstat(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, -+ &blkcg_policy_bfq, seq_cft(sf)->private, true); -+ return 0; -+} -+ -+static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, -+ struct blkg_policy_data *pd, int off) -+{ -+ u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd), -+ &blkcg_policy_bfq, off); -+ return __blkg_prfill_u64(sf, pd, sum); -+} -+ -+static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, -+ struct blkg_policy_data *pd, int off) -+{ -+ struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd), -+ &blkcg_policy_bfq, -+ off); -+ return __blkg_prfill_rwstat(sf, pd, &sum); -+} -+ -+static int bfqg_print_stat_recursive(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), -+ bfqg_prfill_stat_recursive, &blkcg_policy_bfq, -+ 
seq_cft(sf)->private, false); -+ return 0; -+} -+ -+static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), -+ bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq, -+ seq_cft(sf)->private, true); -+ return 0; -+} -+ -+static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd, -+ int off) -+{ -+ u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes); -+ -+ return __blkg_prfill_u64(sf, pd, sum >> 9); -+} -+ -+static int bfqg_print_stat_sectors(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), -+ bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false); -+ return 0; -+} -+ -+static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf, -+ struct blkg_policy_data *pd, int off) -+{ -+ struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL, -+ offsetof(struct blkcg_gq, stat_bytes)); -+ u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + -+ atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); -+ -+ return __blkg_prfill_u64(sf, pd, sum >> 9); -+} -+ -+static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), -+ bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0, -+ false); -+ return 0; -+} -+ -+ -+static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf, -+ struct blkg_policy_data *pd, int off) -+{ -+ struct bfq_group *bfqg = pd_to_bfqg(pd); -+ u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples); -+ u64 v = 0; -+ -+ if (samples) { -+ v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum); -+ v = div64_u64(v, samples); -+ } -+ __blkg_prfill_u64(sf, pd, v); -+ return 0; -+} -+ -+/* print avg_queue_size */ -+static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), -+ bfqg_prfill_avg_queue_size, &blkcg_policy_bfq, -+ 0, false); -+ return 0; -+} -+ -+static struct bfq_group * 
-+bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) -+{ -+ int ret; -+ -+ ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq); -+ if (ret) -+ return NULL; -+ -+ return blkg_to_bfqg(bfqd->queue->root_blkg); -+} -+ -+static struct cftype bfq_blkcg_legacy_files[] = { -+ { -+ .name = "bfq.weight", -+ .flags = CFTYPE_NOT_ON_ROOT, -+ .seq_show = bfq_io_show_weight, -+ .write_u64 = bfq_io_set_weight_legacy, -+ }, -+ -+ /* statistics, covers only the tasks in the bfqg */ -+ { -+ .name = "bfq.time", -+ .private = offsetof(struct bfq_group, stats.time), -+ .seq_show = bfqg_print_stat, -+ }, -+ { -+ .name = "bfq.sectors", -+ .seq_show = bfqg_print_stat_sectors, -+ }, -+ { -+ .name = "bfq.io_service_bytes", -+ .private = (unsigned long)&blkcg_policy_bfq, -+ .seq_show = blkg_print_stat_bytes, -+ }, -+ { -+ .name = "bfq.io_serviced", -+ .private = (unsigned long)&blkcg_policy_bfq, -+ .seq_show = blkg_print_stat_ios, -+ }, -+ { -+ .name = "bfq.io_service_time", -+ .private = offsetof(struct bfq_group, stats.service_time), -+ .seq_show = bfqg_print_rwstat, -+ }, -+ { -+ .name = "bfq.io_wait_time", -+ .private = offsetof(struct bfq_group, stats.wait_time), -+ .seq_show = bfqg_print_rwstat, -+ }, -+ { -+ .name = "bfq.io_merged", -+ .private = offsetof(struct bfq_group, stats.merged), -+ .seq_show = bfqg_print_rwstat, -+ }, -+ { -+ .name = "bfq.io_queued", -+ .private = offsetof(struct bfq_group, stats.queued), -+ .seq_show = bfqg_print_rwstat, -+ }, -+ -+ /* the same statictics which cover the bfqg and its descendants */ -+ { -+ .name = "bfq.time_recursive", -+ .private = offsetof(struct bfq_group, stats.time), -+ .seq_show = bfqg_print_stat_recursive, -+ }, -+ { -+ .name = "bfq.sectors_recursive", -+ .seq_show = bfqg_print_stat_sectors_recursive, -+ }, -+ { -+ .name = "bfq.io_service_bytes_recursive", -+ .private = (unsigned long)&blkcg_policy_bfq, -+ .seq_show = blkg_print_stat_bytes_recursive, -+ }, -+ { -+ .name = "bfq.io_serviced_recursive", -+ .private = 
(unsigned long)&blkcg_policy_bfq, -+ .seq_show = blkg_print_stat_ios_recursive, -+ }, -+ { -+ .name = "bfq.io_service_time_recursive", -+ .private = offsetof(struct bfq_group, stats.service_time), -+ .seq_show = bfqg_print_rwstat_recursive, -+ }, -+ { -+ .name = "bfq.io_wait_time_recursive", -+ .private = offsetof(struct bfq_group, stats.wait_time), -+ .seq_show = bfqg_print_rwstat_recursive, -+ }, -+ { -+ .name = "bfq.io_merged_recursive", -+ .private = offsetof(struct bfq_group, stats.merged), -+ .seq_show = bfqg_print_rwstat_recursive, -+ }, -+ { -+ .name = "bfq.io_queued_recursive", -+ .private = offsetof(struct bfq_group, stats.queued), -+ .seq_show = bfqg_print_rwstat_recursive, -+ }, -+ { -+ .name = "bfq.avg_queue_size", -+ .seq_show = bfqg_print_avg_queue_size, -+ }, -+ { -+ .name = "bfq.group_wait_time", -+ .private = offsetof(struct bfq_group, stats.group_wait_time), -+ .seq_show = bfqg_print_stat, -+ }, -+ { -+ .name = "bfq.idle_time", -+ .private = offsetof(struct bfq_group, stats.idle_time), -+ .seq_show = bfqg_print_stat, -+ }, -+ { -+ .name = "bfq.empty_time", -+ .private = offsetof(struct bfq_group, stats.empty_time), -+ .seq_show = bfqg_print_stat, -+ }, -+ { -+ .name = "bfq.dequeue", -+ .private = offsetof(struct bfq_group, stats.dequeue), -+ .seq_show = bfqg_print_stat, -+ }, -+ { } /* terminate */ -+}; -+ -+static struct cftype bfq_blkg_files[] = { -+ { -+ .name = "bfq.weight", -+ .flags = CFTYPE_NOT_ON_ROOT, -+ .seq_show = bfq_io_show_weight, -+ .write = bfq_io_set_weight, -+ }, -+ {} /* terminate */ -+}; -+ -+#else /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ -+ -+static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg, -+ struct bfq_queue *bfqq, unsigned int op) { } -+static inline void -+bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { } -+static inline void -+bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { } -+static inline void bfqg_stats_update_completion(struct bfq_group *bfqg, -+ uint64_t 
start_time, uint64_t io_start_time, -+ unsigned int op) { } -+static inline void -+bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, -+ struct bfq_group *curr_bfqg) { } -+static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { } -+static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } -+static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { } -+static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { } -+static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } -+static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { } -+ -+static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct bfq_group *bfqg) {} -+ -+static void bfq_init_entity(struct bfq_entity *entity, -+ struct bfq_group *bfqg) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ entity->weight = entity->new_weight; -+ entity->orig_weight = entity->new_weight; -+ if (bfqq) { -+ bfqq->ioprio = bfqq->new_ioprio; -+ bfqq->ioprio_class = bfqq->new_ioprio_class; -+ } -+ entity->sched_data = &bfqg->sched_data; -+} -+ -+static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) {} -+ -+static void bfq_end_wr_async(struct bfq_data *bfqd) -+{ -+ bfq_end_wr_async_queues(bfqd, bfqd->root_group); -+} -+ -+static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, -+ struct blkcg *blkcg) -+{ -+ return bfqd->root_group; -+} -+ -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) -+{ -+ return bfqq->bfqd->root_group; -+} -+ -+static struct bfq_group * -+bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) -+{ -+ struct bfq_group *bfqg; -+ int i; -+ -+ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); -+ if (!bfqg) -+ return NULL; -+ -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) -+ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; -+ -+ return bfqg; -+} -+#endif -diff --git a/block/bfq-ioc.c 
b/block/bfq-ioc.c -new file mode 100644 -index 000000000000..fb7bb8f08b75 ---- /dev/null -+++ b/block/bfq-ioc.c -@@ -0,0 +1,36 @@ -+/* -+ * BFQ: I/O context handling. -+ * -+ * Based on ideas and code from CFQ: -+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> -+ * -+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> -+ * Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it> -+ */ -+ -+/** -+ * icq_to_bic - convert iocontext queue structure to bfq_io_cq. -+ * @icq: the iocontext queue. -+ */ -+static struct bfq_io_cq *icq_to_bic(struct io_cq *icq) -+{ -+ /* bic->icq is the first member, %NULL will convert to %NULL */ -+ return container_of(icq, struct bfq_io_cq, icq); -+} -+ -+/** -+ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd. -+ * @bfqd: the lookup key. -+ * @ioc: the io_context of the process doing I/O. -+ * -+ * Queue lock must be held. -+ */ -+static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, -+ struct io_context *ioc) -+{ -+ if (ioc) -+ return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue)); -+ return NULL; -+} -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -new file mode 100644 -index 000000000000..ac8991bca9fa ---- /dev/null -+++ b/block/bfq-sched.c -@@ -0,0 +1,2002 @@ -+/* -+ * BFQ: Hierarchical B-WF2Q+ scheduler. -+ * -+ * Based on ideas and code from CFQ: -+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> -+ * -+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> -+ * Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2016 Paolo Valente <paolo.valente@linaro.org> -+ */ -+ -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); -+ -+/** -+ * bfq_gt - compare two timestamps. -+ * @a: first ts. -+ * @b: second ts. -+ * -+ * Return @a > @b, dealing with wrapping correctly. 
-+ */ -+static int bfq_gt(u64 a, u64 b) -+{ -+ return (s64)(a - b) > 0; -+} -+ -+static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree) -+{ -+ struct rb_node *node = tree->rb_node; -+ -+ return rb_entry(node, struct bfq_entity, rb_node); -+} -+ -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd); -+ -+static bool bfq_update_parent_budget(struct bfq_entity *next_in_service); -+ -+/** -+ * bfq_update_next_in_service - update sd->next_in_service -+ * @sd: sched_data for which to perform the update. -+ * @new_entity: if not NULL, pointer to the entity whose activation, -+ * requeueing or repositionig triggered the invocation of -+ * this function. -+ * -+ * This function is called to update sd->next_in_service, which, in -+ * its turn, may change as a consequence of the insertion or -+ * extraction of an entity into/from one of the active trees of -+ * sd. These insertions/extractions occur as a consequence of -+ * activations/deactivations of entities, with some activations being -+ * 'true' activations, and other activations being requeueings (i.e., -+ * implementing the second, requeueing phase of the mechanism used to -+ * reposition an entity in its active tree; see comments on -+ * __bfq_activate_entity and __bfq_requeue_entity for details). In -+ * both the last two activation sub-cases, new_entity points to the -+ * just activated or requeued entity. -+ * -+ * Returns true if sd->next_in_service changes in such a way that -+ * entity->parent may become the next_in_service for its parent -+ * entity. 
-+ */ -+static bool bfq_update_next_in_service(struct bfq_sched_data *sd, -+ struct bfq_entity *new_entity) -+{ -+ struct bfq_entity *next_in_service = sd->next_in_service; -+ struct bfq_queue *bfqq; -+ bool parent_sched_may_change = false; -+ -+ /* -+ * If this update is triggered by the activation, requeueing -+ * or repositiong of an entity that does not coincide with -+ * sd->next_in_service, then a full lookup in the active tree -+ * can be avoided. In fact, it is enough to check whether the -+ * just-modified entity has a higher priority than -+ * sd->next_in_service, or, even if it has the same priority -+ * as sd->next_in_service, is eligible and has a lower virtual -+ * finish time than sd->next_in_service. If this compound -+ * condition holds, then the new entity becomes the new -+ * next_in_service. Otherwise no change is needed. -+ */ -+ if (new_entity && new_entity != sd->next_in_service) { -+ /* -+ * Flag used to decide whether to replace -+ * sd->next_in_service with new_entity. Tentatively -+ * set to true, and left as true if -+ * sd->next_in_service is NULL. -+ */ -+ bool replace_next = true; -+ -+ /* -+ * If there is already a next_in_service candidate -+ * entity, then compare class priorities or timestamps -+ * to decide whether to replace sd->service_tree with -+ * new_entity. -+ */ -+ if (next_in_service) { -+ unsigned int new_entity_class_idx = -+ bfq_class_idx(new_entity); -+ struct bfq_service_tree *st = -+ sd->service_tree + new_entity_class_idx; -+ -+ /* -+ * For efficiency, evaluate the most likely -+ * sub-condition first. 
-+ */ -+ replace_next = -+ (new_entity_class_idx == -+ bfq_class_idx(next_in_service) -+ && -+ !bfq_gt(new_entity->start, st->vtime) -+ && -+ bfq_gt(next_in_service->finish, -+ new_entity->finish)) -+ || -+ new_entity_class_idx < -+ bfq_class_idx(next_in_service); -+ } -+ -+ if (replace_next) -+ next_in_service = new_entity; -+ } else /* invoked because of a deactivation: lookup needed */ -+ next_in_service = bfq_lookup_next_entity(sd); -+ -+ if (next_in_service) { -+ parent_sched_may_change = !sd->next_in_service || -+ bfq_update_parent_budget(next_in_service); -+ } -+ -+ sd->next_in_service = next_in_service; -+ -+ if (!next_in_service) -+ return parent_sched_may_change; -+ -+ bfqq = bfq_entity_to_bfqq(next_in_service); -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "update_next_in_service: chosen this queue"); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ else { -+ struct bfq_group *bfqg = -+ container_of(next_in_service, -+ struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "update_next_in_service: chosen this entity"); -+ } -+#endif -+ return parent_sched_may_change; -+} -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+/* both next loops stop at one of the child entities of the root group */ -+#define for_each_entity(entity) \ -+ for (; entity ; entity = entity->parent) -+ -+/* -+ * For each iteration, compute parent in advance, so as to be safe if -+ * entity is deallocated during the iteration. Such a deallocation may -+ * happen as a consequence of a bfq_put_queue that frees the bfq_queue -+ * containing entity. -+ */ -+#define for_each_entity_safe(entity, parent) \ -+ for (; entity && ({ parent = entity->parent; 1; }); entity = parent) -+ -+/* -+ * Returns true if this budget changes may let next_in_service->parent -+ * become the next_in_service entity for its parent entity. 
-+ */ -+static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) -+{ -+ struct bfq_entity *bfqg_entity; -+ struct bfq_group *bfqg; -+ struct bfq_sched_data *group_sd; -+ bool ret = false; -+ -+ BUG_ON(!next_in_service); -+ -+ group_sd = next_in_service->sched_data; -+ -+ bfqg = container_of(group_sd, struct bfq_group, sched_data); -+ /* -+ * bfq_group's my_entity field is not NULL only if the group -+ * is not the root group. We must not touch the root entity -+ * as it must never become an in-service entity. -+ */ -+ bfqg_entity = bfqg->my_entity; -+ if (bfqg_entity) { -+ if (bfqg_entity->budget > next_in_service->budget) -+ ret = true; -+ bfqg_entity->budget = next_in_service->budget; -+ } -+ -+ return ret; -+} -+ -+/* -+ * This function tells whether entity stops being a candidate for next -+ * service, according to the following logic. -+ * -+ * This function is invoked for an entity that is about to be set in -+ * service. If such an entity is a queue, then the entity is no longer -+ * a candidate for next service (i.e, a candidate entity to serve -+ * after the in-service entity is expired). The function then returns -+ * true. -+ * -+ * In contrast, the entity could stil be a candidate for next service -+ * if it is not a queue, and has more than one child. In fact, even if -+ * one of its children is about to be set in service, other children -+ * may still be the next to serve. As a consequence, a non-queue -+ * entity is not a candidate for next-service only if it has only one -+ * child. And only if this condition holds, then the function returns -+ * true for a non-queue entity. 
-+ */ -+static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) -+{ -+ struct bfq_group *bfqg; -+ -+ if (bfq_entity_to_bfqq(entity)) -+ return true; -+ -+ bfqg = container_of(entity, struct bfq_group, entity); -+ -+ BUG_ON(bfqg == ((struct bfq_data *)(bfqg->bfqd))->root_group); -+ BUG_ON(bfqg->active_entities == 0); -+ if (bfqg->active_entities == 1) -+ return true; -+ -+ return false; -+} -+ -+#else /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ -+#define for_each_entity(entity) \ -+ for (; entity ; entity = NULL) -+ -+#define for_each_entity_safe(entity, parent) \ -+ for (parent = NULL; entity ; entity = parent) -+ -+static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) -+{ -+ return false; -+} -+ -+static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) -+{ -+ return true; -+} -+ -+#endif /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ -+ -+/* -+ * Shift for timestamp calculations. This actually limits the maximum -+ * service allowed in one timestamp delta (small shift values increase it), -+ * the maximum total weight that can be used for the queues in the system -+ * (big shift values increase it), and the period of virtual time -+ * wraparounds. -+ */ -+#define WFQ_SERVICE_SHIFT 22 -+ -+static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = NULL; -+ -+ BUG_ON(!entity); -+ -+ if (!entity->my_sched_data) -+ bfqq = container_of(entity, struct bfq_queue, entity); -+ -+ return bfqq; -+} -+ -+ -+/** -+ * bfq_delta - map service into the virtual time domain. -+ * @service: amount of service. -+ * @weight: scale factor (weight of an entity or weight sum). -+ */ -+static u64 bfq_delta(unsigned long service, unsigned long weight) -+{ -+ u64 d = (u64)service << WFQ_SERVICE_SHIFT; -+ -+ do_div(d, weight); -+ return d; -+} -+ -+/** -+ * bfq_calc_finish - assign the finish time to an entity. -+ * @entity: the entity to act upon. -+ * @service: the service to be charged to the entity. 
-+ */ -+static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ unsigned long long start, finish, delta; -+ -+ BUG_ON(entity->weight == 0); -+ -+ entity->finish = entity->start + -+ bfq_delta(service, entity->weight); -+ -+ start = ((entity->start>>10)*1000)>>12; -+ finish = ((entity->finish>>10)*1000)>>12; -+ delta = ((bfq_delta(service, entity->weight)>>10)*1000)>>12; -+ -+ if (bfqq) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "calc_finish: serv %lu, w %d", -+ service, entity->weight); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "calc_finish: start %llu, finish %llu, delta %llu", -+ start, finish, delta); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ } else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "calc_finish group: serv %lu, w %d", -+ service, entity->weight); -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "calc_finish group: start %llu, finish %llu, delta %llu", -+ start, finish, delta); -+#endif -+ } -+} -+ -+/** -+ * bfq_entity_of - get an entity from a node. -+ * @node: the node field of the entity. -+ * -+ * Convert a node pointer to the relative entity. This is used only -+ * to simplify the logic of some functions and not as the generic -+ * conversion mechanism because, e.g., in the tree walking functions, -+ * the check for a %NULL value would be redundant. -+ */ -+static struct bfq_entity *bfq_entity_of(struct rb_node *node) -+{ -+ struct bfq_entity *entity = NULL; -+ -+ if (node) -+ entity = rb_entry(node, struct bfq_entity, rb_node); -+ -+ return entity; -+} -+ -+/** -+ * bfq_extract - remove an entity from a tree. -+ * @root: the tree root. -+ * @entity: the entity to remove. 
-+ */ -+static void bfq_extract(struct rb_root *root, struct bfq_entity *entity) -+{ -+ BUG_ON(entity->tree != root); -+ -+ entity->tree = NULL; -+ rb_erase(&entity->rb_node, root); -+} -+ -+/** -+ * bfq_idle_extract - extract an entity from the idle tree. -+ * @st: the service tree of the owning @entity. -+ * @entity: the entity being removed. -+ */ -+static void bfq_idle_extract(struct bfq_service_tree *st, -+ struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ struct rb_node *next; -+ -+ BUG_ON(entity->tree != &st->idle); -+ -+ if (entity == st->first_idle) { -+ next = rb_next(&entity->rb_node); -+ st->first_idle = bfq_entity_of(next); -+ } -+ -+ if (entity == st->last_idle) { -+ next = rb_prev(&entity->rb_node); -+ st->last_idle = bfq_entity_of(next); -+ } -+ -+ bfq_extract(&st->idle, entity); -+ -+ if (bfqq) -+ list_del(&bfqq->bfqq_list); -+} -+ -+/** -+ * bfq_insert - generic tree insertion. -+ * @root: tree root. -+ * @entity: entity to insert. -+ * -+ * This is used for the idle and the active tree, since they are both -+ * ordered by finish time. -+ */ -+static void bfq_insert(struct rb_root *root, struct bfq_entity *entity) -+{ -+ struct bfq_entity *entry; -+ struct rb_node **node = &root->rb_node; -+ struct rb_node *parent = NULL; -+ -+ BUG_ON(entity->tree); -+ -+ while (*node) { -+ parent = *node; -+ entry = rb_entry(parent, struct bfq_entity, rb_node); -+ -+ if (bfq_gt(entry->finish, entity->finish)) -+ node = &parent->rb_left; -+ else -+ node = &parent->rb_right; -+ } -+ -+ rb_link_node(&entity->rb_node, parent, node); -+ rb_insert_color(&entity->rb_node, root); -+ -+ entity->tree = root; -+} -+ -+/** -+ * bfq_update_min - update the min_start field of a entity. -+ * @entity: the entity to update. -+ * @node: one of its children. -+ * -+ * This function is called when @entity may store an invalid value for -+ * min_start due to updates to the active tree. 
The function assumes -+ * that the subtree rooted at @node (which may be its left or its right -+ * child) has a valid min_start value. -+ */ -+static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node) -+{ -+ struct bfq_entity *child; -+ -+ if (node) { -+ child = rb_entry(node, struct bfq_entity, rb_node); -+ if (bfq_gt(entity->min_start, child->min_start)) -+ entity->min_start = child->min_start; -+ } -+} -+ -+/** -+ * bfq_update_active_node - recalculate min_start. -+ * @node: the node to update. -+ * -+ * @node may have changed position or one of its children may have moved, -+ * this function updates its min_start value. The left and right subtrees -+ * are assumed to hold a correct min_start value. -+ */ -+static void bfq_update_active_node(struct rb_node *node) -+{ -+ struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node); -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ entity->min_start = entity->start; -+ bfq_update_min(entity, node->rb_right); -+ bfq_update_min(entity, node->rb_left); -+ -+ if (bfqq) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "update_active_node: new min_start %llu", -+ ((entity->min_start>>10)*1000)>>12); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ } else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "update_active_node: new min_start %llu", -+ ((entity->min_start>>10)*1000)>>12); -+#endif -+ } -+} -+ -+/** -+ * bfq_update_active_tree - update min_start for the whole active tree. -+ * @node: the starting node. -+ * -+ * @node must be the deepest modified node after an update. This function -+ * updates its min_start using the values held by its children, assuming -+ * that they did not change, and then updates all the nodes that may have -+ * changed in the path to the root. The only nodes that may have changed -+ * are the ones in the path or their siblings. 
-+ */ -+static void bfq_update_active_tree(struct rb_node *node) -+{ -+ struct rb_node *parent; -+ -+up: -+ bfq_update_active_node(node); -+ -+ parent = rb_parent(node); -+ if (!parent) -+ return; -+ -+ if (node == parent->rb_left && parent->rb_right) -+ bfq_update_active_node(parent->rb_right); -+ else if (parent->rb_left) -+ bfq_update_active_node(parent->rb_left); -+ -+ node = parent; -+ goto up; -+} -+ -+static void bfq_weights_tree_add(struct bfq_data *bfqd, -+ struct bfq_entity *entity, -+ struct rb_root *root); -+ -+static void bfq_weights_tree_remove(struct bfq_data *bfqd, -+ struct bfq_entity *entity, -+ struct rb_root *root); -+ -+ -+/** -+ * bfq_active_insert - insert an entity in the active tree of its -+ * group/device. -+ * @st: the service tree of the entity. -+ * @entity: the entity being inserted. -+ * -+ * The active tree is ordered by finish time, but an extra key is kept -+ * per each node, containing the minimum value for the start times of -+ * its children (and the node itself), so it's possible to search for -+ * the eligible node with the lowest finish time in logarithmic time. 
-+ */ -+static void bfq_active_insert(struct bfq_service_tree *st, -+ struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ struct rb_node *node = &entity->rb_node; -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ struct bfq_sched_data *sd = NULL; -+ struct bfq_group *bfqg = NULL; -+ struct bfq_data *bfqd = NULL; -+#endif -+ -+ bfq_insert(&st->active, entity); -+ -+ if (node->rb_left) -+ node = node->rb_left; -+ else if (node->rb_right) -+ node = node->rb_right; -+ -+ bfq_update_active_tree(node); -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ sd = entity->sched_data; -+ bfqg = container_of(sd, struct bfq_group, sched_data); -+ BUG_ON(!bfqg); -+ bfqd = (struct bfq_data *)bfqg->bfqd; -+#endif -+ if (bfqq) -+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ else { /* bfq_group */ -+ BUG_ON(!bfqd); -+ bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree); -+ } -+ if (bfqg != bfqd->root_group) { -+ BUG_ON(!bfqg); -+ BUG_ON(!bfqd); -+ bfqg->active_entities++; -+ } -+#endif -+} -+ -+/** -+ * bfq_ioprio_to_weight - calc a weight from an ioprio. -+ * @ioprio: the ioprio value to convert. -+ */ -+static unsigned short bfq_ioprio_to_weight(int ioprio) -+{ -+ BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); -+ return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF; -+} -+ -+/** -+ * bfq_weight_to_ioprio - calc an ioprio from a weight. -+ * @weight: the weight value to convert. -+ * -+ * To preserve as much as possible the old only-ioprio user interface, -+ * 0 is used as an escape ioprio value for weights (numerically) equal or -+ * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF. -+ */ -+static unsigned short bfq_weight_to_ioprio(int weight) -+{ -+ BUG_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT); -+ return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight < 0 ? 
-+ 0 : IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight; -+} -+ -+static void bfq_get_entity(struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ if (bfqq) { -+ bfqq->ref++; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", -+ bfqq, bfqq->ref); -+ } -+} -+ -+/** -+ * bfq_find_deepest - find the deepest node that an extraction can modify. -+ * @node: the node being removed. -+ * -+ * Do the first step of an extraction in an rb tree, looking for the -+ * node that will replace @node, and returning the deepest node that -+ * the following modifications to the tree can touch. If @node is the -+ * last node in the tree return %NULL. -+ */ -+static struct rb_node *bfq_find_deepest(struct rb_node *node) -+{ -+ struct rb_node *deepest; -+ -+ if (!node->rb_right && !node->rb_left) -+ deepest = rb_parent(node); -+ else if (!node->rb_right) -+ deepest = node->rb_left; -+ else if (!node->rb_left) -+ deepest = node->rb_right; -+ else { -+ deepest = rb_next(node); -+ if (deepest->rb_right) -+ deepest = deepest->rb_right; -+ else if (rb_parent(deepest) != node) -+ deepest = rb_parent(deepest); -+ } -+ -+ return deepest; -+} -+ -+/** -+ * bfq_active_extract - remove an entity from the active tree. -+ * @st: the service_tree containing the tree. -+ * @entity: the entity being removed. 
-+ */ -+static void bfq_active_extract(struct bfq_service_tree *st, -+ struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ struct rb_node *node; -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ struct bfq_sched_data *sd = NULL; -+ struct bfq_group *bfqg = NULL; -+ struct bfq_data *bfqd = NULL; -+#endif -+ -+ node = bfq_find_deepest(&entity->rb_node); -+ bfq_extract(&st->active, entity); -+ -+ if (node) -+ bfq_update_active_tree(node); -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ sd = entity->sched_data; -+ bfqg = container_of(sd, struct bfq_group, sched_data); -+ BUG_ON(!bfqg); -+ bfqd = (struct bfq_data *)bfqg->bfqd; -+#endif -+ if (bfqq) -+ list_del(&bfqq->bfqq_list); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ else { /* bfq_group */ -+ BUG_ON(!bfqd); -+ bfq_weights_tree_remove(bfqd, entity, -+ &bfqd->group_weights_tree); -+ } -+ if (bfqg != bfqd->root_group) { -+ BUG_ON(!bfqg); -+ BUG_ON(!bfqd); -+ BUG_ON(!bfqg->active_entities); -+ bfqg->active_entities--; -+ } -+#endif -+} -+ -+/** -+ * bfq_idle_insert - insert an entity into the idle tree. -+ * @st: the service tree containing the tree. -+ * @entity: the entity to insert. -+ */ -+static void bfq_idle_insert(struct bfq_service_tree *st, -+ struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ struct bfq_entity *first_idle = st->first_idle; -+ struct bfq_entity *last_idle = st->last_idle; -+ -+ if (!first_idle || bfq_gt(first_idle->finish, entity->finish)) -+ st->first_idle = entity; -+ if (!last_idle || bfq_gt(entity->finish, last_idle->finish)) -+ st->last_idle = entity; -+ -+ bfq_insert(&st->idle, entity); -+ -+ if (bfqq) -+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list); -+} -+ -+/** -+ * bfq_forget_entity - do not consider entity any longer for scheduling -+ * @st: the service tree. -+ * @entity: the entity being removed. -+ * @is_in_service: true if entity is currently the in-service entity. -+ * -+ * Forget everything about @entity. 
In addition, if entity represents -+ * a queue, and the latter is not in service, then release the service -+ * reference to the queue (the one taken through bfq_get_entity). In -+ * fact, in this case, there is really no more service reference to -+ * the queue, as the latter is also outside any service tree. If, -+ * instead, the queue is in service, then __bfq_bfqd_reset_in_service -+ * will take care of putting the reference when the queue finally -+ * stops being served. -+ */ -+static void bfq_forget_entity(struct bfq_service_tree *st, -+ struct bfq_entity *entity, -+ bool is_in_service) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ BUG_ON(!entity->on_st); -+ -+ entity->on_st = false; -+ st->wsum -= entity->weight; -+ if (bfqq && !is_in_service) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity (before): %p %d", -+ bfqq, bfqq->ref); -+ bfq_put_queue(bfqq); -+ } -+} -+ -+/** -+ * bfq_put_idle_entity - release the idle tree ref of an entity. -+ * @st: service tree for the entity. -+ * @entity: the entity being released. -+ */ -+static void bfq_put_idle_entity(struct bfq_service_tree *st, -+ struct bfq_entity *entity) -+{ -+ bfq_idle_extract(st, entity); -+ bfq_forget_entity(st, entity, -+ entity == entity->sched_data->in_service_entity); -+} -+ -+/** -+ * bfq_forget_idle - update the idle tree if necessary. -+ * @st: the service tree to act upon. -+ * -+ * To preserve the global O(log N) complexity we only remove one entry here; -+ * as the idle tree will not grow indefinitely this can be done safely. -+ */ -+static void bfq_forget_idle(struct bfq_service_tree *st) -+{ -+ struct bfq_entity *first_idle = st->first_idle; -+ struct bfq_entity *last_idle = st->last_idle; -+ -+ if (RB_EMPTY_ROOT(&st->active) && last_idle && -+ !bfq_gt(last_idle->finish, st->vtime)) { -+ /* -+ * Forget the whole idle tree, increasing the vtime past -+ * the last finish time of idle entities. 
-+ */ -+ st->vtime = last_idle->finish; -+ } -+ -+ if (first_idle && !bfq_gt(first_idle->finish, st->vtime)) -+ bfq_put_idle_entity(st, first_idle); -+} -+ -+/* -+ * Update weight and priority of entity. If update_class_too is true, -+ * then update the ioprio_class of entity too. -+ * -+ * The reason why the update of ioprio_class is controlled through the -+ * last parameter is as follows. Changing the ioprio class of an -+ * entity implies changing the destination service trees for that -+ * entity. If such a change occurred when the entity is already on one -+ * of the service trees for its previous class, then the state of the -+ * entity would become more complex: none of the new possible service -+ * trees for the entity, according to bfq_entity_service_tree(), would -+ * match any of the possible service trees on which the entity -+ * is. Complex operations involving these trees, such as entity -+ * activations and deactivations, should take into account this -+ * additional complexity. To avoid this issue, this function is -+ * invoked with update_class_too unset in the points in the code where -+ * entity may happen to be on some tree. 
-+ */ -+static struct bfq_service_tree * -+__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, -+ struct bfq_entity *entity, -+ bool update_class_too) -+{ -+ struct bfq_service_tree *new_st = old_st; -+ -+ if (entity->prio_changed) { -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ unsigned int prev_weight, new_weight; -+ struct bfq_data *bfqd = NULL; -+ struct rb_root *root; -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ struct bfq_sched_data *sd; -+ struct bfq_group *bfqg; -+#endif -+ -+ if (bfqq) -+ bfqd = bfqq->bfqd; -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ else { -+ sd = entity->my_sched_data; -+ bfqg = container_of(sd, struct bfq_group, sched_data); -+ BUG_ON(!bfqg); -+ bfqd = (struct bfq_data *)bfqg->bfqd; -+ BUG_ON(!bfqd); -+ } -+#endif -+ -+ BUG_ON(old_st->wsum < entity->weight); -+ old_st->wsum -= entity->weight; -+ -+ if (entity->new_weight != entity->orig_weight) { -+ if (entity->new_weight < BFQ_MIN_WEIGHT || -+ entity->new_weight > BFQ_MAX_WEIGHT) { -+ pr_crit("update_weight_prio: new_weight %d\n", -+ entity->new_weight); -+ if (entity->new_weight < BFQ_MIN_WEIGHT) -+ entity->new_weight = BFQ_MIN_WEIGHT; -+ else -+ entity->new_weight = BFQ_MAX_WEIGHT; -+ } -+ entity->orig_weight = entity->new_weight; -+ if (bfqq) -+ bfqq->ioprio = -+ bfq_weight_to_ioprio(entity->orig_weight); -+ } -+ -+ if (bfqq && update_class_too) -+ bfqq->ioprio_class = bfqq->new_ioprio_class; -+ -+ /* -+ * Reset prio_changed only if the ioprio_class change -+ * is not pending any longer. -+ */ -+ if (!bfqq || bfqq->ioprio_class == bfqq->new_ioprio_class) -+ entity->prio_changed = 0; -+ -+ /* -+ * NOTE: here we may be changing the weight too early, -+ * this will cause unfairness. The correct approach -+ * would have required additional complexity to defer -+ * weight changes to the proper time instants (i.e., -+ * when entity->finish <= old_st->vtime). 
-+ */ -+ new_st = bfq_entity_service_tree(entity); -+ -+ prev_weight = entity->weight; -+ new_weight = entity->orig_weight * -+ (bfqq ? bfqq->wr_coeff : 1); -+ /* -+ * If the weight of the entity changes, remove the entity -+ * from its old weight counter (if there is a counter -+ * associated with the entity), and add it to the counter -+ * associated with its new weight. -+ */ -+ if (prev_weight != new_weight) { -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "weight changed %d %d(%d %d)", -+ prev_weight, new_weight, -+ entity->orig_weight, -+ bfqq->wr_coeff); -+ -+ root = bfqq ? &bfqd->queue_weights_tree : -+ &bfqd->group_weights_tree; -+ bfq_weights_tree_remove(bfqd, entity, root); -+ } -+ entity->weight = new_weight; -+ /* -+ * Add the entity to its weights tree only if it is -+ * not associated with a weight-raised queue. -+ */ -+ if (prev_weight != new_weight && -+ (bfqq ? bfqq->wr_coeff == 1 : 1)) -+ /* If we get here, root has been initialized. */ -+ bfq_weights_tree_add(bfqd, entity, root); -+ -+ new_st->wsum += entity->weight; -+ -+ if (new_st != old_st) -+ entity->start = new_st->vtime; -+ } -+ -+ return new_st; -+} -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg); -+#endif -+ -+/** -+ * bfq_bfqq_served - update the scheduler status after selection for -+ * service. -+ * @bfqq: the queue being served. -+ * @served: bytes to transfer. -+ * -+ * NOTE: this can be optimized, as the timestamps of upper level entities -+ * are synchronized every time a new bfqq is selected for service. By now, -+ * we keep it to better check consistency. 
-+ */ -+static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ struct bfq_service_tree *st; -+ -+ for_each_entity(entity) { -+ st = bfq_entity_service_tree(entity); -+ -+ entity->service += served; -+ -+ BUG_ON(st->wsum == 0); -+ -+ st->vtime += bfq_delta(served, st->wsum); -+ bfq_forget_idle(st); -+ } -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ bfqg_stats_set_start_empty_time(bfqq_group(bfqq)); -+#endif -+ st = bfq_entity_service_tree(&bfqq->entity); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs, vtime %llu on %p", -+ served, ((st->vtime>>10)*1000)>>12, st); -+} -+ -+/** -+ * bfq_bfqq_charge_time - charge an amount of service equivalent to the length -+ * of the time interval during which bfqq has been in -+ * service. -+ * @bfqd: the device -+ * @bfqq: the queue that needs a service update. -+ * @time_ms: the amount of time during which the queue has received service -+ * -+ * If a queue does not consume its budget fast enough, then providing -+ * the queue with service fairness may impair throughput, more or less -+ * severely. For this reason, queues that consume their budget slowly -+ * are provided with time fairness instead of service fairness. This -+ * goal is achieved through the BFQ scheduling engine, even if such an -+ * engine works in the service, and not in the time domain. The trick -+ * is charging these queues with an inflated amount of service, equal -+ * to the amount of service that they would have received during their -+ * service slot if they had been fast, i.e., if their requests had -+ * been dispatched at a rate equal to the estimated peak rate. -+ * -+ * It is worth noting that time fairness can cause important -+ * distortions in terms of bandwidth distribution, on devices with -+ * internal queueing. 
The reason is that I/O requests dispatched -+ * during the service slot of a queue may be served after that service -+ * slot is finished, and may have a total processing time loosely -+ * correlated with the duration of the service slot. This is -+ * especially true for short service slots. -+ */ -+static void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ unsigned long time_ms) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ int tot_serv_to_charge = entity->service; -+ unsigned int timeout_ms = jiffies_to_msecs(bfq_timeout); -+ -+ if (time_ms > 0 && time_ms < timeout_ms) -+ tot_serv_to_charge = -+ (bfqd->bfq_max_budget * time_ms) / timeout_ms; -+ -+ if (tot_serv_to_charge < entity->service) -+ tot_serv_to_charge = entity->service; -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "charge_time: %lu/%u ms, %d/%d/%d sectors", -+ time_ms, timeout_ms, entity->service, -+ tot_serv_to_charge, entity->budget); -+ -+ /* Increase budget to avoid inconsistencies */ -+ if (tot_serv_to_charge > entity->budget) -+ entity->budget = tot_serv_to_charge; -+ -+ bfq_bfqq_served(bfqq, -+ max_t(int, 0, tot_serv_to_charge - entity->service)); -+} -+ -+static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, -+ struct bfq_service_tree *st, -+ bool backshifted) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ struct bfq_sched_data *sd = entity->sched_data; -+ -+ /* -+ * When this function is invoked, entity is not in any service -+ * tree, then it is safe to invoke next function with the last -+ * parameter set (see the comments on the function). -+ */ -+ st = __bfq_entity_update_weight_prio(st, entity, true); -+ bfq_calc_finish(entity, entity->budget); -+ -+ /* -+ * If some queues enjoy backshifting for a while, then their -+ * (virtual) finish timestamps may happen to become lower and -+ * lower than the system virtual time. 
In particular, if -+ * these queues often happen to be idle for short time -+ * periods, and during such time periods other queues with -+ * higher timestamps happen to be busy, then the backshifted -+ * timestamps of the former queues can become much lower than -+ * the system virtual time. In fact, to serve the queues with -+ * higher timestamps while the ones with lower timestamps are -+ * idle, the system virtual time may be pushed-up to much -+ * higher values than the finish timestamps of the idle -+ * queues. As a consequence, the finish timestamps of all new -+ * or newly activated queues may end up being much larger than -+ * those of lucky queues with backshifted timestamps. The -+ * latter queues may then monopolize the device for a lot of -+ * time. This would simply break service guarantees. -+ * -+ * To reduce this problem, push up a little bit the -+ * backshifted timestamps of the queue associated with this -+ * entity (only a queue can happen to have the backshifted -+ * flag set): just enough to let the finish timestamp of the -+ * queue be equal to the current value of the system virtual -+ * time. This may introduce a little unfairness among queues -+ * with backshifted timestamps, but it does not break -+ * worst-case fairness guarantees. -+ * -+ * As a special case, if bfqq is weight-raised, push up -+ * timestamps much less, to keep very low the probability that -+ * this push up causes the backshifted finish timestamps of -+ * weight-raised queues to become higher than the backshifted -+ * finish timestamps of non weight-raised queues. 
-+ */ -+ if (backshifted && bfq_gt(st->vtime, entity->finish)) { -+ unsigned long delta = st->vtime - entity->finish; -+ -+ if (bfqq) -+ delta /= bfqq->wr_coeff; -+ -+ entity->start += delta; -+ entity->finish += delta; -+ -+ if (bfqq) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "__activate_entity: new queue finish %llu", -+ ((entity->finish>>10)*1000)>>12); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ } else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "__activate_entity: new group finish %llu", -+ ((entity->finish>>10)*1000)>>12); -+#endif -+ } -+ } -+ -+ bfq_active_insert(st, entity); -+ -+ if (bfqq) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "__activate_entity: queue %seligible in st %p", -+ entity->start <= st->vtime ? "" : "non ", st); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ } else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "__activate_entity: group %seligible in st %p", -+ entity->start <= st->vtime ? "" : "non ", st); -+#endif -+ } -+ BUG_ON(RB_EMPTY_ROOT(&st->active)); -+ BUG_ON(&st->active != &sd->service_tree->active && -+ &st->active != &(sd->service_tree+1)->active && -+ &st->active != &(sd->service_tree+2)->active); -+} -+ -+/** -+ * __bfq_activate_entity - handle activation of entity. -+ * @entity: the entity being activated. -+ * @non_blocking_wait_rq: true if entity was waiting for a request -+ * -+ * Called for a 'true' activation, i.e., if entity is not active and -+ * one of its children receives a new request. -+ * -+ * Basically, this function updates the timestamps of entity and -+ * inserts entity into its active tree, ater possible extracting it -+ * from its idle tree. 
-+ */ -+static void __bfq_activate_entity(struct bfq_entity *entity, -+ bool non_blocking_wait_rq) -+{ -+ struct bfq_sched_data *sd = entity->sched_data; -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity); -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ bool backshifted = false; -+ unsigned long long min_vstart; -+ -+ BUG_ON(!sd); -+ BUG_ON(!st); -+ -+ /* See comments on bfq_fqq_update_budg_for_activation */ -+ if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) { -+ backshifted = true; -+ min_vstart = entity->finish; -+ } else -+ min_vstart = st->vtime; -+ -+ if (entity->tree == &st->idle) { -+ /* -+ * Must be on the idle tree, bfq_idle_extract() will -+ * check for that. -+ */ -+ bfq_idle_extract(st, entity); -+ entity->start = bfq_gt(min_vstart, entity->finish) ? -+ min_vstart : entity->finish; -+ } else { -+ /* -+ * The finish time of the entity may be invalid, and -+ * it is in the past for sure, otherwise the queue -+ * would have been on the idle tree. -+ */ -+ entity->start = min_vstart; -+ st->wsum += entity->weight; -+ /* -+ * entity is about to be inserted into a service tree, -+ * and then set in service: get a reference to make -+ * sure entity does not disappear until it is no -+ * longer in service or scheduled for service. -+ */ -+ bfq_get_entity(entity); -+ -+ BUG_ON(entity->on_st && bfqq); -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ if (entity->on_st && !bfqq) { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, -+ entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, -+ bfqg, -+ "activate bug, class %d in_service %p", -+ bfq_class_idx(entity), sd->in_service_entity); -+ } -+#endif -+ BUG_ON(entity->on_st && !bfqq); -+ entity->on_st = true; -+ } -+ -+ bfq_update_fin_time_enqueue(entity, st, backshifted); -+} -+ -+/** -+ * __bfq_requeue_entity - handle requeueing or repositioning of an entity. -+ * @entity: the entity being requeued or repositioned. 
-+ * -+ * Requeueing is needed if this entity stops being served, which -+ * happens if a leaf descendant entity has expired. On the other hand, -+ * repositioning is needed if the next_inservice_entity for the child -+ * entity has changed. See the comments inside the function for -+ * details. -+ * -+ * Basically, this function: 1) removes entity from its active tree if -+ * present there, 2) updates the timestamps of entity and 3) inserts -+ * entity back into its active tree (in the new, right position for -+ * the new values of the timestamps). -+ */ -+static void __bfq_requeue_entity(struct bfq_entity *entity) -+{ -+ struct bfq_sched_data *sd = entity->sched_data; -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity); -+ -+ BUG_ON(!sd); -+ BUG_ON(!st); -+ -+ BUG_ON(entity != sd->in_service_entity && -+ entity->tree != &st->active); -+ -+ if (entity == sd->in_service_entity) { -+ /* -+ * We are requeueing the current in-service entity, -+ * which may have to be done for one of the following -+ * reasons: -+ * - entity represents the in-service queue, and the -+ * in-service queue is being requeued after an -+ * expiration; -+ * - entity represents a group, and its budget has -+ * changed because one of its child entities has -+ * just been either activated or requeued for some -+ * reason; the timestamps of the entity need then to -+ * be updated, and the entity needs to be enqueued -+ * or repositioned accordingly. -+ * -+ * In particular, before requeueing, the start time of -+ * the entity must be moved forward to account for the -+ * service that the entity has received while in -+ * service. This is done by the next instructions. The -+ * finish time will then be updated according to this -+ * new value of the start time, and to the budget of -+ * the entity. 
-+ */ -+ bfq_calc_finish(entity, entity->service); -+ entity->start = entity->finish; -+ BUG_ON(entity->tree && entity->tree != &st->active); -+ /* -+ * In addition, if the entity had more than one child -+ * when set in service, then was not extracted from -+ * the active tree. This implies that the position of -+ * the entity in the active tree may need to be -+ * changed now, because we have just updated the start -+ * time of the entity, and we will update its finish -+ * time in a moment (the requeueing is then, more -+ * precisely, a repositioning in this case). To -+ * implement this repositioning, we: 1) dequeue the -+ * entity here, 2) update the finish time and -+ * requeue the entity according to the new -+ * timestamps below. -+ */ -+ if (entity->tree) -+ bfq_active_extract(st, entity); -+ } else { /* The entity is already active, and not in service */ -+ /* -+ * In this case, this function gets called only if the -+ * next_in_service entity below this entity has -+ * changed, and this change has caused the budget of -+ * this entity to change, which, finally implies that -+ * the finish time of this entity must be -+ * updated. Such an update may cause the scheduling, -+ * i.e., the position in the active tree, of this -+ * entity to change. We handle this change by: 1) -+ * dequeueing the entity here, 2) updating the finish -+ * time and requeueing the entity according to the new -+ * timestamps below. This is the same approach as the -+ * non-extracted-entity sub-case above. 
-+ */ -+ bfq_active_extract(st, entity); -+ } -+ -+ bfq_update_fin_time_enqueue(entity, st, false); -+} -+ -+static void __bfq_activate_requeue_entity(struct bfq_entity *entity, -+ struct bfq_sched_data *sd, -+ bool non_blocking_wait_rq) -+{ -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity); -+ -+ if (sd->in_service_entity == entity || entity->tree == &st->active) -+ /* -+ * in service or already queued on the active tree, -+ * requeue or reposition -+ */ -+ __bfq_requeue_entity(entity); -+ else -+ /* -+ * Not in service and not queued on its active tree: -+ * the activity is idle and this is a true activation. -+ */ -+ __bfq_activate_entity(entity, non_blocking_wait_rq); -+} -+ -+ -+/** -+ * bfq_activate_entity - activate or requeue an entity representing a bfq_queue, -+ * and activate, requeue or reposition all ancestors -+ * for which such an update becomes necessary. -+ * @entity: the entity to activate. -+ * @non_blocking_wait_rq: true if this entity was waiting for a request -+ * @requeue: true if this is a requeue, which implies that bfqq is -+ * being expired; thus ALL its ancestors stop being served and must -+ * therefore be requeued -+ */ -+static void bfq_activate_requeue_entity(struct bfq_entity *entity, -+ bool non_blocking_wait_rq, -+ bool requeue) -+{ -+ struct bfq_sched_data *sd; -+ -+ for_each_entity(entity) { -+ BUG_ON(!entity); -+ sd = entity->sched_data; -+ __bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq); -+ -+ BUG_ON(RB_EMPTY_ROOT(&sd->service_tree->active) && -+ RB_EMPTY_ROOT(&(sd->service_tree+1)->active) && -+ RB_EMPTY_ROOT(&(sd->service_tree+2)->active)); -+ -+ if (!bfq_update_next_in_service(sd, entity) && !requeue) { -+ BUG_ON(!sd->next_in_service); -+ break; -+ } -+ BUG_ON(!sd->next_in_service); -+ } -+} -+ -+/** -+ * __bfq_deactivate_entity - deactivate an entity from its service tree. -+ * @entity: the entity to deactivate. 
-+ * @ins_into_idle_tree: if false, the entity will not be put into the -+ * idle tree. -+ * -+ * Deactivates an entity, independently from its previous state. Must -+ * be invoked only if entity is on a service tree. Extracts the entity -+ * from that tree, and if necessary and allowed, puts it on the idle -+ * tree. -+ */ -+static bool __bfq_deactivate_entity(struct bfq_entity *entity, -+ bool ins_into_idle_tree) -+{ -+ struct bfq_sched_data *sd = entity->sched_data; -+ struct bfq_service_tree *st; -+ bool is_in_service; -+ -+ if (!entity->on_st) { /* entity never activated, or already inactive */ -+ BUG_ON(sd && entity == sd->in_service_entity); -+ return false; -+ } -+ -+ /* -+ * If we get here, then entity is active, which implies that -+ * bfq_group_set_parent has already been invoked for the group -+ * represented by entity. Therefore, the field -+ * entity->sched_data has been set, and we can safely use it. -+ */ -+ st = bfq_entity_service_tree(entity); -+ is_in_service = entity == sd->in_service_entity; -+ -+ BUG_ON(is_in_service && entity->tree && entity->tree != &st->active); -+ -+ if (is_in_service) -+ bfq_calc_finish(entity, entity->service); -+ -+ if (entity->tree == &st->active) -+ bfq_active_extract(st, entity); -+ else if (!is_in_service && entity->tree == &st->idle) -+ bfq_idle_extract(st, entity); -+ else if (entity->tree) -+ BUG(); -+ -+ if (!ins_into_idle_tree || !bfq_gt(entity->finish, st->vtime)) -+ bfq_forget_entity(st, entity, is_in_service); -+ else -+ bfq_idle_insert(st, entity); -+ -+ return true; -+} -+ -+/** -+ * bfq_deactivate_entity - deactivate an entity representing a bfq_queue. -+ * @entity: the entity to deactivate. 
-+ * @ins_into_idle_tree: true if the entity can be put on the idle tree -+ */ -+static void bfq_deactivate_entity(struct bfq_entity *entity, -+ bool ins_into_idle_tree, -+ bool expiration) -+{ -+ struct bfq_sched_data *sd; -+ struct bfq_entity *parent = NULL; -+ -+ for_each_entity_safe(entity, parent) { -+ sd = entity->sched_data; -+ -+ BUG_ON(sd == NULL); /* -+ * It would mean that this is the -+ * root group. -+ */ -+ -+ BUG_ON(expiration && entity != sd->in_service_entity); -+ -+ BUG_ON(entity != sd->in_service_entity && -+ entity->tree == -+ &bfq_entity_service_tree(entity)->active && -+ !sd->next_in_service); -+ -+ if (!__bfq_deactivate_entity(entity, ins_into_idle_tree)) { -+ /* -+ * entity is not in any tree any more, so -+ * this deactivation is a no-op, and there is -+ * nothing to change for upper-level entities -+ * (in case of expiration, this can never -+ * happen). -+ */ -+ BUG_ON(expiration); /* -+ * entity cannot be already out of -+ * any tree -+ */ -+ return; -+ } -+ -+ if (sd->next_in_service == entity) -+ /* -+ * entity was the next_in_service entity, -+ * then, since entity has just been -+ * deactivated, a new one must be found. -+ */ -+ bfq_update_next_in_service(sd, NULL); -+ -+ if (sd->next_in_service) { -+ /* -+ * The parent entity is still backlogged, -+ * because next_in_service is not NULL. So, no -+ * further upwards deactivation must be -+ * performed. Yet, next_in_service has -+ * changed. Then the schedule does need to be -+ * updated upwards. -+ */ -+ BUG_ON(sd->next_in_service == entity); -+ break; -+ } -+ -+ /* -+ * If we get here, then the parent is no more -+ * backlogged and we need to propagate the -+ * deactivation upwards. Thus let the loop go on. -+ */ -+ -+ /* -+ * Also let parent be queued into the idle tree on -+ * deactivation, to preserve service guarantees, and -+ * assuming that who invoked this function does not -+ * need parent entities too to be removed completely. 
-+ */ -+ ins_into_idle_tree = true; -+ } -+ -+ /* -+ * If the deactivation loop is fully executed, then there are -+ * no more entities to touch and next loop is not executed at -+ * all. Otherwise, requeue remaining entities if they are -+ * about to stop receiving service, or reposition them if this -+ * is not the case. -+ */ -+ entity = parent; -+ for_each_entity(entity) { -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ /* -+ * Invoke __bfq_requeue_entity on entity, even if -+ * already active, to requeue/reposition it in the -+ * active tree (because sd->next_in_service has -+ * changed) -+ */ -+ __bfq_requeue_entity(entity); -+ -+ sd = entity->sched_data; -+ BUG_ON(expiration && sd->in_service_entity != entity); -+ -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "invoking udpdate_next for this queue"); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ else { -+ struct bfq_group *bfqg = -+ container_of(entity, -+ struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "invoking udpdate_next for this entity"); -+ } -+#endif -+ if (!bfq_update_next_in_service(sd, entity) && -+ !expiration) -+ /* -+ * next_in_service unchanged or not causing -+ * any change in entity->parent->sd, and no -+ * requeueing needed for expiration: stop -+ * here. -+ */ -+ break; -+ } -+} -+ -+/** -+ * bfq_calc_vtime_jump - compute the value to which the vtime should jump, -+ * if needed, to have at least one entity eligible. -+ * @st: the service tree to act upon. -+ * -+ * Assumes that st is not empty. 
-+ */ -+static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st) -+{ -+ struct bfq_entity *root_entity = bfq_root_active_entity(&st->active); -+ -+ if (bfq_gt(root_entity->min_start, st->vtime)) { -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(root_entity); -+ -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "calc_vtime_jump: new value %llu", -+ root_entity->min_start); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ else { -+ struct bfq_group *bfqg = -+ container_of(root_entity, struct bfq_group, -+ entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "calc_vtime_jump: new value %llu", -+ root_entity->min_start); -+ } -+#endif -+ return root_entity->min_start; -+ } -+ return st->vtime; -+} -+ -+static void bfq_update_vtime(struct bfq_service_tree *st, u64 new_value) -+{ -+ if (new_value > st->vtime) { -+ st->vtime = new_value; -+ bfq_forget_idle(st); -+ } -+} -+ -+/** -+ * bfq_first_active_entity - find the eligible entity with -+ * the smallest finish time -+ * @st: the service tree to select from. -+ * @vtime: the system virtual to use as a reference for eligibility -+ * -+ * This function searches the first schedulable entity, starting from the -+ * root of the tree and going on the left every time on this side there is -+ * a subtree with at least one eligible (start >= vtime) entity. The path on -+ * the right is followed only if a) the left subtree contains no eligible -+ * entities and b) no eligible entity has been found yet. 
-+ */ -+static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st, -+ u64 vtime) -+{ -+ struct bfq_entity *entry, *first = NULL; -+ struct rb_node *node = st->active.rb_node; -+ -+ while (node) { -+ entry = rb_entry(node, struct bfq_entity, rb_node); -+left: -+ if (!bfq_gt(entry->start, vtime)) -+ first = entry; -+ -+ BUG_ON(bfq_gt(entry->min_start, vtime)); -+ -+ if (node->rb_left) { -+ entry = rb_entry(node->rb_left, -+ struct bfq_entity, rb_node); -+ if (!bfq_gt(entry->min_start, vtime)) { -+ node = node->rb_left; -+ goto left; -+ } -+ } -+ if (first) -+ break; -+ node = node->rb_right; -+ } -+ -+ BUG_ON(!first && !RB_EMPTY_ROOT(&st->active)); -+ return first; -+} -+ -+/** -+ * __bfq_lookup_next_entity - return the first eligible entity in @st. -+ * @st: the service tree. -+ * -+ * If there is no in-service entity for the sched_data st belongs to, -+ * then return the entity that will be set in service if: -+ * 1) the parent entity this st belongs to is set in service; -+ * 2) no entity belonging to such parent entity undergoes a state change -+ * that would influence the timestamps of the entity (e.g., becomes idle, -+ * becomes backlogged, changes its budget, ...). -+ * -+ * In this first case, update the virtual time in @st too (see the -+ * comments on this update inside the function). -+ * -+ * In constrast, if there is an in-service entity, then return the -+ * entity that would be set in service if not only the above -+ * conditions, but also the next one held true: the currently -+ * in-service entity, on expiration, -+ * 1) gets a finish time equal to the current one, or -+ * 2) is not eligible any more, or -+ * 3) is idle. 
-+ */ -+static struct bfq_entity * -+__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service -+#if 0 -+ , bool force -+#endif -+ ) -+{ -+ struct bfq_entity *entity -+#if 0 -+ , *new_next_in_service = NULL -+#endif -+ ; -+ u64 new_vtime; -+ struct bfq_queue *bfqq; -+ -+ if (RB_EMPTY_ROOT(&st->active)) -+ return NULL; -+ -+ /* -+ * Get the value of the system virtual time for which at -+ * least one entity is eligible. -+ */ -+ new_vtime = bfq_calc_vtime_jump(st); -+ -+ /* -+ * If there is no in-service entity for the sched_data this -+ * active tree belongs to, then push the system virtual time -+ * up to the value that guarantees that at least one entity is -+ * eligible. If, instead, there is an in-service entity, then -+ * do not make any such update, because there is already an -+ * eligible entity, namely the in-service one (even if the -+ * entity is not on st, because it was extracted when set in -+ * service). -+ */ -+ if (!in_service) -+ bfq_update_vtime(st, new_vtime); -+ -+ entity = bfq_first_active_entity(st, new_vtime); -+ BUG_ON(bfq_gt(entity->start, new_vtime)); -+ -+ /* Log some information */ -+ bfqq = bfq_entity_to_bfqq(entity); -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "__lookup_next: start %llu vtime %llu st %p", -+ ((entity->start>>10)*1000)>>12, -+ ((new_vtime>>10)*1000)>>12, st); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "__lookup_next: start %llu vtime %llu st %p", -+ ((entity->start>>10)*1000)>>12, -+ ((new_vtime>>10)*1000)>>12, st); -+ } -+#endif -+ -+ BUG_ON(!entity); -+ -+ return entity; -+} -+ -+/** -+ * bfq_lookup_next_entity - return the first eligible entity in @sd. -+ * @sd: the sched_data. -+ * -+ * This function is invoked when there has been a change in the trees -+ * for sd, and we need know what is the new next entity after this -+ * change. 
-+ */ -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd) -+{ -+ struct bfq_service_tree *st = sd->service_tree; -+ struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1); -+ struct bfq_entity *entity = NULL; -+ struct bfq_queue *bfqq; -+ int class_idx = 0; -+ -+ BUG_ON(!sd); -+ BUG_ON(!st); -+ /* -+ * Choose from idle class, if needed to guarantee a minimum -+ * bandwidth to this class (and if there is some active entity -+ * in idle class). This should also mitigate -+ * priority-inversion problems in case a low priority task is -+ * holding file system resources. -+ */ -+ if (time_is_before_jiffies(sd->bfq_class_idle_last_service + -+ BFQ_CL_IDLE_TIMEOUT)) { -+ if (!RB_EMPTY_ROOT(&idle_class_st->active)) -+ class_idx = BFQ_IOPRIO_CLASSES - 1; -+ /* About to be served if backlogged, or not yet backlogged */ -+ sd->bfq_class_idle_last_service = jiffies; -+ } -+ -+ /* -+ * Find the next entity to serve for the highest-priority -+ * class, unless the idle class needs to be served. 
-+ */ -+ for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) { -+ entity = __bfq_lookup_next_entity(st + class_idx, -+ sd->in_service_entity); -+ -+ if (entity) -+ break; -+ } -+ -+ BUG_ON(!entity && -+ (!RB_EMPTY_ROOT(&st->active) || !RB_EMPTY_ROOT(&(st+1)->active) || -+ !RB_EMPTY_ROOT(&(st+2)->active))); -+ -+ if (!entity) -+ return NULL; -+ -+ /* Log some information */ -+ bfqq = bfq_entity_to_bfqq(entity); -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "chosen from st %p %d", -+ st + class_idx, class_idx); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "chosen from st %p %d", -+ st + class_idx, class_idx); -+ } -+#endif -+ -+ return entity; -+} -+ -+static bool next_queue_may_preempt(struct bfq_data *bfqd) -+{ -+ struct bfq_sched_data *sd = &bfqd->root_group->sched_data; -+ -+ return sd->next_in_service != sd->in_service_entity; -+} -+ -+/* -+ * Get next queue for service. -+ */ -+static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) -+{ -+ struct bfq_entity *entity = NULL; -+ struct bfq_sched_data *sd; -+ struct bfq_queue *bfqq; -+ -+ BUG_ON(bfqd->in_service_queue); -+ -+ if (bfqd->busy_queues == 0) -+ return NULL; -+ -+ /* -+ * Traverse the path from the root to the leaf entity to -+ * serve. Set in service all the entities visited along the -+ * way. 
-+ */ -+ sd = &bfqd->root_group->sched_data; -+ for (; sd ; sd = entity->my_sched_data) { -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ if (entity) { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg(bfqd, bfqg, -+ "get_next_queue: lookup in this group"); -+ if (!sd->next_in_service) -+ pr_crit("get_next_queue: lookup in this group"); -+ } else { -+ bfq_log_bfqg(bfqd, bfqd->root_group, -+ "get_next_queue: lookup in root group"); -+ if (!sd->next_in_service) -+ pr_crit("get_next_queue: lookup in root group"); -+ } -+#endif -+ -+ BUG_ON(!sd->next_in_service); -+ -+ /* -+ * WARNING. We are about to set the in-service entity -+ * to sd->next_in_service, i.e., to the (cached) value -+ * returned by bfq_lookup_next_entity(sd) the last -+ * time it was invoked, i.e., the last time when the -+ * service order in sd changed as a consequence of the -+ * activation or deactivation of an entity. In this -+ * respect, if we execute bfq_lookup_next_entity(sd) -+ * in this very moment, it may, although with low -+ * probability, yield a different entity than that -+ * pointed to by sd->next_in_service. This rare event -+ * happens in case there was no CLASS_IDLE entity to -+ * serve for sd when bfq_lookup_next_entity(sd) was -+ * invoked for the last time, while there is now one -+ * such entity. -+ * -+ * If the above event happens, then the scheduling of -+ * such entity in CLASS_IDLE is postponed until the -+ * service of the sd->next_in_service entity -+ * finishes. In fact, when the latter is expired, -+ * bfq_lookup_next_entity(sd) gets called again, -+ * exactly to update sd->next_in_service. -+ */ -+ -+ /* Make next_in_service entity become in_service_entity */ -+ entity = sd->next_in_service; -+ sd->in_service_entity = entity; -+ -+ /* -+ * Reset the accumulator of the amount of service that -+ * the entity is about to receive. 
-+ */ -+ entity->service = 0; -+ -+ /* -+ * If entity is no longer a candidate for next -+ * service, then we extract it from its active tree, -+ * for the following reason. To further boost the -+ * throughput in some special case, BFQ needs to know -+ * which is the next candidate entity to serve, while -+ * there is already an entity in service. In this -+ * respect, to make it easy to compute/update the next -+ * candidate entity to serve after the current -+ * candidate has been set in service, there is a case -+ * where it is necessary to extract the current -+ * candidate from its service tree. Such a case is -+ * when the entity just set in service cannot be also -+ * a candidate for next service. Details about when -+ * this conditions holds are reported in the comments -+ * on the function bfq_no_longer_next_in_service() -+ * invoked below. -+ */ -+ if (bfq_no_longer_next_in_service(entity)) -+ bfq_active_extract(bfq_entity_service_tree(entity), -+ entity); -+ -+ /* -+ * For the same reason why we may have just extracted -+ * entity from its active tree, we may need to update -+ * next_in_service for the sched_data of entity too, -+ * regardless of whether entity has been extracted. -+ * In fact, even if entity has not been extracted, a -+ * descendant entity may get extracted. Such an event -+ * would cause a change in next_in_service for the -+ * level of the descendant entity, and thus possibly -+ * back to upper levels. -+ * -+ * We cannot perform the resulting needed update -+ * before the end of this loop, because, to know which -+ * is the correct next-to-serve candidate entity for -+ * each level, we need first to find the leaf entity -+ * to set in service. In fact, only after we know -+ * which is the next-to-serve leaf entity, we can -+ * discover whether the parent entity of the leaf -+ * entity becomes the next-to-serve, and so on. 
-+ */ -+ -+ /* Log some information */ -+ bfqq = bfq_entity_to_bfqq(entity); -+ if (bfqq) -+ bfq_log_bfqq(bfqd, bfqq, -+ "get_next_queue: this queue, finish %llu", -+ (((entity->finish>>10)*1000)>>10)>>2); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg(bfqd, bfqg, -+ "get_next_queue: this entity, finish %llu", -+ (((entity->finish>>10)*1000)>>10)>>2); -+ } -+#endif -+ -+ } -+ -+ BUG_ON(!entity); -+ bfqq = bfq_entity_to_bfqq(entity); -+ BUG_ON(!bfqq); -+ -+ /* -+ * We can finally update all next-to-serve entities along the -+ * path from the leaf entity just set in service to the root. -+ */ -+ for_each_entity(entity) { -+ struct bfq_sched_data *sd = entity->sched_data; -+ -+ if(!bfq_update_next_in_service(sd, NULL)) -+ break; -+ } -+ -+ return bfqq; -+} -+ -+static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue; -+ struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity; -+ struct bfq_entity *entity = in_serv_entity; -+ -+ if (bfqd->in_service_bic) { -+ put_io_context(bfqd->in_service_bic->icq.ioc); -+ bfqd->in_service_bic = NULL; -+ } -+ -+ bfq_clear_bfqq_wait_request(in_serv_bfqq); -+ hrtimer_try_to_cancel(&bfqd->idle_slice_timer); -+ bfqd->in_service_queue = NULL; -+ -+ /* -+ * When this function is called, all in-service entities have -+ * been properly deactivated or requeued, so we can safely -+ * execute the final step: reset in_service_entity along the -+ * path from entity to the root. -+ */ -+ for_each_entity(entity) -+ entity->sched_data->in_service_entity = NULL; -+ -+ /* -+ * in_serv_entity is no longer in service, so, if it is in no -+ * service tree either, then release the service reference to -+ * the queue it represents (taken with bfq_get_entity). 
-+ */ -+ if (!in_serv_entity->on_st) -+ bfq_put_queue(in_serv_bfqq); -+} -+ -+static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ bool ins_into_idle_tree, bool expiration) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ bfq_deactivate_entity(entity, ins_into_idle_tree, expiration); -+} -+ -+static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity); -+ -+ BUG_ON(bfqq == bfqd->in_service_queue); -+ BUG_ON(entity->tree != &st->active && entity->tree != &st->idle && -+ entity->on_st); -+ -+ bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq), -+ false); -+ bfq_clear_bfqq_non_blocking_wait_rq(bfqq); -+} -+ -+static void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ bfq_activate_requeue_entity(entity, false, -+ bfqq == bfqd->in_service_queue); -+} -+ -+static void bfqg_stats_update_dequeue(struct bfq_group *bfqg); -+ -+/* -+ * Called when the bfqq no longer has requests pending, remove it from -+ * the service tree. As a special case, it can be invoked during an -+ * expiration. 
-+ */ -+static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ bool expiration) -+{ -+ BUG_ON(!bfq_bfqq_busy(bfqq)); -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); -+ -+ bfq_log_bfqq(bfqd, bfqq, "del from busy"); -+ -+ bfq_clear_bfqq_busy(bfqq); -+ -+ BUG_ON(bfqd->busy_queues == 0); -+ bfqd->busy_queues--; -+ -+ if (!bfqq->dispatched) -+ bfq_weights_tree_remove(bfqd, &bfqq->entity, -+ &bfqd->queue_weights_tree); -+ -+ if (bfqq->wr_coeff > 1) { -+ bfqd->wr_busy_queues--; -+ BUG_ON(bfqd->wr_busy_queues < 0); -+ } -+ -+ bfqg_stats_update_dequeue(bfqq_group(bfqq)); -+ -+ BUG_ON(bfqq->entity.budget < 0); -+ -+ bfq_deactivate_bfqq(bfqd, bfqq, true, expiration); -+} -+ -+/* -+ * Called when an inactive queue receives a new request. -+ */ -+static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ BUG_ON(bfq_bfqq_busy(bfqq)); -+ BUG_ON(bfqq == bfqd->in_service_queue); -+ -+ bfq_log_bfqq(bfqd, bfqq, "add to busy"); -+ -+ bfq_activate_bfqq(bfqd, bfqq); -+ -+ bfq_mark_bfqq_busy(bfqq); -+ bfqd->busy_queues++; -+ -+ if (!bfqq->dispatched) -+ if (bfqq->wr_coeff == 1) -+ bfq_weights_tree_add(bfqd, &bfqq->entity, -+ &bfqd->queue_weights_tree); -+ -+ if (bfqq->wr_coeff > 1) { -+ bfqd->wr_busy_queues++; -+ BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues); -+ } -+ -+} -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -new file mode 100644 -index 000000000000..65e7c7e77f3c ---- /dev/null -+++ b/block/bfq-sq-iosched.c -@@ -0,0 +1,5379 @@ -+/* -+ * Budget Fair Queueing (BFQ) I/O scheduler. -+ * -+ * Based on ideas and code from CFQ: -+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> -+ * -+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> -+ * Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org> -+ * -+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ -+ * file. 
-+ * -+ * BFQ is a proportional-share I/O scheduler, with some extra -+ * low-latency capabilities. BFQ also supports full hierarchical -+ * scheduling through cgroups. Next paragraphs provide an introduction -+ * on BFQ inner workings. Details on BFQ benefits and usage can be -+ * found in Documentation/block/bfq-iosched.txt. -+ * -+ * BFQ is a proportional-share storage-I/O scheduling algorithm based -+ * on the slice-by-slice service scheme of CFQ. But BFQ assigns -+ * budgets, measured in number of sectors, to processes instead of -+ * time slices. The device is not granted to the in-service process -+ * for a given time slice, but until it has exhausted its assigned -+ * budget. This change from the time to the service domain enables BFQ -+ * to distribute the device throughput among processes as desired, -+ * without any distortion due to throughput fluctuations, or to device -+ * internal queueing. BFQ uses an ad hoc internal scheduler, called -+ * B-WF2Q+, to schedule processes according to their budgets. More -+ * precisely, BFQ schedules queues associated with processes. Thanks to -+ * the accurate policy of B-WF2Q+, BFQ can afford to assign high -+ * budgets to I/O-bound processes issuing sequential requests (to -+ * boost the throughput), and yet guarantee a low latency to -+ * interactive and soft real-time applications. -+ * -+ * NOTE: if the main or only goal, with a given device, is to achieve -+ * the maximum-possible throughput at all times, then do switch off -+ * all low-latency heuristics for that device, by setting low_latency -+ * to 0. -+ * -+ * BFQ is described in [1], where also a reference to the initial, more -+ * theoretical paper on BFQ can be found. The interested reader can find -+ * in the latter paper full details on the main algorithm, as well as -+ * formulas of the guarantees and formal proofs of all the properties. 
-+ * With respect to the version of BFQ presented in these papers, this -+ * implementation adds a few more heuristics, such as the one that -+ * guarantees a low latency to soft real-time applications, and a -+ * hierarchical extension based on H-WF2Q+. -+ * -+ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with -+ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N) -+ * complexity derives from the one introduced with EEVDF in [3]. -+ * -+ * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O -+ * Scheduler", Proceedings of the First Workshop on Mobile System -+ * Technologies (MST-2015), May 2015. -+ * http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf -+ * -+ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf -+ * -+ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing -+ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689, -+ * Oct 1997. -+ * -+ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz -+ * -+ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline -+ * First: A Flexible and Accurate Mechanism for Proportional Share -+ * Resource Allocation,'' technical report. -+ * -+ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf -+ */ -+#include <linux/module.h> -+#include <linux/slab.h> -+#include <linux/blkdev.h> -+#include <linux/cgroup.h> -+#include <linux/elevator.h> -+#include <linux/jiffies.h> -+#include <linux/rbtree.h> -+#include <linux/ioprio.h> -+#include "blk.h" -+#include "bfq.h" -+ -+/* Expiration time of sync (0) and async (1) requests, in ns. */ -+static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; -+ -+/* Maximum backwards seek, in KiB. */ -+static const int bfq_back_max = (16 * 1024); -+ -+/* Penalty of a backwards seek, in number of sectors. */ -+static const int bfq_back_penalty = 2; -+ -+/* Idling period duration, in ns. 
*/ -+static u32 bfq_slice_idle = (NSEC_PER_SEC / 125); -+ -+/* Minimum number of assigned budgets for which stats are safe to compute. */ -+static const int bfq_stats_min_budgets = 194; -+ -+/* Default maximum budget values, in sectors and number of requests. */ -+static const int bfq_default_max_budget = (16 * 1024); -+ -+/* -+ * Async to sync throughput distribution is controlled as follows: -+ * when an async request is served, the entity is charged the number -+ * of sectors of the request, multiplied by the factor below -+ */ -+static const int bfq_async_charge_factor = 10; -+ -+/* Default timeout values, in jiffies, approximating CFQ defaults. */ -+static const int bfq_timeout = (HZ / 8); -+ -+static struct kmem_cache *bfq_pool; -+ -+/* Below this threshold (in ns), we consider thinktime immediate. */ -+#define BFQ_MIN_TT (2 * NSEC_PER_MSEC) -+ -+/* hw_tag detection: parallel requests threshold and min samples needed. */ -+#define BFQ_HW_QUEUE_THRESHOLD 4 -+#define BFQ_HW_QUEUE_SAMPLES 32 -+ -+#define BFQQ_SEEK_THR (sector_t)(8 * 100) -+#define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32) -+#define BFQQ_CLOSE_THR (sector_t)(8 * 1024) -+#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8) -+ -+/* Min number of samples required to perform peak-rate update */ -+#define BFQ_RATE_MIN_SAMPLES 32 -+/* Min observation time interval required to perform a peak-rate update (ns) */ -+#define BFQ_RATE_MIN_INTERVAL (300*NSEC_PER_MSEC) -+/* Target observation time interval for a peak-rate update (ns) */ -+#define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC -+ -+/* Shift used for peak rate fixed precision calculations. */ -+#define BFQ_RATE_SHIFT 16 -+ -+/* -+ * By default, BFQ computes the duration of the weight raising for -+ * interactive applications automatically, using the following formula: -+ * duration = (R / r) * T, where r is the peak rate of the device, and -+ * R and T are two reference parameters. 
-+ * In particular, R is the peak rate of the reference device (see below), -+ * and T is a reference time: given the systems that are likely to be -+ * installed on the reference device according to its speed class, T is -+ * about the maximum time needed, under BFQ and while reading two files in -+ * parallel, to load typical large applications on these systems. -+ * In practice, the slower/faster the device at hand is, the more/less it -+ * takes to load applications with respect to the reference device. -+ * Accordingly, the longer/shorter BFQ grants weight raising to interactive -+ * applications. -+ * -+ * BFQ uses four different reference pairs (R, T), depending on: -+ * . whether the device is rotational or non-rotational; -+ * . whether the device is slow, such as old or portable HDDs, as well as -+ * SD cards, or fast, such as newer HDDs and SSDs. -+ * -+ * The device's speed class is dynamically (re)detected in -+ * bfq_update_peak_rate() every time the estimated peak rate is updated. -+ * -+ * In the following definitions, R_slow[0]/R_fast[0] and -+ * T_slow[0]/T_fast[0] are the reference values for a slow/fast -+ * rotational device, whereas R_slow[1]/R_fast[1] and -+ * T_slow[1]/T_fast[1] are the reference values for a slow/fast -+ * non-rotational device. Finally, device_speed_thresh are the -+ * thresholds used to switch between speed classes. The reference -+ * rates are not the actual peak rates of the devices used as a -+ * reference, but slightly lower values. The reason for using these -+ * slightly lower values is that the peak-rate estimator tends to -+ * yield slightly lower values than the actual peak rate (it can yield -+ * the actual peak rate only if there is only one process doing I/O, -+ * and the process does sequential I/O). -+ * -+ * Both the reference peak rates and the thresholds are measured in -+ * sectors/usec, left-shifted by BFQ_RATE_SHIFT. 
-+ */ -+static int R_slow[2] = {1000, 10700}; -+static int R_fast[2] = {14000, 33000}; -+/* -+ * To improve readability, a conversion function is used to initialize the -+ * following arrays, which entails that they can be initialized only in a -+ * function. -+ */ -+static int T_slow[2]; -+static int T_fast[2]; -+static int device_speed_thresh[2]; -+ -+#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ -+ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) -+ -+#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0]) -+#define RQ_BFQQ(rq) ((rq)->elv.priv[1]) -+ -+static void bfq_schedule_dispatch(struct bfq_data *bfqd); -+ -+#include "bfq-ioc.c" -+#include "bfq-sched.c" -+#include "bfq-cgroup-included.c" -+ -+#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE) -+#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT) -+ -+#define bfq_sample_valid(samples) ((samples) > 80) -+ -+/* -+ * Scheduler run of queue, if there are requests pending and no one in the -+ * driver that will restart queueing. -+ */ -+static void bfq_schedule_dispatch(struct bfq_data *bfqd) -+{ -+ if (bfqd->queued != 0) { -+ bfq_log(bfqd, "schedule dispatch"); -+ kblockd_schedule_work(&bfqd->unplug_work); -+ } -+} -+ -+/* -+ * Lifted from AS - choose which of rq1 and rq2 that is best served now. -+ * We choose the request that is closesr to the head right now. Distance -+ * behind the head is penalized and only allowed to a certain extent. -+ */ -+static struct request *bfq_choose_req(struct bfq_data *bfqd, -+ struct request *rq1, -+ struct request *rq2, -+ sector_t last) -+{ -+ sector_t s1, s2, d1 = 0, d2 = 0; -+ unsigned long back_max; -+#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */ -+#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */ -+ unsigned int wrap = 0; /* bit mask: requests behind the disk head? 
*/ -+ -+ if (!rq1 || rq1 == rq2) -+ return rq2; -+ if (!rq2) -+ return rq1; -+ -+ if (rq_is_sync(rq1) && !rq_is_sync(rq2)) -+ return rq1; -+ else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) -+ return rq2; -+ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) -+ return rq1; -+ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META)) -+ return rq2; -+ -+ s1 = blk_rq_pos(rq1); -+ s2 = blk_rq_pos(rq2); -+ -+ /* -+ * By definition, 1KiB is 2 sectors. -+ */ -+ back_max = bfqd->bfq_back_max * 2; -+ -+ /* -+ * Strict one way elevator _except_ in the case where we allow -+ * short backward seeks which are biased as twice the cost of a -+ * similar forward seek. -+ */ -+ if (s1 >= last) -+ d1 = s1 - last; -+ else if (s1 + back_max >= last) -+ d1 = (last - s1) * bfqd->bfq_back_penalty; -+ else -+ wrap |= BFQ_RQ1_WRAP; -+ -+ if (s2 >= last) -+ d2 = s2 - last; -+ else if (s2 + back_max >= last) -+ d2 = (last - s2) * bfqd->bfq_back_penalty; -+ else -+ wrap |= BFQ_RQ2_WRAP; -+ -+ /* Found required data */ -+ -+ /* -+ * By doing switch() on the bit mask "wrap" we avoid having to -+ * check two variables for all permutations: --> faster! -+ */ -+ switch (wrap) { -+ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ -+ if (d1 < d2) -+ return rq1; -+ else if (d2 < d1) -+ return rq2; -+ -+ if (s1 >= s2) -+ return rq1; -+ else -+ return rq2; -+ -+ case BFQ_RQ2_WRAP: -+ return rq1; -+ case BFQ_RQ1_WRAP: -+ return rq2; -+ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */ -+ default: -+ /* -+ * Since both rqs are wrapped, -+ * start with the one that's further behind head -+ * (--> only *one* back seek required), -+ * since back seek takes more time than forward. 
-+ */ -+ if (s1 <= s2) -+ return rq1; -+ else -+ return rq2; -+ } -+} -+ -+static struct bfq_queue * -+bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, -+ sector_t sector, struct rb_node **ret_parent, -+ struct rb_node ***rb_link) -+{ -+ struct rb_node **p, *parent; -+ struct bfq_queue *bfqq = NULL; -+ -+ parent = NULL; -+ p = &root->rb_node; -+ while (*p) { -+ struct rb_node **n; -+ -+ parent = *p; -+ bfqq = rb_entry(parent, struct bfq_queue, pos_node); -+ -+ /* -+ * Sort strictly based on sector. Smallest to the left, -+ * largest to the right. -+ */ -+ if (sector > blk_rq_pos(bfqq->next_rq)) -+ n = &(*p)->rb_right; -+ else if (sector < blk_rq_pos(bfqq->next_rq)) -+ n = &(*p)->rb_left; -+ else -+ break; -+ p = n; -+ bfqq = NULL; -+ } -+ -+ *ret_parent = parent; -+ if (rb_link) -+ *rb_link = p; -+ -+ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d", -+ (unsigned long long) sector, -+ bfqq ? bfqq->pid : 0); -+ -+ return bfqq; -+} -+ -+static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct rb_node **p, *parent; -+ struct bfq_queue *__bfqq; -+ -+ if (bfqq->pos_root) { -+ rb_erase(&bfqq->pos_node, bfqq->pos_root); -+ bfqq->pos_root = NULL; -+ } -+ -+ if (bfq_class_idle(bfqq)) -+ return; -+ if (!bfqq->next_rq) -+ return; -+ -+ bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, -+ blk_rq_pos(bfqq->next_rq), &parent, &p); -+ if (!__bfqq) { -+ rb_link_node(&bfqq->pos_node, parent, p); -+ rb_insert_color(&bfqq->pos_node, bfqq->pos_root); -+ } else -+ bfqq->pos_root = NULL; -+} -+ -+/* -+ * Tell whether there are active queues or groups with differentiated weights. -+ */ -+static bool bfq_differentiated_weights(struct bfq_data *bfqd) -+{ -+ /* -+ * For weights to differ, at least one of the trees must contain -+ * at least two nodes. 
-+ */ -+ return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && -+ (bfqd->queue_weights_tree.rb_node->rb_left || -+ bfqd->queue_weights_tree.rb_node->rb_right) -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ ) || -+ (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) && -+ (bfqd->group_weights_tree.rb_node->rb_left || -+ bfqd->group_weights_tree.rb_node->rb_right) -+#endif -+ ); -+} -+ -+/* -+ * The following function returns true if every queue must receive the -+ * same share of the throughput (this condition is used when deciding -+ * whether idling may be disabled, see the comments in the function -+ * bfq_bfqq_may_idle()). -+ * -+ * Such a scenario occurs when: -+ * 1) all active queues have the same weight, -+ * 2) all active groups at the same level in the groups tree have the same -+ * weight, -+ * 3) all active groups at the same level in the groups tree have the same -+ * number of children. -+ * -+ * Unfortunately, keeping the necessary state for evaluating exactly the -+ * above symmetry conditions would be quite complex and time-consuming. -+ * Therefore this function evaluates, instead, the following stronger -+ * sub-conditions, for which it is much easier to maintain the needed -+ * state: -+ * 1) all active queues have the same weight, -+ * 2) all active groups have the same weight, -+ * 3) all active groups have at most one active child each. -+ * In particular, the last two conditions are always true if hierarchical -+ * support and the cgroups interface are not enabled, thus no state needs -+ * to be maintained in this case. -+ */ -+static bool bfq_symmetric_scenario(struct bfq_data *bfqd) -+{ -+ return !bfq_differentiated_weights(bfqd); -+} -+ -+/* -+ * If the weight-counter tree passed as input contains no counter for -+ * the weight of the input entity, then add that counter; otherwise just -+ * increment the existing counter. -+ * -+ * Note that weight-counter trees contain few nodes in mostly symmetric -+ * scenarios. 
For example, if all queues have the same weight, then the -+ * weight-counter tree for the queues may contain at most one node. -+ * This holds even if low_latency is on, because weight-raised queues -+ * are not inserted in the tree. -+ * In most scenarios, the rate at which nodes are created/destroyed -+ * should be low too. -+ */ -+static void bfq_weights_tree_add(struct bfq_data *bfqd, -+ struct bfq_entity *entity, -+ struct rb_root *root) -+{ -+ struct rb_node **new = &(root->rb_node), *parent = NULL; -+ -+ /* -+ * Do not insert if the entity is already associated with a -+ * counter, which happens if: -+ * 1) the entity is associated with a queue, -+ * 2) a request arrival has caused the queue to become both -+ * non-weight-raised, and hence change its weight, and -+ * backlogged; in this respect, each of the two events -+ * causes an invocation of this function, -+ * 3) this is the invocation of this function caused by the -+ * second event. This second invocation is actually useless, -+ * and we handle this fact by exiting immediately. More -+ * efficient or clearer solutions might possibly be adopted. -+ */ -+ if (entity->weight_counter) -+ return; -+ -+ while (*new) { -+ struct bfq_weight_counter *__counter = container_of(*new, -+ struct bfq_weight_counter, -+ weights_node); -+ parent = *new; -+ -+ if (entity->weight == __counter->weight) { -+ entity->weight_counter = __counter; -+ goto inc_counter; -+ } -+ if (entity->weight < __counter->weight) -+ new = &((*new)->rb_left); -+ else -+ new = &((*new)->rb_right); -+ } -+ -+ entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter), -+ GFP_ATOMIC); -+ -+ /* -+ * In the unlucky event of an allocation failure, we just -+ * exit. This will cause the weight of entity to not be -+ * considered in bfq_differentiated_weights, which, in its -+ * turn, causes the scenario to be deemed wrongly symmetric in -+ * case entity's weight would have been the only weight making -+ * the scenario asymmetric. 
On the bright side, no unbalance -+ * will however occur when entity becomes inactive again (the -+ * invocation of this function is triggered by an activation -+ * of entity). In fact, bfq_weights_tree_remove does nothing -+ * if !entity->weight_counter. -+ */ -+ if (unlikely(!entity->weight_counter)) -+ return; -+ -+ entity->weight_counter->weight = entity->weight; -+ rb_link_node(&entity->weight_counter->weights_node, parent, new); -+ rb_insert_color(&entity->weight_counter->weights_node, root); -+ -+inc_counter: -+ entity->weight_counter->num_active++; -+} -+ -+/* -+ * Decrement the weight counter associated with the entity, and, if the -+ * counter reaches 0, remove the counter from the tree. -+ * See the comments to the function bfq_weights_tree_add() for considerations -+ * about overhead. -+ */ -+static void bfq_weights_tree_remove(struct bfq_data *bfqd, -+ struct bfq_entity *entity, -+ struct rb_root *root) -+{ -+ if (!entity->weight_counter) -+ return; -+ -+ BUG_ON(RB_EMPTY_ROOT(root)); -+ BUG_ON(entity->weight_counter->weight != entity->weight); -+ -+ BUG_ON(!entity->weight_counter->num_active); -+ entity->weight_counter->num_active--; -+ if (entity->weight_counter->num_active > 0) -+ goto reset_entity_pointer; -+ -+ rb_erase(&entity->weight_counter->weights_node, root); -+ kfree(entity->weight_counter); -+ -+reset_entity_pointer: -+ entity->weight_counter = NULL; -+} -+ -+/* -+ * Return expired entry, or NULL to just start from scratch in rbtree. 
-+ */ -+static struct request *bfq_check_fifo(struct bfq_queue *bfqq, -+ struct request *last) -+{ -+ struct request *rq; -+ -+ if (bfq_bfqq_fifo_expire(bfqq)) -+ return NULL; -+ -+ bfq_mark_bfqq_fifo_expire(bfqq); -+ -+ rq = rq_entry_fifo(bfqq->fifo.next); -+ -+ if (rq == last || ktime_get_ns() < rq->fifo_time) -+ return NULL; -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq); -+ BUG_ON(RB_EMPTY_NODE(&rq->rb_node)); -+ return rq; -+} -+ -+static struct request *bfq_find_next_rq(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ struct request *last) -+{ -+ struct rb_node *rbnext = rb_next(&last->rb_node); -+ struct rb_node *rbprev = rb_prev(&last->rb_node); -+ struct request *next, *prev = NULL; -+ -+ BUG_ON(list_empty(&bfqq->fifo)); -+ -+ /* Follow expired path, else get first next available. */ -+ next = bfq_check_fifo(bfqq, last); -+ if (next) { -+ BUG_ON(next == last); -+ return next; -+ } -+ -+ BUG_ON(RB_EMPTY_NODE(&last->rb_node)); -+ -+ if (rbprev) -+ prev = rb_entry_rq(rbprev); -+ -+ if (rbnext) -+ next = rb_entry_rq(rbnext); -+ else { -+ rbnext = rb_first(&bfqq->sort_list); -+ if (rbnext && rbnext != &last->rb_node) -+ next = rb_entry_rq(rbnext); -+ } -+ -+ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last)); -+} -+ -+/* see the definition of bfq_async_charge_factor for details */ -+static unsigned long bfq_serv_to_charge(struct request *rq, -+ struct bfq_queue *bfqq) -+{ -+ if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1) -+ return blk_rq_sectors(rq); -+ -+ /* -+ * If there are no weight-raised queues, then amplify service -+ * by just the async charge factor; otherwise amplify service -+ * by twice the async charge factor, to further reduce latency -+ * for weight-raised queues. -+ */ -+ if (bfqq->bfqd->wr_busy_queues == 0) -+ return blk_rq_sectors(rq) * bfq_async_charge_factor; -+ -+ return blk_rq_sectors(rq) * 2 * bfq_async_charge_factor; -+} -+ -+/** -+ * bfq_updated_next_req - update the queue after a new next_rq selection. 
-+ * @bfqd: the device data the queue belongs to. -+ * @bfqq: the queue to update. -+ * -+ * If the first request of a queue changes we make sure that the queue -+ * has enough budget to serve at least its first request (if the -+ * request has grown). We do this because if the queue has not enough -+ * budget for its first request, it has to go through two dispatch -+ * rounds to actually get it dispatched. -+ */ -+static void bfq_updated_next_req(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity); -+ struct request *next_rq = bfqq->next_rq; -+ unsigned long new_budget; -+ -+ if (!next_rq) -+ return; -+ -+ if (bfqq == bfqd->in_service_queue) -+ /* -+ * In order not to break guarantees, budgets cannot be -+ * changed after an entity has been selected. -+ */ -+ return; -+ -+ BUG_ON(entity->tree != &st->active); -+ BUG_ON(entity == entity->sched_data->in_service_entity); -+ -+ new_budget = max_t(unsigned long, bfqq->max_budget, -+ bfq_serv_to_charge(next_rq, bfqq)); -+ if (entity->budget != new_budget) { -+ entity->budget = new_budget; -+ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", -+ new_budget); -+ bfq_requeue_bfqq(bfqd, bfqq); -+ } -+} -+ -+static unsigned int bfq_wr_duration(struct bfq_data *bfqd) -+{ -+ u64 dur; -+ -+ if (bfqd->bfq_wr_max_time > 0) -+ return bfqd->bfq_wr_max_time; -+ -+ dur = bfqd->RT_prod; -+ do_div(dur, bfqd->peak_rate); -+ -+ /* -+ * Limit duration between 3 and 13 seconds. Tests show that -+ * higher values than 13 seconds often yield the opposite of -+ * the desired result, i.e., worsen responsiveness by letting -+ * non-interactive and non-soft-real-time applications -+ * preserve weight raising for a too long time interval. -+ * -+ * On the other end, lower values than 3 seconds make it -+ * difficult for most interactive tasks to complete their jobs -+ * before weight-raising finishes. 
-+ */ -+ if (dur > msecs_to_jiffies(13000)) -+ dur = msecs_to_jiffies(13000); -+ else if (dur < msecs_to_jiffies(3000)) -+ dur = msecs_to_jiffies(3000); -+ -+ return dur; -+} -+ -+static void -+bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, -+ struct bfq_io_cq *bic, bool bfq_already_existing) -+{ -+ unsigned int old_wr_coeff; -+ bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq); -+ -+ if (bic->saved_idle_window) -+ bfq_mark_bfqq_idle_window(bfqq); -+ else -+ bfq_clear_bfqq_idle_window(bfqq); -+ -+ if (bic->saved_IO_bound) -+ bfq_mark_bfqq_IO_bound(bfqq); -+ else -+ bfq_clear_bfqq_IO_bound(bfqq); -+ -+ if (unlikely(busy)) -+ old_wr_coeff = bfqq->wr_coeff; -+ -+ bfqq->wr_coeff = bic->saved_wr_coeff; -+ bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; -+ BUG_ON(time_is_after_jiffies(bfqq->wr_start_at_switch_to_srt)); -+ bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish; -+ bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time; -+ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); -+ -+ if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || -+ time_is_before_jiffies(bfqq->last_wr_start_finish + -+ bfqq->wr_cur_max_time))) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "resume state: switching off wr (%lu + %lu < %lu)", -+ bfqq->last_wr_start_finish, bfqq->wr_cur_max_time, -+ jiffies); -+ -+ bfqq->wr_coeff = 1; -+ } -+ -+ /* make sure weight will be updated, however we got here */ -+ bfqq->entity.prio_changed = 1; -+ -+ if (likely(!busy)) -+ return; -+ -+ if (old_wr_coeff == 1 && bfqq->wr_coeff > 1) { -+ bfqd->wr_busy_queues++; -+ BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues); -+ } else if (old_wr_coeff > 1 && bfqq->wr_coeff == 1) { -+ bfqd->wr_busy_queues--; -+ BUG_ON(bfqd->wr_busy_queues < 0); -+ } -+} -+ -+static int bfqq_process_refs(struct bfq_queue *bfqq) -+{ -+ int process_refs, io_refs; -+ -+ lockdep_assert_held(bfqq->bfqd->queue->queue_lock); -+ -+ io_refs = bfqq->allocated[READ] + 
bfqq->allocated[WRITE]; -+ process_refs = bfqq->ref - io_refs - bfqq->entity.on_st; -+ BUG_ON(process_refs < 0); -+ return process_refs; -+} -+ -+/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */ -+static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct bfq_queue *item; -+ struct hlist_node *n; -+ -+ hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node) -+ hlist_del_init(&item->burst_list_node); -+ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); -+ bfqd->burst_size = 1; -+ bfqd->burst_parent_entity = bfqq->entity.parent; -+} -+ -+/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */ -+static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ /* Increment burst size to take into account also bfqq */ -+ bfqd->burst_size++; -+ -+ bfq_log_bfqq(bfqd, bfqq, "add_to_burst %d", bfqd->burst_size); -+ -+ BUG_ON(bfqd->burst_size > bfqd->bfq_large_burst_thresh); -+ -+ if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) { -+ struct bfq_queue *pos, *bfqq_item; -+ struct hlist_node *n; -+ -+ /* -+ * Enough queues have been activated shortly after each -+ * other to consider this burst as large. -+ */ -+ bfqd->large_burst = true; -+ bfq_log_bfqq(bfqd, bfqq, "add_to_burst: large burst started"); -+ -+ /* -+ * We can now mark all queues in the burst list as -+ * belonging to a large burst. -+ */ -+ hlist_for_each_entry(bfqq_item, &bfqd->burst_list, -+ burst_list_node) { -+ bfq_mark_bfqq_in_large_burst(bfqq_item); -+ bfq_log_bfqq(bfqd, bfqq_item, "marked in large burst"); -+ } -+ bfq_mark_bfqq_in_large_burst(bfqq); -+ bfq_log_bfqq(bfqd, bfqq, "marked in large burst"); -+ -+ /* -+ * From now on, and until the current burst finishes, any -+ * new queue being activated shortly after the last queue -+ * was inserted in the burst can be immediately marked as -+ * belonging to a large burst. So the burst list is not -+ * needed any more. Remove it. 
-+ */ -+ hlist_for_each_entry_safe(pos, n, &bfqd->burst_list, -+ burst_list_node) -+ hlist_del_init(&pos->burst_list_node); -+ } else /* -+ * Burst not yet large: add bfqq to the burst list. Do -+ * not increment the ref counter for bfqq, because bfqq -+ * is removed from the burst list before freeing bfqq -+ * in put_queue. -+ */ -+ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); -+} -+ -+/* -+ * If many queues belonging to the same group happen to be created -+ * shortly after each other, then the processes associated with these -+ * queues have typically a common goal. In particular, bursts of queue -+ * creations are usually caused by services or applications that spawn -+ * many parallel threads/processes. Examples are systemd during boot, -+ * or git grep. To help these processes get their job done as soon as -+ * possible, it is usually better to not grant either weight-raising -+ * or device idling to their queues. -+ * -+ * In this comment we describe, firstly, the reasons why this fact -+ * holds, and, secondly, the next function, which implements the main -+ * steps needed to properly mark these queues so that they can then be -+ * treated in a different way. -+ * -+ * The above services or applications benefit mostly from a high -+ * throughput: the quicker the requests of the activated queues are -+ * cumulatively served, the sooner the target job of these queues gets -+ * completed. As a consequence, weight-raising any of these queues, -+ * which also implies idling the device for it, is almost always -+ * counterproductive. In most cases it just lowers throughput. -+ * -+ * On the other hand, a burst of queue creations may be caused also by -+ * the start of an application that does not consist of a lot of -+ * parallel I/O-bound threads. In fact, with a complex application, -+ * several short processes may need to be executed to start-up the -+ * application. 
In this respect, to start an application as quickly as -+ * possible, the best thing to do is in any case to privilege the I/O -+ * related to the application with respect to all other -+ * I/O. Therefore, the best strategy to start as quickly as possible -+ * an application that causes a burst of queue creations is to -+ * weight-raise all the queues created during the burst. This is the -+ * exact opposite of the best strategy for the other type of bursts. -+ * -+ * In the end, to take the best action for each of the two cases, the -+ * two types of bursts need to be distinguished. Fortunately, this -+ * seems relatively easy, by looking at the sizes of the bursts. In -+ * particular, we found a threshold such that only bursts with a -+ * larger size than that threshold are apparently caused by -+ * services or commands such as systemd or git grep. For brevity, -+ * hereafter we call just 'large' these bursts. BFQ *does not* -+ * weight-raise queues whose creation occurs in a large burst. In -+ * addition, for each of these queues BFQ performs or does not perform -+ * idling depending on which choice boosts the throughput more. The -+ * exact choice depends on the device and request pattern at -+ * hand. -+ * -+ * Unfortunately, false positives may occur while an interactive task -+ * is starting (e.g., an application is being started). The -+ * consequence is that the queues associated with the task do not -+ * enjoy weight raising as expected. Fortunately these false positives -+ * are very rare. They typically occur if some service happens to -+ * start doing I/O exactly when the interactive task starts. -+ * -+ * Turning back to the next function, it implements all the steps -+ * needed to detect the occurrence of a large burst and to properly -+ * mark all the queues belonging to it (so that they can then be -+ * treated in a different way). 
This goal is achieved by maintaining a -+ * "burst list" that holds, temporarily, the queues that belong to the -+ * burst in progress. The list is then used to mark these queues as -+ * belonging to a large burst if the burst does become large. The main -+ * steps are the following. -+ * -+ * . when the very first queue is created, the queue is inserted into the -+ * list (as it could be the first queue in a possible burst) -+ * -+ * . if the current burst has not yet become large, and a queue Q that does -+ * not yet belong to the burst is activated shortly after the last time -+ * at which a new queue entered the burst list, then the function appends -+ * Q to the burst list -+ * -+ * . if, as a consequence of the previous step, the burst size reaches -+ * the large-burst threshold, then -+ * -+ * . all the queues in the burst list are marked as belonging to a -+ * large burst -+ * -+ * . the burst list is deleted; in fact, the burst list already served -+ * its purpose (keeping temporarily track of the queues in a burst, -+ * so as to be able to mark them as belonging to a large burst in the -+ * previous sub-step), and now is not needed any more -+ * -+ * . the device enters a large-burst mode -+ * -+ * . if a queue Q that does not belong to the burst is created while -+ * the device is in large-burst mode and shortly after the last time -+ * at which a queue either entered the burst list or was marked as -+ * belonging to the current large burst, then Q is immediately marked -+ * as belonging to a large burst. -+ * -+ * . if a queue Q that does not belong to the burst is created a while -+ * later, i.e., not shortly after, than the last time at which a queue -+ * either entered the burst list or was marked as belonging to the -+ * current large burst, then the current burst is deemed as finished and: -+ * -+ * . the large-burst mode is reset if set -+ * -+ * . the burst list is emptied -+ * -+ * . 
Q is inserted in the burst list, as Q may be the first queue -+ * in a possible new burst (then the burst list contains just Q -+ * after this step). -+ */ -+static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ /* -+ * If bfqq is already in the burst list or is part of a large -+ * burst, or finally has just been split, then there is -+ * nothing else to do. -+ */ -+ if (!hlist_unhashed(&bfqq->burst_list_node) || -+ bfq_bfqq_in_large_burst(bfqq) || -+ time_is_after_eq_jiffies(bfqq->split_time + -+ msecs_to_jiffies(10))) -+ return; -+ -+ /* -+ * If bfqq's creation happens late enough, or bfqq belongs to -+ * a different group than the burst group, then the current -+ * burst is finished, and related data structures must be -+ * reset. -+ * -+ * In this respect, consider the special case where bfqq is -+ * the very first queue created after BFQ is selected for this -+ * device. In this case, last_ins_in_burst and -+ * burst_parent_entity are not yet significant when we get -+ * here. But it is easy to verify that, whether or not the -+ * following condition is true, bfqq will end up being -+ * inserted into the burst list. In particular the list will -+ * happen to contain only bfqq. And this is exactly what has -+ * to happen, as bfqq may be the first queue of the first -+ * burst. -+ */ -+ if (time_is_before_jiffies(bfqd->last_ins_in_burst + -+ bfqd->bfq_burst_interval) || -+ bfqq->entity.parent != bfqd->burst_parent_entity) { -+ bfqd->large_burst = false; -+ bfq_reset_burst_list(bfqd, bfqq); -+ bfq_log_bfqq(bfqd, bfqq, -+ "handle_burst: late activation or different group"); -+ goto end; -+ } -+ -+ /* -+ * If we get here, then bfqq is being activated shortly after the -+ * last queue. So, if the current burst is also large, we can mark -+ * bfqq as belonging to this large burst immediately. 
-+ */ -+ if (bfqd->large_burst) { -+ bfq_log_bfqq(bfqd, bfqq, "handle_burst: marked in burst"); -+ bfq_mark_bfqq_in_large_burst(bfqq); -+ goto end; -+ } -+ -+ /* -+ * If we get here, then a large-burst state has not yet been -+ * reached, but bfqq is being activated shortly after the last -+ * queue. Then we add bfqq to the burst. -+ */ -+ bfq_add_to_burst(bfqd, bfqq); -+end: -+ /* -+ * At this point, bfqq either has been added to the current -+ * burst or has caused the current burst to terminate and a -+ * possible new burst to start. In particular, in the second -+ * case, bfqq has become the first queue in the possible new -+ * burst. In both cases last_ins_in_burst needs to be moved -+ * forward. -+ */ -+ bfqd->last_ins_in_burst = jiffies; -+ -+} -+ -+static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ return entity->budget - entity->service; -+} -+ -+/* -+ * If enough samples have been computed, return the current max budget -+ * stored in bfqd, which is dynamically updated according to the -+ * estimated disk peak rate; otherwise return the default max budget -+ */ -+static int bfq_max_budget(struct bfq_data *bfqd) -+{ -+ if (bfqd->budgets_assigned < bfq_stats_min_budgets) -+ return bfq_default_max_budget; -+ else -+ return bfqd->bfq_max_budget; -+} -+ -+/* -+ * Return min budget, which is a fraction of the current or default -+ * max budget (trying with 1/32) -+ */ -+static int bfq_min_budget(struct bfq_data *bfqd) -+{ -+ if (bfqd->budgets_assigned < bfq_stats_min_budgets) -+ return bfq_default_max_budget / 32; -+ else -+ return bfqd->bfq_max_budget / 32; -+} -+ -+static void bfq_bfqq_expire(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ bool compensate, -+ enum bfqq_expiration reason); -+ -+/* -+ * The next function, invoked after the input queue bfqq switches from -+ * idle to busy, updates the budget of bfqq. 
The function also tells -+ * whether the in-service queue should be expired, by returning -+ * true. The purpose of expiring the in-service queue is to give bfqq -+ * the chance to possibly preempt the in-service queue, and the reason -+ * for preempting the in-service queue is to achieve one of the two -+ * goals below. -+ * -+ * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has -+ * expired because it has remained idle. In particular, bfqq may have -+ * expired for one of the following two reasons: -+ * -+ * - BFQ_BFQQ_NO_MORE_REQUEST bfqq did not enjoy any device idling and -+ * did not make it to issue a new request before its last request -+ * was served; -+ * -+ * - BFQ_BFQQ_TOO_IDLE bfqq did enjoy device idling, but did not issue -+ * a new request before the expiration of the idling-time. -+ * -+ * Even if bfqq has expired for one of the above reasons, the process -+ * associated with the queue may be however issuing requests greedily, -+ * and thus be sensitive to the bandwidth it receives (bfqq may have -+ * remained idle for other reasons: CPU high load, bfqq not enjoying -+ * idling, I/O throttling somewhere in the path from the process to -+ * the I/O scheduler, ...). But if, after every expiration for one of -+ * the above two reasons, bfqq has to wait for the service of at least -+ * one full budget of another queue before being served again, then -+ * bfqq is likely to get a much lower bandwidth or resource time than -+ * its reserved ones. To address this issue, two countermeasures need -+ * to be taken. -+ * -+ * First, the budget and the timestamps of bfqq need to be updated in -+ * a special way on bfqq reactivation: they need to be updated as if -+ * bfqq did not remain idle and did not expire. 
In fact, if they are -+ * computed as if bfqq expired and remained idle until reactivation, -+ * then the process associated with bfqq is treated as if, instead of -+ * being greedy, it stopped issuing requests when bfqq remained idle, -+ * and restarts issuing requests only on this reactivation. In other -+ * words, the scheduler does not help the process recover the "service -+ * hole" between bfqq expiration and reactivation. As a consequence, -+ * the process receives a lower bandwidth than its reserved one. In -+ * contrast, to recover this hole, the budget must be updated as if -+ * bfqq was not expired at all before this reactivation, i.e., it must -+ * be set to the value of the remaining budget when bfqq was -+ * expired. Along the same line, timestamps need to be assigned the -+ * value they had the last time bfqq was selected for service, i.e., -+ * before last expiration. Thus timestamps need to be back-shifted -+ * with respect to their normal computation (see [1] for more details -+ * on this tricky aspect). -+ * -+ * Secondly, to allow the process to recover the hole, the in-service -+ * queue must be expired too, to give bfqq the chance to preempt it -+ * immediately. In fact, if bfqq has to wait for a full budget of the -+ * in-service queue to be completed, then it may become impossible to -+ * let the process recover the hole, even if the back-shifted -+ * timestamps of bfqq are lower than those of the in-service queue. If -+ * this happens for most or all of the holes, then the process may not -+ * receive its reserved bandwidth. In this respect, it is worth noting -+ * that, being the service of outstanding requests unpreemptible, a -+ * little fraction of the holes may however be unrecoverable, thereby -+ * causing a little loss of bandwidth. -+ * -+ * The last important point is detecting whether bfqq does need this -+ * bandwidth recovery. 
In this respect, the next function deems the -+ * process associated with bfqq greedy, and thus allows it to recover -+ * the hole, if: 1) the process is waiting for the arrival of a new -+ * request (which implies that bfqq expired for one of the above two -+ * reasons), and 2) such a request has arrived soon. The first -+ * condition is controlled through the flag non_blocking_wait_rq, -+ * while the second through the flag arrived_in_time. If both -+ * conditions hold, then the function computes the budget in the -+ * above-described special way, and signals that the in-service queue -+ * should be expired. Timestamp back-shifting is done later in -+ * __bfq_activate_entity. -+ * -+ * 2. Reduce latency. Even if timestamps are not backshifted to let -+ * the process associated with bfqq recover a service hole, bfqq may -+ * however happen to have, after being (re)activated, a lower finish -+ * timestamp than the in-service queue. That is, the next budget of -+ * bfqq may have to be completed before the one of the in-service -+ * queue. If this is the case, then preempting the in-service queue -+ * allows this goal to be achieved, apart from the unpreemptible, -+ * outstanding requests mentioned above. -+ * -+ * Unfortunately, regardless of which of the above two goals one wants -+ * to achieve, service trees need first to be updated to know whether -+ * the in-service queue must be preempted. To have service trees -+ * correctly updated, the in-service queue must be expired and -+ * rescheduled, and bfqq must be scheduled too. This is one of the -+ * most costly operations (in future versions, the scheduling -+ * mechanism may be re-designed in such a way to make it possible to -+ * know whether preemption is needed without needing to update service -+ * trees). In addition, queue preemptions almost always cause random -+ * I/O, and thus loss of throughput. 
Because of these facts, the next -+ * function adopts the following simple scheme to avoid both costly -+ * operations and too frequent preemptions: it requests the expiration -+ * of the in-service queue (unconditionally) only for queues that need -+ * to recover a hole, or that either are weight-raised or deserve to -+ * be weight-raised. -+ */ -+static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ bool arrived_in_time, -+ bool wr_or_deserves_wr) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) { -+ /* -+ * We do not clear the flag non_blocking_wait_rq here, as -+ * the latter is used in bfq_activate_bfqq to signal -+ * that timestamps need to be back-shifted (and is -+ * cleared right after). -+ */ -+ -+ /* -+ * In next assignment we rely on that either -+ * entity->service or entity->budget are not updated -+ * on expiration if bfqq is empty (see -+ * __bfq_bfqq_recalc_budget). Thus both quantities -+ * remain unchanged after such an expiration, and the -+ * following statement therefore assigns to -+ * entity->budget the remaining budget on such an -+ * expiration. For clarity, entity->service is not -+ * updated on expiration in any case, and, in normal -+ * operation, is reset only when bfqq is selected for -+ * service (see bfq_get_next_queue). 
-+ */ -+ BUG_ON(bfqq->max_budget < 0); -+ entity->budget = min_t(unsigned long, -+ bfq_bfqq_budget_left(bfqq), -+ bfqq->max_budget); -+ -+ BUG_ON(entity->budget < 0); -+ return true; -+ } -+ -+ BUG_ON(bfqq->max_budget < 0); -+ entity->budget = max_t(unsigned long, bfqq->max_budget, -+ bfq_serv_to_charge(bfqq->next_rq, bfqq)); -+ BUG_ON(entity->budget < 0); -+ -+ bfq_clear_bfqq_non_blocking_wait_rq(bfqq); -+ return wr_or_deserves_wr; -+} -+ -+static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ unsigned int old_wr_coeff, -+ bool wr_or_deserves_wr, -+ bool interactive, -+ bool in_burst, -+ bool soft_rt) -+{ -+ if (old_wr_coeff == 1 && wr_or_deserves_wr) { -+ /* start a weight-raising period */ -+ if (interactive) { -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ } else { -+ bfqq->wr_start_at_switch_to_srt = jiffies; -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff * -+ BFQ_SOFTRT_WEIGHT_FACTOR; -+ bfqq->wr_cur_max_time = -+ bfqd->bfq_wr_rt_max_time; -+ } -+ /* -+ * If needed, further reduce budget to make sure it is -+ * close to bfqq's backlog, so as to reduce the -+ * scheduling-error component due to a too large -+ * budget. Do not care about throughput consequences, -+ * but only about latency. Finally, do not assign a -+ * too small budget either, to avoid increasing -+ * latency by causing too frequent expirations. 
-+ */ -+ bfqq->entity.budget = min_t(unsigned long, -+ bfqq->entity.budget, -+ 2 * bfq_min_budget(bfqd)); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "wrais starting at %lu, rais_max_time %u", -+ jiffies, -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } else if (old_wr_coeff > 1) { -+ if (interactive) { /* update wr coeff and duration */ -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ } else if (in_burst) { -+ bfqq->wr_coeff = 1; -+ bfq_log_bfqq(bfqd, bfqq, -+ "wrais ending at %lu, rais_max_time %u", -+ jiffies, -+ jiffies_to_msecs(bfqq-> -+ wr_cur_max_time)); -+ } else if (soft_rt) { -+ /* -+ * The application is now or still meeting the -+ * requirements for being deemed soft rt. We -+ * can then correctly and safely (re)charge -+ * the weight-raising duration for the -+ * application with the weight-raising -+ * duration for soft rt applications. -+ * -+ * In particular, doing this recharge now, i.e., -+ * before the weight-raising period for the -+ * application finishes, reduces the probability -+ * of the following negative scenario: -+ * 1) the weight of a soft rt application is -+ * raised at startup (as for any newly -+ * created application), -+ * 2) since the application is not interactive, -+ * at a certain time weight-raising is -+ * stopped for the application, -+ * 3) at that time the application happens to -+ * still have pending requests, and hence -+ * is destined to not have a chance to be -+ * deemed soft rt before these requests are -+ * completed (see the comments to the -+ * function bfq_bfqq_softrt_next_start() -+ * for details on soft rt detection), -+ * 4) these pending requests experience a high -+ * latency because the application is not -+ * weight-raised while they are pending. 
-+ */ -+ if (bfqq->wr_cur_max_time != -+ bfqd->bfq_wr_rt_max_time) { -+ bfqq->wr_start_at_switch_to_srt = -+ bfqq->last_wr_start_finish; -+ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); -+ -+ bfqq->wr_cur_max_time = -+ bfqd->bfq_wr_rt_max_time; -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff * -+ BFQ_SOFTRT_WEIGHT_FACTOR; -+ bfq_log_bfqq(bfqd, bfqq, -+ "switching to soft_rt wr"); -+ } else -+ bfq_log_bfqq(bfqd, bfqq, -+ "moving forward soft_rt wr duration"); -+ bfqq->last_wr_start_finish = jiffies; -+ } -+ } -+} -+ -+static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ return bfqq->dispatched == 0 && -+ time_is_before_jiffies( -+ bfqq->budget_timeout + -+ bfqd->bfq_wr_min_idle_time); -+} -+ -+static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ int old_wr_coeff, -+ struct request *rq, -+ bool *interactive) -+{ -+ bool soft_rt, in_burst, wr_or_deserves_wr, -+ bfqq_wants_to_preempt, -+ idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq), -+ /* -+ * See the comments on -+ * bfq_bfqq_update_budg_for_activation for -+ * details on the usage of the next variable. 
-+ */ -+ arrived_in_time = ktime_get_ns() <= -+ RQ_BIC(rq)->ttime.last_end_request + -+ bfqd->bfq_slice_idle * 3; -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "bfq_add_request non-busy: " -+ "jiffies %lu, in_time %d, idle_long %d busyw %d " -+ "wr_coeff %u", -+ jiffies, arrived_in_time, -+ idle_for_long_time, -+ bfq_bfqq_non_blocking_wait_rq(bfqq), -+ old_wr_coeff); -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ -+ BUG_ON(bfqq == bfqd->in_service_queue); -+ bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, rq->cmd_flags); -+ -+ /* -+ * bfqq deserves to be weight-raised if: -+ * - it is sync, -+ * - it does not belong to a large burst, -+ * - it has been idle for enough time or is soft real-time, -+ * - is linked to a bfq_io_cq (it is not shared in any sense) -+ */ -+ in_burst = bfq_bfqq_in_large_burst(bfqq); -+ soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && -+ !in_burst && -+ time_is_before_jiffies(bfqq->soft_rt_next_start); -+ *interactive = -+ !in_burst && -+ idle_for_long_time; -+ wr_or_deserves_wr = bfqd->low_latency && -+ (bfqq->wr_coeff > 1 || -+ (bfq_bfqq_sync(bfqq) && -+ bfqq->bic && (*interactive || soft_rt))); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "bfq_add_request: " -+ "in_burst %d, " -+ "soft_rt %d (next %lu), inter %d, bic %p", -+ bfq_bfqq_in_large_burst(bfqq), soft_rt, -+ bfqq->soft_rt_next_start, -+ *interactive, -+ bfqq->bic); -+ -+ /* -+ * Using the last flag, update budget and check whether bfqq -+ * may want to preempt the in-service queue. -+ */ -+ bfqq_wants_to_preempt = -+ bfq_bfqq_update_budg_for_activation(bfqd, bfqq, -+ arrived_in_time, -+ wr_or_deserves_wr); -+ -+ /* -+ * If bfqq happened to be activated in a burst, but has been -+ * idle for much more than an interactive queue, then we -+ * assume that, in the overall I/O initiated in the burst, the -+ * I/O associated with bfqq is finished. So bfqq does not need -+ * to be treated as a queue belonging to a burst -+ * anymore. 
Accordingly, we reset bfqq's in_large_burst flag -+ * if set, and remove bfqq from the burst list if it's -+ * there. We do not decrement burst_size, because the fact -+ * that bfqq does not need to belong to the burst list any -+ * more does not invalidate the fact that bfqq was created in -+ * a burst. -+ */ -+ if (likely(!bfq_bfqq_just_created(bfqq)) && -+ idle_for_long_time && -+ time_is_before_jiffies( -+ bfqq->budget_timeout + -+ msecs_to_jiffies(10000))) { -+ hlist_del_init(&bfqq->burst_list_node); -+ bfq_clear_bfqq_in_large_burst(bfqq); -+ } -+ -+ bfq_clear_bfqq_just_created(bfqq); -+ -+ if (!bfq_bfqq_IO_bound(bfqq)) { -+ if (arrived_in_time) { -+ bfqq->requests_within_timer++; -+ if (bfqq->requests_within_timer >= -+ bfqd->bfq_requests_within_timer) -+ bfq_mark_bfqq_IO_bound(bfqq); -+ } else -+ bfqq->requests_within_timer = 0; -+ bfq_log_bfqq(bfqd, bfqq, "requests in time %d", -+ bfqq->requests_within_timer); -+ } -+ -+ if (bfqd->low_latency) { -+ if (unlikely(time_is_after_jiffies(bfqq->split_time))) -+ /* wraparound */ -+ bfqq->split_time = -+ jiffies - bfqd->bfq_wr_min_idle_time - 1; -+ -+ if (time_is_before_jiffies(bfqq->split_time + -+ bfqd->bfq_wr_min_idle_time)) { -+ bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq, -+ old_wr_coeff, -+ wr_or_deserves_wr, -+ *interactive, -+ in_burst, -+ soft_rt); -+ -+ if (old_wr_coeff != bfqq->wr_coeff) -+ bfqq->entity.prio_changed = 1; -+ } -+ } -+ -+ bfqq->last_idle_bklogged = jiffies; -+ bfqq->service_from_backlogged = 0; -+ bfq_clear_bfqq_softrt_update(bfqq); -+ -+ bfq_add_bfqq_busy(bfqd, bfqq); -+ -+ /* -+ * Expire in-service queue only if preemption may be needed -+ * for guarantees. In this respect, the function -+ * next_queue_may_preempt just checks a simple, necessary -+ * condition, and not a sufficient condition based on -+ * timestamps. 
In fact, for the latter condition to be -+ * evaluated, timestamps would need first to be updated, and -+ * this operation is quite costly (see the comments on the -+ * function bfq_bfqq_update_budg_for_activation). -+ */ -+ if (bfqd->in_service_queue && bfqq_wants_to_preempt && -+ bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff && -+ next_queue_may_preempt(bfqd)) { -+ struct bfq_queue *in_serv = -+ bfqd->in_service_queue; -+ BUG_ON(in_serv == bfqq); -+ -+ bfq_bfqq_expire(bfqd, bfqd->in_service_queue, -+ false, BFQ_BFQQ_PREEMPTED); -+ } -+} -+ -+static void bfq_add_request(struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ struct bfq_data *bfqd = bfqq->bfqd; -+ struct request *next_rq, *prev; -+ unsigned int old_wr_coeff = bfqq->wr_coeff; -+ bool interactive = false; -+ -+ bfq_log_bfqq(bfqd, bfqq, "add_request: size %u %s", -+ blk_rq_sectors(rq), rq_is_sync(rq) ? "S" : "A"); -+ -+ if (bfqq->wr_coeff > 1) /* queue is being weight-raised */ -+ bfq_log_bfqq(bfqd, bfqq, -+ "raising period dur %u/%u msec, old coeff %u, w %d(%d)", -+ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), -+ jiffies_to_msecs(bfqq->wr_cur_max_time), -+ bfqq->wr_coeff, -+ bfqq->entity.weight, bfqq->entity.orig_weight); -+ -+ bfqq->queued[rq_is_sync(rq)]++; -+ bfqd->queued++; -+ -+ elv_rb_add(&bfqq->sort_list, rq); -+ -+ /* -+ * Check if this request is a better next-to-serve candidate. -+ */ -+ prev = bfqq->next_rq; -+ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); -+ BUG_ON(!next_rq); -+ bfqq->next_rq = next_rq; -+ -+ /* -+ * Adjust priority tree position, if next_rq changes. -+ */ -+ if (prev != bfqq->next_rq) -+ bfq_pos_tree_add_move(bfqd, bfqq); -+ -+ if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... 
*/ -+ bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff, -+ rq, &interactive); -+ else { -+ if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) && -+ time_is_before_jiffies( -+ bfqq->last_wr_start_finish + -+ bfqd->bfq_wr_min_inter_arr_async)) { -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ -+ bfqd->wr_busy_queues++; -+ BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues); -+ bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqd, bfqq, -+ "non-idle wrais starting, " -+ "wr_max_time %u wr_busy %d", -+ jiffies_to_msecs(bfqq->wr_cur_max_time), -+ bfqd->wr_busy_queues); -+ } -+ if (prev != bfqq->next_rq) -+ bfq_updated_next_req(bfqd, bfqq); -+ } -+ -+ /* -+ * Assign jiffies to last_wr_start_finish in the following -+ * cases: -+ * -+ * . if bfqq is not going to be weight-raised, because, for -+ * non weight-raised queues, last_wr_start_finish stores the -+ * arrival time of the last request; as of now, this piece -+ * of information is used only for deciding whether to -+ * weight-raise async queues -+ * -+ * . if bfqq is not weight-raised, because, if bfqq is now -+ * switching to weight-raised, then last_wr_start_finish -+ * stores the time when weight-raising starts -+ * -+ * . if bfqq is interactive, because, regardless of whether -+ * bfqq is currently weight-raised, the weight-raising -+ * period must start or restart (this case is considered -+ * separately because it is not detected by the above -+ * conditions, if bfqq is already weight-raised) -+ * -+ * last_wr_start_finish has to be updated also if bfqq is soft -+ * real-time, because the weight-raising period is constantly -+ * restarted on idle-to-busy transitions for these queues, but -+ * this is already done in bfq_bfqq_handle_idle_busy_switch if -+ * needed. 
-+ */ -+ if (bfqd->low_latency && -+ (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive)) -+ bfqq->last_wr_start_finish = jiffies; -+} -+ -+static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, -+ struct bio *bio) -+{ -+ struct task_struct *tsk = current; -+ struct bfq_io_cq *bic; -+ struct bfq_queue *bfqq; -+ -+ bic = bfq_bic_lookup(bfqd, tsk->io_context); -+ if (!bic) -+ return NULL; -+ -+ bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); -+ if (bfqq) -+ return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); -+ -+ return NULL; -+} -+ -+static sector_t get_sdist(sector_t last_pos, struct request *rq) -+{ -+ sector_t sdist = 0; -+ -+ if (last_pos) { -+ if (last_pos < blk_rq_pos(rq)) -+ sdist = blk_rq_pos(rq) - last_pos; -+ else -+ sdist = last_pos - blk_rq_pos(rq); -+ } -+ -+ return sdist; -+} -+ -+static void bfq_activate_request(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ bfqd->rq_in_driver++; -+} -+ -+static void bfq_deactivate_request(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ -+ BUG_ON(bfqd->rq_in_driver == 0); -+ bfqd->rq_in_driver--; -+} -+ -+static void bfq_remove_request(struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ struct bfq_data *bfqd = bfqq->bfqd; -+ const int sync = rq_is_sync(rq); -+ -+ BUG_ON(bfqq->entity.service > bfqq->entity.budget && -+ bfqq == bfqd->in_service_queue); -+ -+ if (bfqq->next_rq == rq) { -+ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); -+ bfq_updated_next_req(bfqd, bfqq); -+ } -+ -+ if (rq->queuelist.prev != &rq->queuelist) -+ list_del_init(&rq->queuelist); -+ BUG_ON(bfqq->queued[sync] == 0); -+ bfqq->queued[sync]--; -+ bfqd->queued--; -+ elv_rb_del(&bfqq->sort_list, rq); -+ -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { -+ bfqq->next_rq = NULL; -+ -+ BUG_ON(bfqq->entity.budget < 0); -+ -+ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) { -+ 
BUG_ON(bfqq->ref < 2); /* referred by rq and on tree */ -+ bfq_del_bfqq_busy(bfqd, bfqq, false); -+ /* -+ * bfqq emptied. In normal operation, when -+ * bfqq is empty, bfqq->entity.service and -+ * bfqq->entity.budget must contain, -+ * respectively, the service received and the -+ * budget used last time bfqq emptied. These -+ * facts do not hold in this case, as at least -+ * this last removal occurred while bfqq is -+ * not in service. To avoid inconsistencies, -+ * reset both bfqq->entity.service and -+ * bfqq->entity.budget, if bfqq has still a -+ * process that may issue I/O requests to it. -+ */ -+ bfqq->entity.budget = bfqq->entity.service = 0; -+ } -+ -+ /* -+ * Remove queue from request-position tree as it is empty. -+ */ -+ if (bfqq->pos_root) { -+ rb_erase(&bfqq->pos_node, bfqq->pos_root); -+ bfqq->pos_root = NULL; -+ } -+ } -+ -+ if (rq->cmd_flags & REQ_META) { -+ BUG_ON(bfqq->meta_pending == 0); -+ bfqq->meta_pending--; -+ } -+ bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); -+} -+ -+static enum elv_merge bfq_merge(struct request_queue *q, struct request **req, -+ struct bio *bio) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct request *__rq; -+ -+ __rq = bfq_find_rq_fmerge(bfqd, bio); -+ if (__rq && elv_bio_merge_ok(__rq, bio)) { -+ *req = __rq; -+ return ELEVATOR_FRONT_MERGE; -+ } -+ -+ return ELEVATOR_NO_MERGE; -+} -+ -+static void bfq_merged_request(struct request_queue *q, struct request *req, -+ enum elv_merge type) -+{ -+ if (type == ELEVATOR_FRONT_MERGE && -+ rb_prev(&req->rb_node) && -+ blk_rq_pos(req) < -+ blk_rq_pos(container_of(rb_prev(&req->rb_node), -+ struct request, rb_node))) { -+ struct bfq_queue *bfqq = RQ_BFQQ(req); -+ struct bfq_data *bfqd = bfqq->bfqd; -+ struct request *prev, *next_rq; -+ -+ /* Reposition request in its sort_list */ -+ elv_rb_del(&bfqq->sort_list, req); -+ elv_rb_add(&bfqq->sort_list, req); -+ /* Choose next request to be served for bfqq */ -+ prev = bfqq->next_rq; -+ next_rq = 
bfq_choose_req(bfqd, bfqq->next_rq, req, -+ bfqd->last_position); -+ BUG_ON(!next_rq); -+ bfqq->next_rq = next_rq; -+ /* -+ * If next_rq changes, update both the queue's budget to -+ * fit the new request and the queue's position in its -+ * rq_pos_tree. -+ */ -+ if (prev != bfqq->next_rq) { -+ bfq_updated_next_req(bfqd, bfqq); -+ bfq_pos_tree_add_move(bfqd, bfqq); -+ } -+ } -+} -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+static void bfq_bio_merged(struct request_queue *q, struct request *req, -+ struct bio *bio) -+{ -+ bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio->bi_opf); -+} -+#endif -+ -+static void bfq_merged_requests(struct request_queue *q, struct request *rq, -+ struct request *next) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next); -+ -+ /* -+ * If next and rq belong to the same bfq_queue and next is older -+ * than rq, then reposition rq in the fifo (by substituting next -+ * with rq). Otherwise, if next and rq belong to different -+ * bfq_queues, never reposition rq: in fact, we would have to -+ * reposition it with respect to next's position in its own fifo, -+ * which would most certainly be too expensive with respect to -+ * the benefits. 
-+ */ -+ if (bfqq == next_bfqq && -+ !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && -+ next->fifo_time < rq->fifo_time) { -+ list_del_init(&rq->queuelist); -+ list_replace_init(&next->queuelist, &rq->queuelist); -+ rq->fifo_time = next->fifo_time; -+ } -+ -+ if (bfqq->next_rq == next) -+ bfqq->next_rq = rq; -+ -+ bfq_remove_request(next); -+ bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); -+} -+ -+/* Must be called with bfqq != NULL */ -+static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) -+{ -+ BUG_ON(!bfqq); -+ -+ if (bfq_bfqq_busy(bfqq)) { -+ bfqq->bfqd->wr_busy_queues--; -+ BUG_ON(bfqq->bfqd->wr_busy_queues < 0); -+ } -+ bfqq->wr_coeff = 1; -+ bfqq->wr_cur_max_time = 0; -+ bfqq->last_wr_start_finish = jiffies; -+ /* -+ * Trigger a weight change on the next invocation of -+ * __bfq_entity_update_weight_prio. -+ */ -+ bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "end_wr: wrais ending at %lu, rais_max_time %u", -+ bfqq->last_wr_start_finish, -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "end_wr: wr_busy %d", -+ bfqq->bfqd->wr_busy_queues); -+} -+ -+static void bfq_end_wr_async_queues(struct bfq_data *bfqd, -+ struct bfq_group *bfqg) -+{ -+ int i, j; -+ -+ for (i = 0; i < 2; i++) -+ for (j = 0; j < IOPRIO_BE_NR; j++) -+ if (bfqg->async_bfqq[i][j]) -+ bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); -+ if (bfqg->async_idle_bfqq) -+ bfq_bfqq_end_wr(bfqg->async_idle_bfqq); -+} -+ -+static void bfq_end_wr(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq; -+ -+ spin_lock_irq(bfqd->queue->queue_lock); -+ -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) -+ bfq_bfqq_end_wr(bfqq); -+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) -+ bfq_bfqq_end_wr(bfqq); -+ bfq_end_wr_async(bfqd); -+ -+ spin_unlock_irq(bfqd->queue->queue_lock); -+} -+ -+static sector_t bfq_io_struct_pos(void *io_struct, bool request) -+{ -+ if (request) -+ return blk_rq_pos(io_struct); -+ else -+ return 
((struct bio *)io_struct)->bi_iter.bi_sector; -+} -+ -+static int bfq_rq_close_to_sector(void *io_struct, bool request, -+ sector_t sector) -+{ -+ return abs(bfq_io_struct_pos(io_struct, request) - sector) <= -+ BFQQ_CLOSE_THR; -+} -+ -+static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ sector_t sector) -+{ -+ struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; -+ struct rb_node *parent, *node; -+ struct bfq_queue *__bfqq; -+ -+ if (RB_EMPTY_ROOT(root)) -+ return NULL; -+ -+ /* -+ * First, if we find a request starting at the end of the last -+ * request, choose it. -+ */ -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL); -+ if (__bfqq) -+ return __bfqq; -+ -+ /* -+ * If the exact sector wasn't found, the parent of the NULL leaf -+ * will contain the closest sector (rq_pos_tree sorted by -+ * next_request position). -+ */ -+ __bfqq = rb_entry(parent, struct bfq_queue, pos_node); -+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) -+ return __bfqq; -+ -+ if (blk_rq_pos(__bfqq->next_rq) < sector) -+ node = rb_next(&__bfqq->pos_node); -+ else -+ node = rb_prev(&__bfqq->pos_node); -+ if (!node) -+ return NULL; -+ -+ __bfqq = rb_entry(node, struct bfq_queue, pos_node); -+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) -+ return __bfqq; -+ -+ return NULL; -+} -+ -+static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd, -+ struct bfq_queue *cur_bfqq, -+ sector_t sector) -+{ -+ struct bfq_queue *bfqq; -+ -+ /* -+ * We shall notice if some of the queues are cooperating, -+ * e.g., working closely on the same area of the device. In -+ * that case, we can group them together and: 1) don't waste -+ * time idling, and 2) serve the union of their requests in -+ * the best possible order for throughput. 
-+ */ -+ bfqq = bfqq_find_close(bfqd, cur_bfqq, sector); -+ if (!bfqq || bfqq == cur_bfqq) -+ return NULL; -+ -+ return bfqq; -+} -+ -+static struct bfq_queue * -+bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) -+{ -+ int process_refs, new_process_refs; -+ struct bfq_queue *__bfqq; -+ -+ /* -+ * If there are no process references on the new_bfqq, then it is -+ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain -+ * may have dropped their last reference (not just their last process -+ * reference). -+ */ -+ if (!bfqq_process_refs(new_bfqq)) -+ return NULL; -+ -+ /* Avoid a circular list and skip interim queue merges. */ -+ while ((__bfqq = new_bfqq->new_bfqq)) { -+ if (__bfqq == bfqq) -+ return NULL; -+ new_bfqq = __bfqq; -+ } -+ -+ process_refs = bfqq_process_refs(bfqq); -+ new_process_refs = bfqq_process_refs(new_bfqq); -+ /* -+ * If the process for the bfqq has gone away, there is no -+ * sense in merging the queues. -+ */ -+ if (process_refs == 0 || new_process_refs == 0) -+ return NULL; -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", -+ new_bfqq->pid); -+ -+ /* -+ * Merging is just a redirection: the requests of the process -+ * owning one of the two queues are redirected to the other queue. -+ * The latter queue, in its turn, is set as shared if this is the -+ * first time that the requests of some process are redirected to -+ * it. -+ * -+ * We redirect bfqq to new_bfqq and not the opposite, because we -+ * are in the context of the process owning bfqq, hence we have -+ * the io_cq of this process. So we can immediately configure this -+ * io_cq to redirect the requests of the process to new_bfqq. -+ * -+ * NOTE, even if new_bfqq coincides with the in-service queue, the -+ * io_cq of new_bfqq is not available, because, if the in-service -+ * queue is shared, bfqd->in_service_bic may not point to the -+ * io_cq of the in-service queue. 
-+ * Redirecting the requests of the process owning bfqq to the -+ * currently in-service queue is in any case the best option, as -+ * we feed the in-service queue with new requests close to the -+ * last request served and, by doing so, hopefully increase the -+ * throughput. -+ */ -+ bfqq->new_bfqq = new_bfqq; -+ new_bfqq->ref += process_refs; -+ return new_bfqq; -+} -+ -+static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, -+ struct bfq_queue *new_bfqq) -+{ -+ if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) || -+ (bfqq->ioprio_class != new_bfqq->ioprio_class)) -+ return false; -+ -+ /* -+ * If either of the queues has already been detected as seeky, -+ * then merging it with the other queue is unlikely to lead to -+ * sequential I/O. -+ */ -+ if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq)) -+ return false; -+ -+ /* -+ * Interleaved I/O is known to be done by (some) applications -+ * only for reads, so it does not make sense to merge async -+ * queues. -+ */ -+ if (!bfq_bfqq_sync(bfqq) || !bfq_bfqq_sync(new_bfqq)) -+ return false; -+ -+ return true; -+} -+ -+/* -+ * If this function returns true, then bfqq cannot be merged. The idea -+ * is that true cooperation happens very early after processes start -+ * to do I/O. Usually, late cooperations are just accidental false -+ * positives. In case bfqq is weight-raised, such false positives -+ * would evidently degrade latency guarantees for bfqq. -+ */ -+static bool wr_from_too_long(struct bfq_queue *bfqq) -+{ -+ return bfqq->wr_coeff > 1 && -+ time_is_before_jiffies(bfqq->last_wr_start_finish + -+ msecs_to_jiffies(100)); -+} -+ -+/* -+ * Attempt to schedule a merge of bfqq with the currently in-service -+ * queue or with a close queue among the scheduled queues. Return -+ * NULL if no merge was scheduled, a pointer to the shared bfq_queue -+ * structure otherwise. 
-+ * -+ * The OOM queue is not allowed to participate to cooperation: in fact, since -+ * the requests temporarily redirected to the OOM queue could be redirected -+ * again to dedicated queues at any time, the state needed to correctly -+ * handle merging with the OOM queue would be quite complex and expensive -+ * to maintain. Besides, in such a critical condition as an out of memory, -+ * the benefits of queue merging may be little relevant, or even negligible. -+ * -+ * Weight-raised queues can be merged only if their weight-raising -+ * period has just started. In fact cooperating processes are usually -+ * started together. Thus, with this filter we avoid false positives -+ * that would jeopardize low-latency guarantees. -+ * -+ * WARNING: queue merging may impair fairness among non-weight raised -+ * queues, for at least two reasons: 1) the original weight of a -+ * merged queue may change during the merged state, 2) even being the -+ * weight the same, a merged queue may be bloated with many more -+ * requests than the ones produced by its originally-associated -+ * process. -+ */ -+static struct bfq_queue * -+bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ void *io_struct, bool request) -+{ -+ struct bfq_queue *in_service_bfqq, *new_bfqq; -+ -+ if (bfqq->new_bfqq) -+ return bfqq->new_bfqq; -+ -+ if (io_struct && wr_from_too_long(bfqq) && -+ likely(bfqq != &bfqd->oom_bfqq)) -+ bfq_log_bfqq(bfqd, bfqq, -+ "would have looked for coop, but bfq%d wr", -+ bfqq->pid); -+ -+ if (!io_struct || -+ wr_from_too_long(bfqq) || -+ unlikely(bfqq == &bfqd->oom_bfqq)) -+ return NULL; -+ -+ /* If there is only one backlogged queue, don't search. 
*/ -+ if (bfqd->busy_queues == 1) -+ return NULL; -+ -+ in_service_bfqq = bfqd->in_service_queue; -+ -+ if (in_service_bfqq && in_service_bfqq != bfqq && -+ bfqd->in_service_bic && wr_from_too_long(in_service_bfqq) -+ && likely(in_service_bfqq == &bfqd->oom_bfqq)) -+ bfq_log_bfqq(bfqd, bfqq, -+ "would have tried merge with in-service-queue, but wr"); -+ -+ if (!in_service_bfqq || in_service_bfqq == bfqq || -+ !bfqd->in_service_bic || wr_from_too_long(in_service_bfqq) || -+ unlikely(in_service_bfqq == &bfqd->oom_bfqq)) -+ goto check_scheduled; -+ -+ if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && -+ bfqq->entity.parent == in_service_bfqq->entity.parent && -+ bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) { -+ new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); -+ if (new_bfqq) -+ return new_bfqq; -+ } -+ /* -+ * Check whether there is a cooperator among currently scheduled -+ * queues. The only thing we need is that the bio/request is not -+ * NULL, as we need it to establish whether a cooperator exists. -+ */ -+check_scheduled: -+ new_bfqq = bfq_find_close_cooperator(bfqd, bfqq, -+ bfq_io_struct_pos(io_struct, request)); -+ -+ BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent); -+ -+ if (new_bfqq && wr_from_too_long(new_bfqq) && -+ likely(new_bfqq != &bfqd->oom_bfqq) && -+ bfq_may_be_close_cooperator(bfqq, new_bfqq)) -+ bfq_log_bfqq(bfqd, bfqq, -+ "would have merged with bfq%d, but wr", -+ new_bfqq->pid); -+ -+ if (new_bfqq && !wr_from_too_long(new_bfqq) && -+ likely(new_bfqq != &bfqd->oom_bfqq) && -+ bfq_may_be_close_cooperator(bfqq, new_bfqq)) -+ return bfq_setup_merge(bfqq, new_bfqq); -+ -+ return NULL; -+} -+ -+static void bfq_bfqq_save_state(struct bfq_queue *bfqq) -+{ -+ struct bfq_io_cq *bic = bfqq->bic; -+ -+ /* -+ * If !bfqq->bic, the queue is already shared or its requests -+ * have already been redirected to a shared queue; both idle window -+ * and weight raising state have already been saved. Do nothing. 
-+ */ -+ if (!bic) -+ return; -+ -+ bic->saved_idle_window = bfq_bfqq_idle_window(bfqq); -+ bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); -+ bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); -+ bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); -+ bic->saved_wr_coeff = bfqq->wr_coeff; -+ bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; -+ bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; -+ bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; -+ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); -+} -+ -+static void bfq_get_bic_reference(struct bfq_queue *bfqq) -+{ -+ /* -+ * If bfqq->bic has a non-NULL value, the bic to which it belongs -+ * is about to begin using a shared bfq_queue. -+ */ -+ if (bfqq->bic) -+ atomic_long_inc(&bfqq->bic->icq.ioc->refcount); -+} -+ -+static void -+bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, -+ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) -+{ -+ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", -+ (unsigned long) new_bfqq->pid); -+ /* Save weight raising and idle window of the merged queues */ -+ bfq_bfqq_save_state(bfqq); -+ bfq_bfqq_save_state(new_bfqq); -+ if (bfq_bfqq_IO_bound(bfqq)) -+ bfq_mark_bfqq_IO_bound(new_bfqq); -+ bfq_clear_bfqq_IO_bound(bfqq); -+ -+ /* -+ * If bfqq is weight-raised, then let new_bfqq inherit -+ * weight-raising. To reduce false positives, neglect the case -+ * where bfqq has just been created, but has not yet made it -+ * to be weight-raised (which may happen because EQM may merge -+ * bfqq even before bfq_add_request is executed for the first -+ * time for bfqq). Handling this case would however be very -+ * easy, thanks to the flag just_created. 
-+ */ -+ if (new_bfqq->wr_coeff == 1 && bfqq->wr_coeff > 1) { -+ new_bfqq->wr_coeff = bfqq->wr_coeff; -+ new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time; -+ new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish; -+ new_bfqq->wr_start_at_switch_to_srt = -+ bfqq->wr_start_at_switch_to_srt; -+ if (bfq_bfqq_busy(new_bfqq)) { -+ bfqd->wr_busy_queues++; -+ BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues); -+ } -+ -+ new_bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqd, new_bfqq, -+ "wr start after merge with %d, rais_max_time %u", -+ bfqq->pid, -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } -+ -+ if (bfqq->wr_coeff > 1) { /* bfqq has given its wr to new_bfqq */ -+ bfqq->wr_coeff = 1; -+ bfqq->entity.prio_changed = 1; -+ if (bfq_bfqq_busy(bfqq)) { -+ bfqd->wr_busy_queues--; -+ BUG_ON(bfqd->wr_busy_queues < 0); -+ } -+ -+ } -+ -+ bfq_log_bfqq(bfqd, new_bfqq, "merge_bfqqs: wr_busy %d", -+ bfqd->wr_busy_queues); -+ -+ /* -+ * Grab a reference to the bic, to prevent it from being destroyed -+ * before being possibly touched by a bfq_split_bfqq(). -+ */ -+ bfq_get_bic_reference(bfqq); -+ bfq_get_bic_reference(new_bfqq); -+ /* -+ * Merge queues (that is, let bic redirect its requests to new_bfqq) -+ */ -+ bic_set_bfqq(bic, new_bfqq, 1); -+ bfq_mark_bfqq_coop(new_bfqq); -+ /* -+ * new_bfqq now belongs to at least two bics (it is a shared queue): -+ * set new_bfqq->bic to NULL. bfqq either: -+ * - does not belong to any bic any more, and hence bfqq->bic must -+ * be set to NULL, or -+ * - is a queue whose owning bics have already been redirected to a -+ * different queue, hence the queue is destined to not belong to -+ * any bic soon and bfqq->bic is already NULL (therefore the next -+ * assignment causes no harm). 
-+ */ -+ new_bfqq->bic = NULL; -+ bfqq->bic = NULL; -+ /* release process reference to bfqq */ -+ bfq_put_queue(bfqq); -+} -+ -+static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq, -+ struct bio *bio) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ bool is_sync = op_is_sync(bio->bi_opf); -+ struct bfq_io_cq *bic; -+ struct bfq_queue *bfqq, *new_bfqq; -+ -+ /* -+ * Disallow merge of a sync bio into an async request. -+ */ -+ if (is_sync && !rq_is_sync(rq)) -+ return false; -+ -+ /* -+ * Lookup the bfqq that this bio will be queued with. Allow -+ * merge only if rq is queued there. -+ * Queue lock is held here. -+ */ -+ bic = bfq_bic_lookup(bfqd, current->io_context); -+ if (!bic) -+ return false; -+ -+ bfqq = bic_to_bfqq(bic, is_sync); -+ /* -+ * We take advantage of this function to perform an early merge -+ * of the queues of possible cooperating processes. -+ */ -+ if (bfqq) { -+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); -+ if (new_bfqq) { -+ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); -+ /* -+ * If we get here, the bio will be queued in the -+ * shared queue, i.e., new_bfqq, so use new_bfqq -+ * to decide whether bio and rq can be merged. -+ */ -+ bfqq = new_bfqq; -+ } -+ } -+ -+ return bfqq == RQ_BFQQ(rq); -+} -+ -+static int bfq_allow_rq_merge(struct request_queue *q, struct request *rq, -+ struct request *next) -+{ -+ return RQ_BFQQ(rq) == RQ_BFQQ(next); -+} -+ -+/* -+ * Set the maximum time for the in-service queue to consume its -+ * budget. This prevents seeky processes from lowering the throughput. -+ * In practice, a time-slice service scheme is used with seeky -+ * processes. 
-+ */ -+static void bfq_set_budget_timeout(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ unsigned int timeout_coeff; -+ -+ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time) -+ timeout_coeff = 1; -+ else -+ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; -+ -+ bfqd->last_budget_start = ktime_get(); -+ -+ bfqq->budget_timeout = jiffies + -+ bfqd->bfq_timeout * timeout_coeff; -+ -+ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", -+ jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff)); -+} -+ -+static void __bfq_set_in_service_queue(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ if (bfqq) { -+ bfqg_stats_update_avg_queue_size(bfqq_group(bfqq)); -+ bfq_mark_bfqq_must_alloc(bfqq); -+ bfq_clear_bfqq_fifo_expire(bfqq); -+ -+ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; -+ -+ BUG_ON(bfqq == bfqd->in_service_queue); -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ -+ if (time_is_before_jiffies(bfqq->last_wr_start_finish) && -+ bfqq->wr_coeff > 1 && -+ bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && -+ time_is_before_jiffies(bfqq->budget_timeout)) { -+ /* -+ * For soft real-time queues, move the start -+ * of the weight-raising period forward by the -+ * time the queue has not received any -+ * service. Otherwise, a relatively long -+ * service delay is likely to cause the -+ * weight-raising period of the queue to end, -+ * because of the short duration of the -+ * weight-raising period of a soft real-time -+ * queue. It is worth noting that this move -+ * is not so dangerous for the other queues, -+ * because soft real-time queues are not -+ * greedy. -+ * -+ * To not add a further variable, we use the -+ * overloaded field budget_timeout to -+ * determine for how long the queue has not -+ * received service, i.e., how much time has -+ * elapsed since the queue expired. 
However, -+ * this is a little imprecise, because -+ * budget_timeout is set to jiffies if bfqq -+ * not only expires, but also remains with no -+ * request. -+ */ -+ if (time_after(bfqq->budget_timeout, -+ bfqq->last_wr_start_finish)) -+ bfqq->last_wr_start_finish += -+ jiffies - bfqq->budget_timeout; -+ else -+ bfqq->last_wr_start_finish = jiffies; -+ -+ if (time_is_after_jiffies(bfqq->last_wr_start_finish)) { -+ pr_crit( -+ "BFQ WARNING:last %lu budget %lu jiffies %lu", -+ bfqq->last_wr_start_finish, -+ bfqq->budget_timeout, -+ jiffies); -+ pr_crit("diff %lu", jiffies - -+ max_t(unsigned long, -+ bfqq->last_wr_start_finish, -+ bfqq->budget_timeout)); -+ bfqq->last_wr_start_finish = jiffies; -+ } -+ } -+ -+ bfq_set_budget_timeout(bfqd, bfqq); -+ bfq_log_bfqq(bfqd, bfqq, -+ "set_in_service_queue, cur-budget = %d", -+ bfqq->entity.budget); -+ } else -+ bfq_log(bfqd, "set_in_service_queue: NULL"); -+ -+ bfqd->in_service_queue = bfqq; -+} -+ -+/* -+ * Get and set a new queue for service. -+ */ -+static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd); -+ -+ __bfq_set_in_service_queue(bfqd, bfqq); -+ return bfqq; -+} -+ -+static void bfq_arm_slice_timer(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq = bfqd->in_service_queue; -+ struct bfq_io_cq *bic; -+ u32 sl; -+ -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); -+ -+ /* Processes have exited, don't wait. */ -+ bic = bfqd->in_service_bic; -+ if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0) -+ return; -+ -+ bfq_mark_bfqq_wait_request(bfqq); -+ -+ /* -+ * We don't want to idle for seeks, but we do want to allow -+ * fair distribution of slice time for a process doing back-to-back -+ * seeks. So allow a little bit of time for him to submit a new rq. 
-+ * -+ * To prevent processes with (partly) seeky workloads from -+ * being too ill-treated, grant them a small fraction of the -+ * assigned budget before reducing the waiting time to -+ * BFQ_MIN_TT. This happened to help reduce latency. -+ */ -+ sl = bfqd->bfq_slice_idle; -+ /* -+ * Unless the queue is being weight-raised or the scenario is -+ * asymmetric, grant only minimum idle time if the queue -+ * is seeky. A long idling is preserved for a weight-raised -+ * queue, or, more in general, in an asymemtric scenario, -+ * because a long idling is needed for guaranteeing to a queue -+ * its reserved share of the throughput (in particular, it is -+ * needed if the queue has a higher weight than some other -+ * queue). -+ */ -+ if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 && -+ bfq_symmetric_scenario(bfqd)) -+ sl = min_t(u32, sl, BFQ_MIN_TT); -+ -+ bfqd->last_idling_start = ktime_get(); -+ hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl), -+ HRTIMER_MODE_REL); -+ bfqg_stats_set_start_idle_time(bfqq_group(bfqq)); -+ bfq_log(bfqd, "arm idle: %ld/%ld ms", -+ sl / NSEC_PER_MSEC, bfqd->bfq_slice_idle / NSEC_PER_MSEC); -+} -+ -+/* -+ * In autotuning mode, max_budget is dynamically recomputed as the -+ * amount of sectors transferred in timeout at the estimated peak -+ * rate. This enables BFQ to utilize a full timeslice with a full -+ * budget, even if the in-service queue is served at peak rate. And -+ * this maximises throughput with sequential workloads. -+ */ -+static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd) -+{ -+ return (u64)bfqd->peak_rate * USEC_PER_MSEC * -+ jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT; -+} -+ -+/* -+ * Update parameters related to throughput and responsiveness, as a -+ * function of the estimated peak rate. See comments on -+ * bfq_calc_max_budget(), and on T_slow and T_fast arrays. 
-+ */ -+static void update_thr_responsiveness_params(struct bfq_data *bfqd) -+{ -+ int dev_type = blk_queue_nonrot(bfqd->queue); -+ -+ if (bfqd->bfq_user_max_budget == 0) { -+ bfqd->bfq_max_budget = -+ bfq_calc_max_budget(bfqd); -+ BUG_ON(bfqd->bfq_max_budget < 0); -+ bfq_log(bfqd, "new max_budget = %d", -+ bfqd->bfq_max_budget); -+ } -+ -+ if (bfqd->device_speed == BFQ_BFQD_FAST && -+ bfqd->peak_rate < device_speed_thresh[dev_type]) { -+ bfqd->device_speed = BFQ_BFQD_SLOW; -+ bfqd->RT_prod = R_slow[dev_type] * -+ T_slow[dev_type]; -+ } else if (bfqd->device_speed == BFQ_BFQD_SLOW && -+ bfqd->peak_rate > device_speed_thresh[dev_type]) { -+ bfqd->device_speed = BFQ_BFQD_FAST; -+ bfqd->RT_prod = R_fast[dev_type] * -+ T_fast[dev_type]; -+ } -+ -+ bfq_log(bfqd, -+"dev_type %s dev_speed_class = %s (%llu sects/sec), thresh %llu setcs/sec", -+ dev_type == 0 ? "ROT" : "NONROT", -+ bfqd->device_speed == BFQ_BFQD_FAST ? "FAST" : "SLOW", -+ bfqd->device_speed == BFQ_BFQD_FAST ? -+ (USEC_PER_SEC*(u64)R_fast[dev_type])>>BFQ_RATE_SHIFT : -+ (USEC_PER_SEC*(u64)R_slow[dev_type])>>BFQ_RATE_SHIFT, -+ (USEC_PER_SEC*(u64)device_speed_thresh[dev_type])>> -+ BFQ_RATE_SHIFT); -+} -+ -+static void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq) -+{ -+ if (rq != NULL) { /* new rq dispatch now, reset accordingly */ -+ bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns() ; -+ bfqd->peak_rate_samples = 1; -+ bfqd->sequential_samples = 0; -+ bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size = -+ blk_rq_sectors(rq); -+ } else /* no new rq dispatched, just reset the number of samples */ -+ bfqd->peak_rate_samples = 0; /* full re-init on next disp. 
*/ -+ -+ bfq_log(bfqd, -+ "reset_rate_computation at end, sample %u/%u tot_sects %llu", -+ bfqd->peak_rate_samples, bfqd->sequential_samples, -+ bfqd->tot_sectors_dispatched); -+} -+ -+static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) -+{ -+ u32 rate, weight, divisor; -+ -+ /* -+ * For the convergence property to hold (see comments on -+ * bfq_update_peak_rate()) and for the assessment to be -+ * reliable, a minimum number of samples must be present, and -+ * a minimum amount of time must have elapsed. If not so, do -+ * not compute new rate. Just reset parameters, to get ready -+ * for a new evaluation attempt. -+ */ -+ if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES || -+ bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL) { -+ bfq_log(bfqd, -+ "update_rate_reset: only resetting, delta_first %lluus samples %d", -+ bfqd->delta_from_first>>10, bfqd->peak_rate_samples); -+ goto reset_computation; -+ } -+ -+ /* -+ * If a new request completion has occurred after last -+ * dispatch, then, to approximate the rate at which requests -+ * have been served by the device, it is more precise to -+ * extend the observation interval to the last completion. -+ */ -+ bfqd->delta_from_first = -+ max_t(u64, bfqd->delta_from_first, -+ bfqd->last_completion - bfqd->first_dispatch); -+ -+ BUG_ON(bfqd->delta_from_first == 0); -+ /* -+ * Rate computed in sects/usec, and not sects/nsec, for -+ * precision issues. 
-+ */ -+ rate = div64_ul(bfqd->tot_sectors_dispatched<<BFQ_RATE_SHIFT, -+ div_u64(bfqd->delta_from_first, NSEC_PER_USEC)); -+ -+ bfq_log(bfqd, -+"update_rate_reset: tot_sects %llu delta_first %lluus rate %llu sects/s (%d)", -+ bfqd->tot_sectors_dispatched, bfqd->delta_from_first>>10, -+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), -+ rate > 20<<BFQ_RATE_SHIFT); -+ -+ /* -+ * Peak rate not updated if: -+ * - the percentage of sequential dispatches is below 3/4 of the -+ * total, and rate is below the current estimated peak rate -+ * - rate is unreasonably high (> 20M sectors/sec) -+ */ -+ if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 && -+ rate <= bfqd->peak_rate) || -+ rate > 20<<BFQ_RATE_SHIFT) { -+ bfq_log(bfqd, -+ "update_rate_reset: goto reset, samples %u/%u rate/peak %llu/%llu", -+ bfqd->peak_rate_samples, bfqd->sequential_samples, -+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), -+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); -+ goto reset_computation; -+ } else { -+ bfq_log(bfqd, -+ "update_rate_reset: do update, samples %u/%u rate/peak %llu/%llu", -+ bfqd->peak_rate_samples, bfqd->sequential_samples, -+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), -+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); -+ } -+ -+ /* -+ * We have to update the peak rate, at last! To this purpose, -+ * we use a low-pass filter. We compute the smoothing constant -+ * of the filter as a function of the 'weight' of the new -+ * measured rate. -+ * -+ * As can be seen in next formulas, we define this weight as a -+ * quantity proportional to how sequential the workload is, -+ * and to how long the observation time interval is. -+ * -+ * The weight runs from 0 to 8. The maximum value of the -+ * weight, 8, yields the minimum value for the smoothing -+ * constant. At this minimum value for the smoothing constant, -+ * the measured rate contributes for half of the next value of -+ * the estimated peak rate. 
-+ * -+ * So, the first step is to compute the weight as a function -+ * of how sequential the workload is. Note that the weight -+ * cannot reach 9, because bfqd->sequential_samples cannot -+ * become equal to bfqd->peak_rate_samples, which, in its -+ * turn, holds true because bfqd->sequential_samples is not -+ * incremented for the first sample. -+ */ -+ weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples; -+ -+ /* -+ * Second step: further refine the weight as a function of the -+ * duration of the observation interval. -+ */ -+ weight = min_t(u32, 8, -+ div_u64(weight * bfqd->delta_from_first, -+ BFQ_RATE_REF_INTERVAL)); -+ -+ /* -+ * Divisor ranging from 10, for minimum weight, to 2, for -+ * maximum weight. -+ */ -+ divisor = 10 - weight; -+ BUG_ON(divisor == 0); -+ -+ /* -+ * Finally, update peak rate: -+ * -+ * peak_rate = peak_rate * (divisor-1) / divisor + rate / divisor -+ */ -+ bfqd->peak_rate *= divisor-1; -+ bfqd->peak_rate /= divisor; -+ rate /= divisor; /* smoothing constant alpha = 1/divisor */ -+ -+ bfq_log(bfqd, -+ "update_rate_reset: divisor %d tmp_peak_rate %llu tmp_rate %u", -+ divisor, -+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT), -+ (u32)((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT)); -+ -+ BUG_ON(bfqd->peak_rate == 0); -+ BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT); -+ -+ bfqd->peak_rate += rate; -+ update_thr_responsiveness_params(bfqd); -+ BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT); -+ -+reset_computation: -+ bfq_reset_rate_computation(bfqd, rq); -+} -+ -+/* -+ * Update the read/write peak rate (the main quantity used for -+ * auto-tuning, see update_thr_responsiveness_params()). -+ * -+ * It is not trivial to estimate the peak rate (correctly): because of -+ * the presence of sw and hw queues between the scheduler and the -+ * device components that finally serve I/O requests, it is hard to -+ * say exactly when a given dispatched request is served inside the -+ * device, and for how long. 
As a consequence, it is hard to know -+ * precisely at what rate a given set of requests is actually served -+ * by the device. -+ * -+ * On the opposite end, the dispatch time of any request is trivially -+ * available, and, from this piece of information, the "dispatch rate" -+ * of requests can be immediately computed. So, the idea in the next -+ * function is to use what is known, namely request dispatch times -+ * (plus, when useful, request completion times), to estimate what is -+ * unknown, namely in-device request service rate. -+ * -+ * The main issue is that, because of the above facts, the rate at -+ * which a certain set of requests is dispatched over a certain time -+ * interval can vary greatly with respect to the rate at which the -+ * same requests are then served. But, since the size of any -+ * intermediate queue is limited, and the service scheme is lossless -+ * (no request is silently dropped), the following obvious convergence -+ * property holds: the number of requests dispatched MUST become -+ * closer and closer to the number of requests completed as the -+ * observation interval grows. This is the key property used in -+ * the next function to estimate the peak service rate as a function -+ * of the observed dispatch rate. The function assumes to be invoked -+ * on every request dispatch. -+ */ -+static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) -+{ -+ u64 now_ns = ktime_get_ns(); -+ -+ if (bfqd->peak_rate_samples == 0) { /* first dispatch */ -+ bfq_log(bfqd, -+ "update_peak_rate: goto reset, samples %d", -+ bfqd->peak_rate_samples) ; -+ bfq_reset_rate_computation(bfqd, rq); -+ goto update_last_values; /* will add one sample */ -+ } -+ -+ /* -+ * Device idle for very long: the observation interval lasting -+ * up to this dispatch cannot be a valid observation interval -+ * for computing a new peak rate (similarly to the late- -+ * completion event in bfq_completed_request()). 
Go to -+ * update_rate_and_reset to have the following three steps -+ * taken: -+ * - close the observation interval at the last (previous) -+ * request dispatch or completion -+ * - compute rate, if possible, for that observation interval -+ * - start a new observation interval with this dispatch -+ */ -+ if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC && -+ bfqd->rq_in_driver == 0) { -+ bfq_log(bfqd, -+"update_peak_rate: jumping to updating&resetting delta_last %lluus samples %d", -+ (now_ns - bfqd->last_dispatch)>>10, -+ bfqd->peak_rate_samples) ; -+ goto update_rate_and_reset; -+ } -+ -+ /* Update sampling information */ -+ bfqd->peak_rate_samples++; -+ -+ if ((bfqd->rq_in_driver > 0 || -+ now_ns - bfqd->last_completion < BFQ_MIN_TT) -+ && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR) -+ bfqd->sequential_samples++; -+ -+ bfqd->tot_sectors_dispatched += blk_rq_sectors(rq); -+ -+ /* Reset max observed rq size every 32 dispatches */ -+ if (likely(bfqd->peak_rate_samples % 32)) -+ bfqd->last_rq_max_size = -+ max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size); -+ else -+ bfqd->last_rq_max_size = blk_rq_sectors(rq); -+ -+ bfqd->delta_from_first = now_ns - bfqd->first_dispatch; -+ -+ bfq_log(bfqd, -+ "update_peak_rate: added samples %u/%u tot_sects %llu delta_first %lluus", -+ bfqd->peak_rate_samples, bfqd->sequential_samples, -+ bfqd->tot_sectors_dispatched, -+ bfqd->delta_from_first>>10); -+ -+ /* Target observation interval not yet reached, go on sampling */ -+ if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL) -+ goto update_last_values; -+ -+update_rate_and_reset: -+ bfq_update_rate_reset(bfqd, rq); -+update_last_values: -+ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); -+ bfqd->last_dispatch = now_ns; -+ -+ bfq_log(bfqd, -+ "update_peak_rate: delta_first %lluus last_pos %llu peak_rate %llu", -+ (now_ns - bfqd->first_dispatch)>>10, -+ (unsigned long long) bfqd->last_position, -+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); 
-+ bfq_log(bfqd, -+ "update_peak_rate: samples at end %d", bfqd->peak_rate_samples); -+} -+ -+/* -+ * Move request from internal lists to the dispatch list of the request queue -+ */ -+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ -+ /* -+ * For consistency, the next instruction should have been executed -+ * after removing the request from the queue and dispatching it. -+ * We execute instead this instruction before bfq_remove_request() -+ * (and hence introduce a temporary inconsistency), for efficiency. -+ * In fact, in a forced_dispatch, this prevents two counters related -+ * to bfqq->dispatched to risk to be uselessly decremented if bfqq -+ * is not in service, and then to be incremented again after -+ * incrementing bfqq->dispatched. -+ */ -+ bfqq->dispatched++; -+ bfq_update_peak_rate(q->elevator->elevator_data, rq); -+ -+ bfq_remove_request(rq); -+ elv_dispatch_sort(q, rq); -+} -+ -+static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ BUG_ON(bfqq != bfqd->in_service_queue); -+ -+ /* -+ * If this bfqq is shared between multiple processes, check -+ * to make sure that those processes are still issuing I/Os -+ * within the mean seek distance. If not, it may be time to -+ * break the queues apart again. -+ */ -+ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq)) -+ bfq_mark_bfqq_split_coop(bfqq); -+ -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { -+ if (bfqq->dispatched == 0) -+ /* -+ * Overloading budget_timeout field to store -+ * the time at which the queue remains with no -+ * backlog and no outstanding request; used by -+ * the weight-raising mechanism. -+ */ -+ bfqq->budget_timeout = jiffies; -+ -+ bfq_del_bfqq_busy(bfqd, bfqq, true); -+ } else { -+ bfq_requeue_bfqq(bfqd, bfqq); -+ /* -+ * Resort priority tree of potential close cooperators. 
-+ */ -+ bfq_pos_tree_add_move(bfqd, bfqq); -+ } -+ -+ /* -+ * All in-service entities must have been properly deactivated -+ * or requeued before executing the next function, which -+ * resets all in-service entites as no more in service. -+ */ -+ __bfq_bfqd_reset_in_service(bfqd); -+} -+ -+/** -+ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior. -+ * @bfqd: device data. -+ * @bfqq: queue to update. -+ * @reason: reason for expiration. -+ * -+ * Handle the feedback on @bfqq budget at queue expiration. -+ * See the body for detailed comments. -+ */ -+static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ enum bfqq_expiration reason) -+{ -+ struct request *next_rq; -+ int budget, min_budget; -+ -+ BUG_ON(bfqq != bfqd->in_service_queue); -+ -+ min_budget = bfq_min_budget(bfqd); -+ -+ if (bfqq->wr_coeff == 1) -+ budget = bfqq->max_budget; -+ else /* -+ * Use a constant, low budget for weight-raised queues, -+ * to help achieve a low latency. Keep it slightly higher -+ * than the minimum possible budget, to cause a little -+ * bit fewer expirations. -+ */ -+ budget = 2 * min_budget; -+ -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d", -+ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d", -+ budget, bfq_min_budget(bfqd)); -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", -+ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); -+ -+ if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) { -+ switch (reason) { -+ /* -+ * Caveat: in all the following cases we trade latency -+ * for throughput. 
-+ */ -+ case BFQ_BFQQ_TOO_IDLE: -+ /* -+ * This is the only case where we may reduce -+ * the budget: if there is no request of the -+ * process still waiting for completion, then -+ * we assume (tentatively) that the timer has -+ * expired because the batch of requests of -+ * the process could have been served with a -+ * smaller budget. Hence, betting that -+ * process will behave in the same way when it -+ * becomes backlogged again, we reduce its -+ * next budget. As long as we guess right, -+ * this budget cut reduces the latency -+ * experienced by the process. -+ * -+ * However, if there are still outstanding -+ * requests, then the process may have not yet -+ * issued its next request just because it is -+ * still waiting for the completion of some of -+ * the still outstanding ones. So in this -+ * subcase we do not reduce its budget, on the -+ * contrary we increase it to possibly boost -+ * the throughput, as discussed in the -+ * comments to the BUDGET_TIMEOUT case. -+ */ -+ if (bfqq->dispatched > 0) /* still outstanding reqs */ -+ budget = min(budget * 2, bfqd->bfq_max_budget); -+ else { -+ if (budget > 5 * min_budget) -+ budget -= 4 * min_budget; -+ else -+ budget = min_budget; -+ } -+ break; -+ case BFQ_BFQQ_BUDGET_TIMEOUT: -+ /* -+ * We double the budget here because it gives -+ * the chance to boost the throughput if this -+ * is not a seeky process (and has bumped into -+ * this timeout because of, e.g., ZBR). -+ */ -+ budget = min(budget * 2, bfqd->bfq_max_budget); -+ break; -+ case BFQ_BFQQ_BUDGET_EXHAUSTED: -+ /* -+ * The process still has backlog, and did not -+ * let either the budget timeout or the disk -+ * idling timeout expire. Hence it is not -+ * seeky, has a short thinktime and may be -+ * happy with a higher budget too. So -+ * definitely increase the budget of this good -+ * candidate to boost the disk throughput. 
-+ */ -+ budget = min(budget * 4, bfqd->bfq_max_budget); -+ break; -+ case BFQ_BFQQ_NO_MORE_REQUESTS: -+ /* -+ * For queues that expire for this reason, it -+ * is particularly important to keep the -+ * budget close to the actual service they -+ * need. Doing so reduces the timestamp -+ * misalignment problem described in the -+ * comments in the body of -+ * __bfq_activate_entity. In fact, suppose -+ * that a queue systematically expires for -+ * BFQ_BFQQ_NO_MORE_REQUESTS and presents a -+ * new request in time to enjoy timestamp -+ * back-shifting. The larger the budget of the -+ * queue is with respect to the service the -+ * queue actually requests in each service -+ * slot, the more times the queue can be -+ * reactivated with the same virtual finish -+ * time. It follows that, even if this finish -+ * time is pushed to the system virtual time -+ * to reduce the consequent timestamp -+ * misalignment, the queue unjustly enjoys for -+ * many re-activations a lower finish time -+ * than all newly activated queues. -+ * -+ * The service needed by bfqq is measured -+ * quite precisely by bfqq->entity.service. -+ * Since bfqq does not enjoy device idling, -+ * bfqq->entity.service is equal to the number -+ * of sectors that the process associated with -+ * bfqq requested to read/write before waiting -+ * for request completions, or blocking for -+ * other reasons. -+ */ -+ budget = max_t(int, bfqq->entity.service, min_budget); -+ break; -+ default: -+ return; -+ } -+ } else if (!bfq_bfqq_sync(bfqq)) -+ /* -+ * Async queues get always the maximum possible -+ * budget, as for them we do not care about latency -+ * (in addition, their ability to dispatch is limited -+ * by the charging factor). 
-+ */ -+ budget = bfqd->bfq_max_budget; -+ -+ bfqq->max_budget = budget; -+ -+ if (bfqd->budgets_assigned >= bfq_stats_min_budgets && -+ !bfqd->bfq_user_max_budget) -+ bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget); -+ -+ /* -+ * If there is still backlog, then assign a new budget, making -+ * sure that it is large enough for the next request. Since -+ * the finish time of bfqq must be kept in sync with the -+ * budget, be sure to call __bfq_bfqq_expire() *after* this -+ * update. -+ * -+ * If there is no backlog, then no need to update the budget; -+ * it will be updated on the arrival of a new request. -+ */ -+ next_rq = bfqq->next_rq; -+ if (next_rq) { -+ BUG_ON(reason == BFQ_BFQQ_TOO_IDLE || -+ reason == BFQ_BFQQ_NO_MORE_REQUESTS); -+ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, -+ bfq_serv_to_charge(next_rq, bfqq)); -+ BUG_ON(!bfq_bfqq_busy(bfqq)); -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ } -+ -+ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d", -+ next_rq ? blk_rq_sectors(next_rq) : 0, -+ bfqq->entity.budget); -+} -+ -+/* -+ * Return true if the process associated with bfqq is "slow". The slow -+ * flag is used, in addition to the budget timeout, to reduce the -+ * amount of service provided to seeky processes, and thus reduce -+ * their chances to lower the throughput. More details in the comments -+ * on the function bfq_bfqq_expire(). -+ * -+ * An important observation is in order: as discussed in the comments -+ * on the function bfq_update_peak_rate(), with devices with internal -+ * queues, it is hard if ever possible to know when and for how long -+ * an I/O request is processed by the device (apart from the trivial -+ * I/O pattern where a new request is dispatched only after the -+ * previous one has been completed). This makes it hard to evaluate -+ * the real rate at which the I/O requests of each bfq_queue are -+ * served. 
In fact, for an I/O scheduler like BFQ, serving a -+ * bfq_queue means just dispatching its requests during its service -+ * slot (i.e., until the budget of the queue is exhausted, or the -+ * queue remains idle, or, finally, a timeout fires). But, during the -+ * service slot of a bfq_queue, around 100 ms at most, the device may -+ * be even still processing requests of bfq_queues served in previous -+ * service slots. On the opposite end, the requests of the in-service -+ * bfq_queue may be completed after the service slot of the queue -+ * finishes. -+ * -+ * Anyway, unless more sophisticated solutions are used -+ * (where possible), the sum of the sizes of the requests dispatched -+ * during the service slot of a bfq_queue is probably the only -+ * approximation available for the service received by the bfq_queue -+ * during its service slot. And this sum is the quantity used in this -+ * function to evaluate the I/O speed of a process. -+ */ -+static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ bool compensate, enum bfqq_expiration reason, -+ unsigned long *delta_ms) -+{ -+ ktime_t delta_ktime; -+ u32 delta_usecs; -+ bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */ -+ -+ if (!bfq_bfqq_sync(bfqq)) -+ return false; -+ -+ if (compensate) -+ delta_ktime = bfqd->last_idling_start; -+ else -+ delta_ktime = ktime_get(); -+ delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start); -+ delta_usecs = ktime_to_us(delta_ktime); -+ -+ /* don't use too short time intervals */ -+ if (delta_usecs < 1000) { -+ if (blk_queue_nonrot(bfqd->queue)) -+ /* -+ * give same worst-case guarantees as idling -+ * for seeky -+ */ -+ *delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC; -+ else /* charge at least one seek */ -+ *delta_ms = bfq_slice_idle / NSEC_PER_MSEC; -+ -+ bfq_log(bfqd, "bfq_bfqq_is_slow: too short %u", delta_usecs); -+ -+ return slow; -+ } -+ -+ *delta_ms = delta_usecs / USEC_PER_MSEC; -+ -+ /* -+ * Use only long (> 20ms) intervals 
to filter out excessive -+ * spikes in service rate estimation. -+ */ -+ if (delta_usecs > 20000) { -+ /* -+ * Caveat for rotational devices: processes doing I/O -+ * in the slower disk zones tend to be slow(er) even -+ * if not seeky. In this respect, the estimated peak -+ * rate is likely to be an average over the disk -+ * surface. Accordingly, to not be too harsh with -+ * unlucky processes, a process is deemed slow only if -+ * its rate has been lower than half of the estimated -+ * peak rate. -+ */ -+ slow = bfqq->entity.service < bfqd->bfq_max_budget / 2; -+ bfq_log(bfqd, "bfq_bfqq_is_slow: relative rate %d/%d", -+ bfqq->entity.service, bfqd->bfq_max_budget); -+ } -+ -+ bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow); -+ -+ return slow; -+} -+ -+/* -+ * To be deemed as soft real-time, an application must meet two -+ * requirements. First, the application must not require an average -+ * bandwidth higher than the approximate bandwidth required to playback or -+ * record a compressed high-definition video. -+ * The next function is invoked on the completion of the last request of a -+ * batch, to compute the next-start time instant, soft_rt_next_start, such -+ * that, if the next request of the application does not arrive before -+ * soft_rt_next_start, then the above requirement on the bandwidth is met. -+ * -+ * The second requirement is that the request pattern of the application is -+ * isochronous, i.e., that, after issuing a request or a batch of requests, -+ * the application stops issuing new requests until all its pending requests -+ * have been completed. After that, the application may issue a new batch, -+ * and so on. -+ * For this reason the next function is invoked to compute -+ * soft_rt_next_start only for applications that meet this requirement, -+ * whereas soft_rt_next_start is set to infinity for applications that do -+ * not. 
-+ * -+ * Unfortunately, even a greedy application may happen to behave in an -+ * isochronous way if the CPU load is high. In fact, the application may -+ * stop issuing requests while the CPUs are busy serving other processes, -+ * then restart, then stop again for a while, and so on. In addition, if -+ * the disk achieves a low enough throughput with the request pattern -+ * issued by the application (e.g., because the request pattern is random -+ * and/or the device is slow), then the application may meet the above -+ * bandwidth requirement too. To prevent such a greedy application to be -+ * deemed as soft real-time, a further rule is used in the computation of -+ * soft_rt_next_start: soft_rt_next_start must be higher than the current -+ * time plus the maximum time for which the arrival of a request is waited -+ * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. -+ * This filters out greedy applications, as the latter issue instead their -+ * next request as soon as possible after the last one has been completed -+ * (in contrast, when a batch of requests is completed, a soft real-time -+ * application spends some time processing data). -+ * -+ * Unfortunately, the last filter may easily generate false positives if -+ * only bfqd->bfq_slice_idle is used as a reference time interval and one -+ * or both the following cases occur: -+ * 1) HZ is so low that the duration of a jiffy is comparable to or higher -+ * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with -+ * HZ=100. -+ * 2) jiffies, instead of increasing at a constant rate, may stop increasing -+ * for a while, then suddenly 'jump' by several units to recover the lost -+ * increments. This seems to happen, e.g., inside virtual machines. -+ * To address this issue, we do not use as a reference time interval just -+ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. 
In -+ * particular we add the minimum number of jiffies for which the filter -+ * seems to be quite precise also in embedded systems and KVM/QEMU virtual -+ * machines. -+ */ -+static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ bfq_log_bfqq(bfqd, bfqq, -+"softrt_next_start: service_blkg %lu soft_rate %u sects/sec interval %u", -+ bfqq->service_from_backlogged, -+ bfqd->bfq_wr_max_softrt_rate, -+ jiffies_to_msecs(HZ * bfqq->service_from_backlogged / -+ bfqd->bfq_wr_max_softrt_rate)); -+ -+ return max(bfqq->last_idle_bklogged + -+ HZ * bfqq->service_from_backlogged / -+ bfqd->bfq_wr_max_softrt_rate, -+ jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); -+} -+ -+/* -+ * Return the farthest future time instant according to jiffies -+ * macros. -+ */ -+static unsigned long bfq_greatest_from_now(void) -+{ -+ return jiffies + MAX_JIFFY_OFFSET; -+} -+ -+/* -+ * Return the farthest past time instant according to jiffies -+ * macros. -+ */ -+static unsigned long bfq_smallest_from_now(void) -+{ -+ return jiffies - MAX_JIFFY_OFFSET; -+} -+ -+/** -+ * bfq_bfqq_expire - expire a queue. -+ * @bfqd: device owning the queue. -+ * @bfqq: the queue to expire. -+ * @compensate: if true, compensate for the time spent idling. -+ * @reason: the reason causing the expiration. -+ * -+ * If the process associated with bfqq does slow I/O (e.g., because it -+ * issues random requests), we charge bfqq with the time it has been -+ * in service instead of the service it has received (see -+ * bfq_bfqq_charge_time for details on how this goal is achieved). As -+ * a consequence, bfqq will typically get higher timestamps upon -+ * reactivation, and hence it will be rescheduled as if it had -+ * received more service than what it has actually received. In the -+ * end, bfqq receives less service in proportion to how slowly its -+ * associated process consumes its budgets (and hence how seriously it -+ * tends to lower the throughput). 
In addition, this time-charging -+ * strategy guarantees time fairness among slow processes. In -+ * contrast, if the process associated with bfqq is not slow, we -+ * charge bfqq exactly with the service it has received. -+ * -+ * Charging time to the first type of queues and the exact service to -+ * the other has the effect of using the WF2Q+ policy to schedule the -+ * former on a timeslice basis, without violating service domain -+ * guarantees among the latter. -+ */ -+static void bfq_bfqq_expire(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ bool compensate, -+ enum bfqq_expiration reason) -+{ -+ bool slow; -+ unsigned long delta = 0; -+ struct bfq_entity *entity = &bfqq->entity; -+ int ref; -+ -+ BUG_ON(bfqq != bfqd->in_service_queue); -+ -+ /* -+ * Check whether the process is slow (see bfq_bfqq_is_slow). -+ */ -+ slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); -+ -+ /* -+ * Increase service_from_backlogged before next statement, -+ * because the possible next invocation of -+ * bfq_bfqq_charge_time would likely inflate -+ * entity->service. In contrast, service_from_backlogged must -+ * contain real service, to enable the soft real-time -+ * heuristic to correctly compute the bandwidth consumed by -+ * bfqq. -+ */ -+ bfqq->service_from_backlogged += entity->service; -+ -+ /* -+ * As above explained, charge slow (typically seeky) and -+ * timed-out queues with the time and not the service -+ * received, to favor sequential workloads. -+ * -+ * Processes doing I/O in the slower disk zones will tend to -+ * be slow(er) even if not seeky. Therefore, since the -+ * estimated peak rate is actually an average over the disk -+ * surface, these processes may timeout just for bad luck. To -+ * avoid punishing them, do not charge time to processes that -+ * succeeded in consuming at least 2/3 of their budget. 
This -+ * allows BFQ to preserve enough elasticity to still perform -+ * bandwidth, and not time, distribution with little unlucky -+ * or quasi-sequential processes. -+ */ -+ if (bfqq->wr_coeff == 1 && -+ (slow || -+ (reason == BFQ_BFQQ_BUDGET_TIMEOUT && -+ bfq_bfqq_budget_left(bfqq) >= entity->budget / 3))) -+ bfq_bfqq_charge_time(bfqd, bfqq, delta); -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ -+ if (reason == BFQ_BFQQ_TOO_IDLE && -+ entity->service <= 2 * entity->budget / 10) -+ bfq_clear_bfqq_IO_bound(bfqq); -+ -+ if (bfqd->low_latency && bfqq->wr_coeff == 1) -+ bfqq->last_wr_start_finish = jiffies; -+ -+ if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 && -+ RB_EMPTY_ROOT(&bfqq->sort_list)) { -+ /* -+ * If we get here, and there are no outstanding -+ * requests, then the request pattern is isochronous -+ * (see the comments on the function -+ * bfq_bfqq_softrt_next_start()). Thus we can compute -+ * soft_rt_next_start. If, instead, the queue still -+ * has outstanding requests, then we have to wait for -+ * the completion of all the outstanding requests to -+ * discover whether the request pattern is actually -+ * isochronous. -+ */ -+ BUG_ON(bfqd->busy_queues < 1); -+ if (bfqq->dispatched == 0) { -+ bfqq->soft_rt_next_start = -+ bfq_bfqq_softrt_next_start(bfqd, bfqq); -+ bfq_log_bfqq(bfqd, bfqq, "new soft_rt_next %lu", -+ bfqq->soft_rt_next_start); -+ } else { -+ /* -+ * The application is still waiting for the -+ * completion of one or more requests: -+ * prevent it from possibly being incorrectly -+ * deemed as soft real-time by setting its -+ * soft_rt_next_start to infinity. In fact, -+ * without this assignment, the application -+ * would be incorrectly deemed as soft -+ * real-time if: -+ * 1) it issued a new request before the -+ * completion of all its in-flight -+ * requests, and -+ * 2) at that time, its soft_rt_next_start -+ * happened to be in the past. 
-+ */ -+ bfqq->soft_rt_next_start = -+ bfq_greatest_from_now(); -+ /* -+ * Schedule an update of soft_rt_next_start to when -+ * the task may be discovered to be isochronous. -+ */ -+ bfq_mark_bfqq_softrt_update(bfqq); -+ } -+ } -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "expire (%d, slow %d, num_disp %d, idle_win %d, weight %d)", -+ reason, slow, bfqq->dispatched, -+ bfq_bfqq_idle_window(bfqq), entity->weight); -+ -+ /* -+ * Increase, decrease or leave budget unchanged according to -+ * reason. -+ */ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); -+ BUG_ON(bfqq->next_rq == NULL && -+ bfqq->entity.budget < bfqq->entity.service); -+ ref = bfqq->ref; -+ __bfq_bfqq_expire(bfqd, bfqq); -+ -+ BUG_ON(ref > 1 && -+ !bfq_bfqq_busy(bfqq) && reason == BFQ_BFQQ_BUDGET_EXHAUSTED && -+ !bfq_class_idle(bfqq)); -+ -+ /* mark bfqq as waiting a request only if a bic still points to it */ -+ if (ref > 1 && !bfq_bfqq_busy(bfqq) && -+ reason != BFQ_BFQQ_BUDGET_TIMEOUT && -+ reason != BFQ_BFQQ_BUDGET_EXHAUSTED) -+ bfq_mark_bfqq_non_blocking_wait_rq(bfqq); -+} -+ -+/* -+ * Budget timeout is not implemented through a dedicated timer, but -+ * just checked on request arrivals and completions, as well as on -+ * idle timer expirations. -+ */ -+static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) -+{ -+ return time_is_before_eq_jiffies(bfqq->budget_timeout); -+} -+ -+/* -+ * If we expire a queue that is actively waiting (i.e., with the -+ * device idled) for the arrival of a new request, then we may incur -+ * the timestamp misalignment problem described in the body of the -+ * function __bfq_activate_entity. Hence we return true only if this -+ * condition does not hold, or if the queue is slow enough to deserve -+ * only to be kicked off for preserving a high throughput. 
-+ */ -+static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) -+{ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "may_budget_timeout: wait_request %d left %d timeout %d", -+ bfq_bfqq_wait_request(bfqq), -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, -+ bfq_bfqq_budget_timeout(bfqq)); -+ -+ return (!bfq_bfqq_wait_request(bfqq) || -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3) -+ && -+ bfq_bfqq_budget_timeout(bfqq); -+} -+ -+/* -+ * For a queue that becomes empty, device idling is allowed only if -+ * this function returns true for that queue. As a consequence, since -+ * device idling plays a critical role for both throughput boosting -+ * and service guarantees, the return value of this function plays a -+ * critical role as well. -+ * -+ * In a nutshell, this function returns true only if idling is -+ * beneficial for throughput or, even if detrimental for throughput, -+ * idling is however necessary to preserve service guarantees (low -+ * latency, desired throughput distribution, ...). In particular, on -+ * NCQ-capable devices, this function tries to return false, so as to -+ * help keep the drives' internal queues full, whenever this helps the -+ * device boost the throughput without causing any service-guarantee -+ * issue. -+ * -+ * In more detail, the return value of this function is obtained by, -+ * first, computing a number of boolean variables that take into -+ * account throughput and service-guarantee issues, and, then, -+ * combining these variables in a logical expression. Most of the -+ * issues taken into account are not trivial. We discuss these issues -+ * while introducing the variables. 
-+ */ -+static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) -+{ -+ struct bfq_data *bfqd = bfqq->bfqd; -+ bool idling_boosts_thr, idling_boosts_thr_without_issues, -+ idling_needed_for_service_guarantees, -+ asymmetric_scenario; -+ -+ if (bfqd->strict_guarantees) -+ return true; -+ -+ /* -+ * The next variable takes into account the cases where idling -+ * boosts the throughput. -+ * -+ * The value of the variable is computed considering, first, that -+ * idling is virtually always beneficial for the throughput if: -+ * (a) the device is not NCQ-capable, or -+ * (b) regardless of the presence of NCQ, the device is rotational -+ * and the request pattern for bfqq is I/O-bound and sequential. -+ * -+ * Secondly, and in contrast to the above item (b), idling an -+ * NCQ-capable flash-based device would not boost the -+ * throughput even with sequential I/O; rather it would lower -+ * the throughput in proportion to how fast the device -+ * is. Accordingly, the next variable is true if any of the -+ * above conditions (a) and (b) is true, and, in particular, -+ * happens to be false if bfqd is an NCQ-capable flash-based -+ * device. -+ */ -+ idling_boosts_thr = !bfqd->hw_tag || -+ (!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq) && -+ bfq_bfqq_idle_window(bfqq)); -+ -+ /* -+ * The value of the next variable, -+ * idling_boosts_thr_without_issues, is equal to that of -+ * idling_boosts_thr, unless a special case holds. In this -+ * special case, described below, idling may cause problems to -+ * weight-raised queues. -+ * -+ * When the request pool is saturated (e.g., in the presence -+ * of write hogs), if the processes associated with -+ * non-weight-raised queues ask for requests at a lower rate, -+ * then processes associated with weight-raised queues have a -+ * higher probability to get a request from the pool -+ * immediately (or at least soon) when they need one. 
Thus -+ * they have a higher probability to actually get a fraction -+ * of the device throughput proportional to their high -+ * weight. This is especially true with NCQ-capable drives, -+ * which enqueue several requests in advance, and further -+ * reorder internally-queued requests. -+ * -+ * For this reason, we force to false the value of -+ * idling_boosts_thr_without_issues if there are weight-raised -+ * busy queues. In this case, and if bfqq is not weight-raised, -+ * this guarantees that the device is not idled for bfqq (if, -+ * instead, bfqq is weight-raised, then idling will be -+ * guaranteed by another variable, see below). Combined with -+ * the timestamping rules of BFQ (see [1] for details), this -+ * behavior causes bfqq, and hence any sync non-weight-raised -+ * queue, to get a lower number of requests served, and thus -+ * to ask for a lower number of requests from the request -+ * pool, before the busy weight-raised queues get served -+ * again. This often mitigates starvation problems in the -+ * presence of heavy write workloads and NCQ, thereby -+ * guaranteeing a higher application and system responsiveness -+ * in these hostile scenarios. -+ */ -+ idling_boosts_thr_without_issues = idling_boosts_thr && -+ bfqd->wr_busy_queues == 0; -+ -+ /* -+ * There is then a case where idling must be performed not -+ * for throughput concerns, but to preserve service -+ * guarantees. -+ * -+ * To introduce this case, we can note that allowing the drive -+ * to enqueue more than one request at a time, and hence -+ * delegating de facto final scheduling decisions to the -+ * drive's internal scheduler, entails loss of control on the -+ * actual request service order. In particular, the critical -+ * situation is when requests from different processes happen -+ * to be present, at the same time, in the internal queue(s) -+ * of the drive. 
In such a situation, the drive, by deciding -+ * the service order of the internally-queued requests, does -+ * determine also the actual throughput distribution among -+ * these processes. But the drive typically has no notion or -+ * concern about per-process throughput distribution, and -+ * makes its decisions only on a per-request basis. Therefore, -+ * the service distribution enforced by the drive's internal -+ * scheduler is likely to coincide with the desired -+ * device-throughput distribution only in a completely -+ * symmetric scenario where: -+ * (i) each of these processes must get the same throughput as -+ * the others; -+ * (ii) all these processes have the same I/O pattern -+ * (either sequential or random). -+ * In fact, in such a scenario, the drive will tend to treat -+ * the requests of each of these processes in about the same -+ * way as the requests of the others, and thus to provide -+ * each of these processes with about the same throughput -+ * (which is exactly the desired throughput distribution). In -+ * contrast, in any asymmetric scenario, device idling is -+ * certainly needed to guarantee that bfqq receives its -+ * assigned fraction of the device throughput (see [1] for -+ * details). -+ * -+ * We address this issue by controlling, actually, only the -+ * symmetry sub-condition (i), i.e., provided that -+ * sub-condition (i) holds, idling is not performed, -+ * regardless of whether sub-condition (ii) holds. In other -+ * words, only if sub-condition (i) holds, then idling is -+ * allowed, and the device tends to be prevented from queueing -+ * many requests, possibly of several processes. The reason -+ * for not controlling also sub-condition (ii) is that we -+ * exploit preemption to preserve guarantees in case of -+ * symmetric scenarios, even if (ii) does not hold, as -+ * explained in the next two paragraphs. 
-+ * -+ * Even if a queue, say Q, is expired when it remains idle, Q -+ * can still preempt the new in-service queue if the next -+ * request of Q arrives soon (see the comments on -+ * bfq_bfqq_update_budg_for_activation). If all queues and -+ * groups have the same weight, this form of preemption, -+ * combined with the hole-recovery heuristic described in the -+ * comments on function bfq_bfqq_update_budg_for_activation, -+ * are enough to preserve a correct bandwidth distribution in -+ * the mid term, even without idling. In fact, even if not -+ * idling allows the internal queues of the device to contain -+ * many requests, and thus to reorder requests, we can rather -+ * safely assume that the internal scheduler still preserves a -+ * minimum of mid-term fairness. The motivation for using -+ * preemption instead of idling is that, by not idling, -+ * service guarantees are preserved without minimally -+ * sacrificing throughput. In other words, both a high -+ * throughput and its desired distribution are obtained. -+ * -+ * More precisely, this preemption-based, idleless approach -+ * provides fairness in terms of IOPS, and not sectors per -+ * second. This can be seen with a simple example. Suppose -+ * that there are two queues with the same weight, but that -+ * the first queue receives requests of 8 sectors, while the -+ * second queue receives requests of 1024 sectors. In -+ * addition, suppose that each of the two queues contains at -+ * most one request at a time, which implies that each queue -+ * always remains idle after it is served. Finally, after -+ * remaining idle, each queue receives very quickly a new -+ * request. It follows that the two queues are served -+ * alternatively, preempting each other if needed. This -+ * implies that, although both queues have the same weight, -+ * the queue with large requests receives a service that is -+ * 1024/8 times as high as the service received by the other -+ * queue. 
-+ * -+ * On the other hand, device idling is performed, and thus -+ * pure sector-domain guarantees are provided, for the -+ * following queues, which are likely to need stronger -+ * throughput guarantees: weight-raised queues, and queues -+ * with a higher weight than other queues. When such queues -+ * are active, sub-condition (i) is false, which triggers -+ * device idling. -+ * -+ * According to the above considerations, the next variable is -+ * true (only) if sub-condition (i) holds. To compute the -+ * value of this variable, we not only use the return value of -+ * the function bfq_symmetric_scenario(), but also check -+ * whether bfqq is being weight-raised, because -+ * bfq_symmetric_scenario() does not take into account also -+ * weight-raised queues (see comments on -+ * bfq_weights_tree_add()). -+ * -+ * As a side note, it is worth considering that the above -+ * device-idling countermeasures may however fail in the -+ * following unlucky scenario: if idling is (correctly) -+ * disabled in a time period during which all symmetry -+ * sub-conditions hold, and hence the device is allowed to -+ * enqueue many requests, but at some later point in time some -+ * sub-condition stops to hold, then it may become impossible -+ * to let requests be served in the desired order until all -+ * the requests already queued in the device have been served. -+ */ -+ asymmetric_scenario = bfqq->wr_coeff > 1 || -+ !bfq_symmetric_scenario(bfqd); -+ -+ /* -+ * Finally, there is a case where maximizing throughput is the -+ * best choice even if it may cause unfairness toward -+ * bfqq. Such a case is when bfqq became active in a burst of -+ * queue activations. Queues that became active during a large -+ * burst benefit only from throughput, as discussed in the -+ * comments on bfq_handle_burst. 
Thus, if bfqq became active -+ * in a burst and not idling the device maximizes throughput, -+ * then the device must no be idled, because not idling the -+ * device provides bfqq and all other queues in the burst with -+ * maximum benefit. Combining this and the above case, we can -+ * now establish when idling is actually needed to preserve -+ * service guarantees. -+ */ -+ idling_needed_for_service_guarantees = -+ asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq); -+ -+ /* -+ * We have now all the components we need to compute the return -+ * value of the function, which is true only if both the following -+ * conditions hold: -+ * 1) bfqq is sync, because idling make sense only for sync queues; -+ * 2) idling either boosts the throughput (without issues), or -+ * is necessary to preserve service guarantees. -+ */ -+ bfq_log_bfqq(bfqd, bfqq, "may_idle: sync %d idling_boosts_thr %d", -+ bfq_bfqq_sync(bfqq), idling_boosts_thr); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "may_idle: wr_busy %d boosts %d IO-bound %d guar %d", -+ bfqd->wr_busy_queues, -+ idling_boosts_thr_without_issues, -+ bfq_bfqq_IO_bound(bfqq), -+ idling_needed_for_service_guarantees); -+ -+ return bfq_bfqq_sync(bfqq) && -+ (idling_boosts_thr_without_issues || -+ idling_needed_for_service_guarantees); -+} -+ -+/* -+ * If the in-service queue is empty but the function bfq_bfqq_may_idle -+ * returns true, then: -+ * 1) the queue must remain in service and cannot be expired, and -+ * 2) the device must be idled to wait for the possible arrival of a new -+ * request for the queue. -+ * See the comments on the function bfq_bfqq_may_idle for the reasons -+ * why performing device idling is the best choice to boost the throughput -+ * and preserve service guarantees when bfq_bfqq_may_idle itself -+ * returns true. 
-+ */ -+static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) -+{ -+ struct bfq_data *bfqd = bfqq->bfqd; -+ -+ return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 && -+ bfq_bfqq_may_idle(bfqq); -+} -+ -+/* -+ * Select a queue for service. If we have a current queue in service, -+ * check whether to continue servicing it, or retrieve and set a new one. -+ */ -+static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq; -+ struct request *next_rq; -+ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT; -+ -+ bfqq = bfqd->in_service_queue; -+ if (!bfqq) -+ goto new_queue; -+ -+ bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); -+ -+ if (bfq_may_expire_for_budg_timeout(bfqq) && -+ !hrtimer_active(&bfqd->idle_slice_timer) && -+ !bfq_bfqq_must_idle(bfqq)) -+ goto expire; -+ -+check_queue: -+ /* -+ * This loop is rarely executed more than once. Even when it -+ * happens, it is much more convenient to re-execute this loop -+ * than to return NULL and trigger a new dispatch to get a -+ * request served. -+ */ -+ next_rq = bfqq->next_rq; -+ /* -+ * If bfqq has requests queued and it has enough budget left to -+ * serve them, keep the queue, otherwise expire it. -+ */ -+ if (next_rq) { -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ -+ if (bfq_serv_to_charge(next_rq, bfqq) > -+ bfq_bfqq_budget_left(bfqq)) { -+ /* -+ * Expire the queue for budget exhaustion, -+ * which makes sure that the next budget is -+ * enough to serve the next request, even if -+ * it comes from the fifo expired path. -+ */ -+ reason = BFQ_BFQQ_BUDGET_EXHAUSTED; -+ goto expire; -+ } else { -+ /* -+ * The idle timer may be pending because we may -+ * not disable disk idling even when a new request -+ * arrives. 
-+ */ -+ if (bfq_bfqq_wait_request(bfqq)) { -+ BUG_ON(!hrtimer_active(&bfqd->idle_slice_timer)); -+ /* -+ * If we get here: 1) at least a new request -+ * has arrived but we have not disabled the -+ * timer because the request was too small, -+ * 2) then the block layer has unplugged -+ * the device, causing the dispatch to be -+ * invoked. -+ * -+ * Since the device is unplugged, now the -+ * requests are probably large enough to -+ * provide a reasonable throughput. -+ * So we disable idling. -+ */ -+ bfq_clear_bfqq_wait_request(bfqq); -+ hrtimer_try_to_cancel(&bfqd->idle_slice_timer); -+ bfqg_stats_update_idle_time(bfqq_group(bfqq)); -+ } -+ goto keep_queue; -+ } -+ } -+ -+ /* -+ * No requests pending. However, if the in-service queue is idling -+ * for a new request, or has requests waiting for a completion and -+ * may idle after their completion, then keep it anyway. -+ */ -+ if (hrtimer_active(&bfqd->idle_slice_timer) || -+ (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) { -+ bfqq = NULL; -+ goto keep_queue; -+ } -+ -+ reason = BFQ_BFQQ_NO_MORE_REQUESTS; -+expire: -+ bfq_bfqq_expire(bfqd, bfqq, false, reason); -+new_queue: -+ bfqq = bfq_set_in_service_queue(bfqd); -+ if (bfqq) { -+ bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue"); -+ goto check_queue; -+ } -+keep_queue: -+ if (bfqq) -+ bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue"); -+ else -+ bfq_log(bfqd, "select_queue: no queue returned"); -+ -+ return bfqq; -+} -+ -+static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */ -+ BUG_ON(bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && -+ time_is_after_jiffies(bfqq->last_wr_start_finish)); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "raising period dur %u/%u msec, old coeff %u, w %d(%d)", -+ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), -+ jiffies_to_msecs(bfqq->wr_cur_max_time), -+ 
bfqq->wr_coeff, -+ bfqq->entity.weight, bfqq->entity.orig_weight); -+ -+ BUG_ON(bfqq != bfqd->in_service_queue && entity->weight != -+ entity->orig_weight * bfqq->wr_coeff); -+ if (entity->prio_changed) -+ bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change"); -+ -+ /* -+ * If the queue was activated in a burst, or too much -+ * time has elapsed from the beginning of this -+ * weight-raising period, then end weight raising. -+ */ -+ if (bfq_bfqq_in_large_burst(bfqq)) -+ bfq_bfqq_end_wr(bfqq); -+ else if (time_is_before_jiffies(bfqq->last_wr_start_finish + -+ bfqq->wr_cur_max_time)) { -+ if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time || -+ time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt + -+ bfq_wr_duration(bfqd))) -+ bfq_bfqq_end_wr(bfqq); -+ else { -+ /* switch back to interactive wr */ -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ bfqq->last_wr_start_finish = -+ bfqq->wr_start_at_switch_to_srt; -+ BUG_ON(time_is_after_jiffies( -+ bfqq->last_wr_start_finish)); -+ bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqd, bfqq, -+ "back to interactive wr"); -+ } -+ } -+ } -+ /* -+ * To improve latency (for this or other queues), immediately -+ * update weight both if it must be raised and if it must be -+ * lowered. Since, entity may be on some active tree here, and -+ * might have a pending change of its ioprio class, invoke -+ * next function with the last parameter unset (see the -+ * comments on the function). -+ */ -+ if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1)) -+ __bfq_entity_update_weight_prio(bfq_entity_service_tree(entity), -+ entity, false); -+} -+ -+/* -+ * Dispatch one request from bfqq, moving it to the request queue -+ * dispatch list. 
-+ */ -+static int bfq_dispatch_request(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ int dispatched = 0; -+ struct request *rq = bfqq->next_rq; -+ unsigned long service_to_charge; -+ -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ BUG_ON(!rq); -+ service_to_charge = bfq_serv_to_charge(rq, bfqq); -+ -+ BUG_ON(service_to_charge > bfq_bfqq_budget_left(bfqq)); -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ -+ bfq_bfqq_served(bfqq, service_to_charge); -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ -+ bfq_dispatch_insert(bfqd->queue, rq); -+ -+ /* -+ * If weight raising has to terminate for bfqq, then next -+ * function causes an immediate update of bfqq's weight, -+ * without waiting for next activation. As a consequence, on -+ * expiration, bfqq will be timestamped as if has never been -+ * weight-raised during this service slot, even if it has -+ * received part or even most of the service as a -+ * weight-raised queue. This inflates bfqq's timestamps, which -+ * is beneficial, as bfqq is then more willing to leave the -+ * device immediately to possible other weight-raised queues. 
-+ */ -+ bfq_update_wr_data(bfqd, bfqq); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "dispatched %u sec req (%llu), budg left %d", -+ blk_rq_sectors(rq), -+ (unsigned long long) blk_rq_pos(rq), -+ bfq_bfqq_budget_left(bfqq)); -+ -+ dispatched++; -+ -+ if (!bfqd->in_service_bic) { -+ atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); -+ bfqd->in_service_bic = RQ_BIC(rq); -+ } -+ -+ if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq)) -+ goto expire; -+ -+ return dispatched; -+ -+expire: -+ bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_BUDGET_EXHAUSTED); -+ return dispatched; -+} -+ -+static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq) -+{ -+ int dispatched = 0; -+ -+ while (bfqq->next_rq) { -+ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq); -+ dispatched++; -+ } -+ -+ BUG_ON(!list_empty(&bfqq->fifo)); -+ return dispatched; -+} -+ -+/* -+ * Drain our current requests. -+ * Used for barriers and when switching io schedulers on-the-fly. -+ */ -+static int bfq_forced_dispatch(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq, *n; -+ struct bfq_service_tree *st; -+ int dispatched = 0; -+ -+ bfqq = bfqd->in_service_queue; -+ if (bfqq) -+ __bfq_bfqq_expire(bfqd, bfqq); -+ -+ /* -+ * Loop through classes, and be careful to leave the scheduler -+ * in a consistent state, as feedback mechanisms and vtime -+ * updates cannot be disabled during the process. 
-+ */ -+ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) { -+ st = bfq_entity_service_tree(&bfqq->entity); -+ -+ dispatched += __bfq_forced_dispatch_bfqq(bfqq); -+ -+ bfqq->max_budget = bfq_max_budget(bfqd); -+ bfq_forget_idle(st); -+ } -+ -+ BUG_ON(bfqd->busy_queues != 0); -+ -+ return dispatched; -+} -+ -+static int bfq_dispatch_requests(struct request_queue *q, int force) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct bfq_queue *bfqq; -+ -+ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); -+ -+ if (bfqd->busy_queues == 0) -+ return 0; -+ -+ if (unlikely(force)) -+ return bfq_forced_dispatch(bfqd); -+ -+ /* -+ * Force device to serve one request at a time if -+ * strict_guarantees is true. Forcing this service scheme is -+ * currently the ONLY way to guarantee that the request -+ * service order enforced by the scheduler is respected by a -+ * queueing device. Otherwise the device is free even to make -+ * some unlucky request wait for as long as the device -+ * wishes. -+ * -+ * Of course, serving one request at at time may cause loss of -+ * throughput. -+ */ -+ if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0) -+ return 0; -+ -+ bfqq = bfq_select_queue(bfqd); -+ if (!bfqq) -+ return 0; -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ -+ BUG_ON(bfq_bfqq_wait_request(bfqq)); -+ -+ if (!bfq_dispatch_request(bfqd, bfqq)) -+ return 0; -+ -+ bfq_log_bfqq(bfqd, bfqq, "dispatched %s request", -+ bfq_bfqq_sync(bfqq) ? "sync" : "async"); -+ -+ BUG_ON(bfqq->next_rq == NULL && -+ bfqq->entity.budget < bfqq->entity.service); -+ return 1; -+} -+ -+/* -+ * Task holds one reference to the queue, dropped when task exits. Each rq -+ * in-flight on this queue also holds a reference, dropped when rq is freed. -+ * -+ * Queue lock must be held here. Recall not to use bfqq after calling -+ * this function on it. 
-+ */ -+static void bfq_put_queue(struct bfq_queue *bfqq) -+{ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ struct bfq_group *bfqg = bfqq_group(bfqq); -+#endif -+ -+ BUG_ON(bfqq->ref <= 0); -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref); -+ bfqq->ref--; -+ if (bfqq->ref) -+ return; -+ -+ BUG_ON(rb_first(&bfqq->sort_list)); -+ BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); -+ BUG_ON(bfqq->entity.tree); -+ BUG_ON(bfq_bfqq_busy(bfqq)); -+ -+ if (bfq_bfqq_sync(bfqq)) -+ /* -+ * The fact that this queue is being destroyed does not -+ * invalidate the fact that this queue may have been -+ * activated during the current burst. As a consequence, -+ * although the queue does not exist anymore, and hence -+ * needs to be removed from the burst list if there, -+ * the burst size has not to be decremented. -+ */ -+ hlist_del_init(&bfqq->burst_list_node); -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); -+ -+ kmem_cache_free(bfq_pool, bfqq); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ bfqg_put(bfqg); -+#endif -+} -+ -+static void bfq_put_cooperator(struct bfq_queue *bfqq) -+{ -+ struct bfq_queue *__bfqq, *next; -+ -+ /* -+ * If this queue was scheduled to merge with another queue, be -+ * sure to drop the reference taken on that queue (and others in -+ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs. 
-+ */ -+ __bfqq = bfqq->new_bfqq; -+ while (__bfqq) { -+ if (__bfqq == bfqq) -+ break; -+ next = __bfqq->new_bfqq; -+ bfq_put_queue(__bfqq); -+ __bfqq = next; -+ } -+} -+ -+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ if (bfqq == bfqd->in_service_queue) { -+ __bfq_bfqq_expire(bfqd, bfqq); -+ bfq_schedule_dispatch(bfqd); -+ } -+ -+ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref); -+ -+ bfq_put_cooperator(bfqq); -+ -+ bfq_put_queue(bfqq); /* release process reference */ -+} -+ -+static void bfq_init_icq(struct io_cq *icq) -+{ -+ icq_to_bic(icq)->ttime.last_end_request = ktime_get_ns() - (1ULL<<32); -+} -+ -+static void bfq_exit_icq(struct io_cq *icq) -+{ -+ struct bfq_io_cq *bic = icq_to_bic(icq); -+ struct bfq_data *bfqd = bic_to_bfqd(bic); -+ -+ if (bic_to_bfqq(bic, false)) { -+ bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, false)); -+ bic_set_bfqq(bic, NULL, false); -+ } -+ -+ if (bic_to_bfqq(bic, true)) { -+ /* -+ * If the bic is using a shared queue, put the reference -+ * taken on the io_context when the bic started using a -+ * shared bfq_queue. -+ */ -+ if (bfq_bfqq_coop(bic_to_bfqq(bic, true))) -+ put_io_context(icq->ioc); -+ bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, true)); -+ bic_set_bfqq(bic, NULL, true); -+ } -+} -+ -+/* -+ * Update the entity prio values; note that the new values will not -+ * be used until the next (re)activation. -+ */ -+static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, -+ struct bfq_io_cq *bic) -+{ -+ struct task_struct *tsk = current; -+ int ioprio_class; -+ -+ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); -+ switch (ioprio_class) { -+ default: -+ dev_err(bfqq->bfqd->queue->backing_dev_info->dev, -+ "bfq: bad prio class %d\n", ioprio_class); -+ case IOPRIO_CLASS_NONE: -+ /* -+ * No prio set, inherit CPU scheduling settings. 
-+ */ -+ bfqq->new_ioprio = task_nice_ioprio(tsk); -+ bfqq->new_ioprio_class = task_nice_ioclass(tsk); -+ break; -+ case IOPRIO_CLASS_RT: -+ bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); -+ bfqq->new_ioprio_class = IOPRIO_CLASS_RT; -+ break; -+ case IOPRIO_CLASS_BE: -+ bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); -+ bfqq->new_ioprio_class = IOPRIO_CLASS_BE; -+ break; -+ case IOPRIO_CLASS_IDLE: -+ bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE; -+ bfqq->new_ioprio = 7; -+ bfq_clear_bfqq_idle_window(bfqq); -+ break; -+ } -+ -+ if (bfqq->new_ioprio >= IOPRIO_BE_NR) { -+ pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n", -+ bfqq->new_ioprio); -+ BUG(); -+ } -+ -+ bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); -+ bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "set_next_ioprio_data: bic_class %d prio %d class %d", -+ ioprio_class, bfqq->new_ioprio, bfqq->new_ioprio_class); -+} -+ -+static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) -+{ -+ struct bfq_data *bfqd = bic_to_bfqd(bic); -+ struct bfq_queue *bfqq; -+ unsigned long uninitialized_var(flags); -+ int ioprio = bic->icq.ioc->ioprio; -+ -+ /* -+ * This condition may trigger on a newly created bic, be sure to -+ * drop the lock before returning. 
-+ */ -+ if (unlikely(!bfqd) || likely(bic->ioprio == ioprio)) -+ return; -+ -+ bic->ioprio = ioprio; -+ -+ bfqq = bic_to_bfqq(bic, false); -+ if (bfqq) { -+ /* release process reference on this queue */ -+ bfq_put_queue(bfqq); -+ bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); -+ bic_set_bfqq(bic, bfqq, false); -+ bfq_log_bfqq(bfqd, bfqq, -+ "check_ioprio_change: bfqq %p %d", -+ bfqq, bfqq->ref); -+ } -+ -+ bfqq = bic_to_bfqq(bic, true); -+ if (bfqq) -+ bfq_set_next_ioprio_data(bfqq, bic); -+} -+ -+static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct bfq_io_cq *bic, pid_t pid, int is_sync) -+{ -+ RB_CLEAR_NODE(&bfqq->entity.rb_node); -+ INIT_LIST_HEAD(&bfqq->fifo); -+ INIT_HLIST_NODE(&bfqq->burst_list_node); -+ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); -+ -+ bfqq->ref = 0; -+ bfqq->bfqd = bfqd; -+ -+ if (bic) -+ bfq_set_next_ioprio_data(bfqq, bic); -+ -+ if (is_sync) { -+ if (!bfq_class_idle(bfqq)) -+ bfq_mark_bfqq_idle_window(bfqq); -+ bfq_mark_bfqq_sync(bfqq); -+ bfq_mark_bfqq_just_created(bfqq); -+ } else -+ bfq_clear_bfqq_sync(bfqq); -+ bfq_mark_bfqq_IO_bound(bfqq); -+ -+ /* Tentative initial value to trade off between thr and lat */ -+ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3; -+ bfqq->pid = pid; -+ -+ bfqq->wr_coeff = 1; -+ bfqq->last_wr_start_finish = jiffies; -+ bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now(); -+ bfqq->budget_timeout = bfq_smallest_from_now(); -+ bfqq->split_time = bfq_smallest_from_now(); -+ -+ /* -+ * Set to the value for which bfqq will not be deemed as -+ * soft rt when it becomes backlogged. 
-+ */ -+ bfqq->soft_rt_next_start = bfq_greatest_from_now(); -+ -+ /* first request is almost certainly seeky */ -+ bfqq->seek_history = 1; -+} -+ -+static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, -+ struct bfq_group *bfqg, -+ int ioprio_class, int ioprio) -+{ -+ switch (ioprio_class) { -+ case IOPRIO_CLASS_RT: -+ return &bfqg->async_bfqq[0][ioprio]; -+ case IOPRIO_CLASS_NONE: -+ ioprio = IOPRIO_NORM; -+ /* fall through */ -+ case IOPRIO_CLASS_BE: -+ return &bfqg->async_bfqq[1][ioprio]; -+ case IOPRIO_CLASS_IDLE: -+ return &bfqg->async_idle_bfqq; -+ default: -+ BUG(); -+ } -+} -+ -+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, -+ struct bio *bio, bool is_sync, -+ struct bfq_io_cq *bic) -+{ -+ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); -+ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); -+ struct bfq_queue **async_bfqq = NULL; -+ struct bfq_queue *bfqq; -+ struct bfq_group *bfqg; -+ -+ rcu_read_lock(); -+ -+ bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio)); -+ if (!bfqg) { -+ bfqq = &bfqd->oom_bfqq; -+ goto out; -+ } -+ -+ if (!is_sync) { -+ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, -+ ioprio); -+ bfqq = *async_bfqq; -+ if (bfqq) -+ goto out; -+ } -+ -+ bfqq = kmem_cache_alloc_node(bfq_pool, -+ GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN, -+ bfqd->queue->node); -+ -+ if (bfqq) { -+ bfq_init_bfqq(bfqd, bfqq, bic, current->pid, -+ is_sync); -+ bfq_init_entity(&bfqq->entity, bfqg); -+ bfq_log_bfqq(bfqd, bfqq, "allocated"); -+ } else { -+ bfqq = &bfqd->oom_bfqq; -+ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); -+ goto out; -+ } -+ -+ /* -+ * Pin the queue now that it's allocated, scheduler exit will -+ * prune it. -+ */ -+ if (async_bfqq) { -+ bfqq->ref++; /* -+ * Extra group reference, w.r.t. sync -+ * queue. This extra reference is removed -+ * only if bfqq->bfqg disappears, to -+ * guarantee that this queue is not freed -+ * until its group goes away. 
-+ */ -+ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", -+ bfqq, bfqq->ref); -+ *async_bfqq = bfqq; -+ } -+ -+out: -+ bfqq->ref++; /* get a process reference to this queue */ -+ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref); -+ rcu_read_unlock(); -+ return bfqq; -+} -+ -+static void bfq_update_io_thinktime(struct bfq_data *bfqd, -+ struct bfq_io_cq *bic) -+{ -+ struct bfq_ttime *ttime = &bic->ttime; -+ u64 elapsed = ktime_get_ns() - bic->ttime.last_end_request; -+ -+ elapsed = min_t(u64, elapsed, 2 * bfqd->bfq_slice_idle); -+ -+ ttime->ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; -+ ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8); -+ ttime->ttime_mean = div64_ul(ttime->ttime_total + 128, -+ ttime->ttime_samples); -+} -+ -+static void -+bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct request *rq) -+{ -+ bfqq->seek_history <<= 1; -+ bfqq->seek_history |= -+ get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR && -+ (!blk_queue_nonrot(bfqd->queue) || -+ blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT); -+} -+ -+/* -+ * Disable idle window if the process thinks too long or seeks so much that -+ * it doesn't matter. -+ */ -+static void bfq_update_idle_window(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ struct bfq_io_cq *bic) -+{ -+ int enable_idle; -+ -+ /* Don't idle for async or idle io prio class. */ -+ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) -+ return; -+ -+ /* Idle window just restored, statistics are meaningless. 
*/ -+ if (time_is_after_eq_jiffies(bfqq->split_time + -+ bfqd->bfq_wr_min_idle_time)) -+ return; -+ -+ enable_idle = bfq_bfqq_idle_window(bfqq); -+ -+ if (atomic_read(&bic->icq.ioc->active_ref) == 0 || -+ bfqd->bfq_slice_idle == 0 || -+ (bfqd->hw_tag && BFQQ_SEEKY(bfqq) && -+ bfqq->wr_coeff == 1)) -+ enable_idle = 0; -+ else if (bfq_sample_valid(bic->ttime.ttime_samples)) { -+ if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle && -+ bfqq->wr_coeff == 1) -+ enable_idle = 0; -+ else -+ enable_idle = 1; -+ } -+ bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d", -+ enable_idle); -+ -+ if (enable_idle) -+ bfq_mark_bfqq_idle_window(bfqq); -+ else -+ bfq_clear_bfqq_idle_window(bfqq); -+} -+ -+/* -+ * Called when a new fs request (rq) is added to bfqq. Check if there's -+ * something we should do about it. -+ */ -+static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct request *rq) -+{ -+ struct bfq_io_cq *bic = RQ_BIC(rq); -+ -+ if (rq->cmd_flags & REQ_META) -+ bfqq->meta_pending++; -+ -+ bfq_update_io_thinktime(bfqd, bic); -+ bfq_update_io_seektime(bfqd, bfqq, rq); -+ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || -+ !BFQQ_SEEKY(bfqq)) -+ bfq_update_idle_window(bfqd, bfqq, bic); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "rq_enqueued: idle_window=%d (seeky %d)", -+ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq)); -+ -+ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); -+ -+ if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) { -+ bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 && -+ blk_rq_sectors(rq) < 32; -+ bool budget_timeout = bfq_bfqq_budget_timeout(bfqq); -+ -+ /* -+ * There is just this request queued: if the request -+ * is small and the queue is not to be expired, then -+ * just exit. -+ * -+ * In this way, if the device is being idled to wait -+ * for a new request from the in-service queue, we -+ * avoid unplugging the device and committing the -+ * device to serve just a small request. 
On the -+ * contrary, we wait for the block layer to decide -+ * when to unplug the device: hopefully, new requests -+ * will be merged to this one quickly, then the device -+ * will be unplugged and larger requests will be -+ * dispatched. -+ */ -+ if (small_req && !budget_timeout) -+ return; -+ -+ /* -+ * A large enough request arrived, or the queue is to -+ * be expired: in both cases disk idling is to be -+ * stopped, so clear wait_request flag and reset -+ * timer. -+ */ -+ bfq_clear_bfqq_wait_request(bfqq); -+ hrtimer_try_to_cancel(&bfqd->idle_slice_timer); -+ bfqg_stats_update_idle_time(bfqq_group(bfqq)); -+ -+ /* -+ * The queue is not empty, because a new request just -+ * arrived. Hence we can safely expire the queue, in -+ * case of budget timeout, without risking that the -+ * timestamps of the queue are not updated correctly. -+ * See [1] for more details. -+ */ -+ if (budget_timeout) -+ bfq_bfqq_expire(bfqd, bfqq, false, -+ BFQ_BFQQ_BUDGET_TIMEOUT); -+ -+ /* -+ * Let the request rip immediately, or let a new queue be -+ * selected if bfqq has just been expired. -+ */ -+ __blk_run_queue(bfqd->queue); -+ } -+} -+ -+static void bfq_insert_request(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; -+ -+ assert_spin_locked(bfqd->queue->queue_lock); -+ -+ /* -+ * An unplug may trigger a requeue of a request from the device -+ * driver: make sure we are in process context while trying to -+ * merge two bfq_queues. -+ */ -+ if (!in_interrupt()) { -+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true); -+ if (new_bfqq) { -+ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq) -+ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1); -+ /* -+ * Release the request's reference to the old bfqq -+ * and make sure one is taken to the shared queue. 
-+ */ -+ new_bfqq->allocated[rq_data_dir(rq)]++; -+ bfqq->allocated[rq_data_dir(rq)]--; -+ new_bfqq->ref++; -+ bfq_clear_bfqq_just_created(bfqq); -+ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) -+ bfq_merge_bfqqs(bfqd, RQ_BIC(rq), -+ bfqq, new_bfqq); -+ /* -+ * rq is about to be enqueued into new_bfqq, -+ * release rq reference on bfqq -+ */ -+ bfq_put_queue(bfqq); -+ rq->elv.priv[1] = new_bfqq; -+ bfqq = new_bfqq; -+ } -+ } -+ -+ bfq_add_request(rq); -+ -+ rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; -+ list_add_tail(&rq->queuelist, &bfqq->fifo); -+ -+ bfq_rq_enqueued(bfqd, bfqq, rq); -+} -+ -+static void bfq_update_hw_tag(struct bfq_data *bfqd) -+{ -+ bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver, -+ bfqd->rq_in_driver); -+ -+ if (bfqd->hw_tag == 1) -+ return; -+ -+ /* -+ * This sample is valid if the number of outstanding requests -+ * is large enough to allow a queueing behavior. Note that the -+ * sum is not exact, as it's not taking into account deactivated -+ * requests. 
-+ */ -+ if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD) -+ return; -+ -+ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) -+ return; -+ -+ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD; -+ bfqd->max_rq_in_driver = 0; -+ bfqd->hw_tag_samples = 0; -+} -+ -+static void bfq_completed_request(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ struct bfq_data *bfqd = bfqq->bfqd; -+ u64 now_ns; -+ u32 delta_us; -+ -+ bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left", -+ blk_rq_sectors(rq)); -+ -+ assert_spin_locked(bfqd->queue->queue_lock); -+ bfq_update_hw_tag(bfqd); -+ -+ BUG_ON(!bfqd->rq_in_driver); -+ BUG_ON(!bfqq->dispatched); -+ bfqd->rq_in_driver--; -+ bfqq->dispatched--; -+ bfqg_stats_update_completion(bfqq_group(bfqq), -+ rq_start_time_ns(rq), -+ rq_io_start_time_ns(rq), -+ rq->cmd_flags); -+ -+ if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); -+ /* -+ * Set budget_timeout (which we overload to store the -+ * time at which the queue remains with no backlog and -+ * no outstanding request; used by the weight-raising -+ * mechanism). -+ */ -+ bfqq->budget_timeout = jiffies; -+ -+ bfq_weights_tree_remove(bfqd, &bfqq->entity, -+ &bfqd->queue_weights_tree); -+ } -+ -+ now_ns = ktime_get_ns(); -+ -+ RQ_BIC(rq)->ttime.last_end_request = now_ns; -+ -+ /* -+ * Using us instead of ns, to get a reasonable precision in -+ * computing rate in next check. 
-+ */ -+ delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC); -+ -+ bfq_log(bfqd, "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu", -+ delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size, -+ (USEC_PER_SEC* -+ (u64)((bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us)) -+ >>BFQ_RATE_SHIFT, -+ (USEC_PER_SEC*(u64)(1UL<<(BFQ_RATE_SHIFT-10)))>>BFQ_RATE_SHIFT); -+ -+ /* -+ * If the request took rather long to complete, and, according -+ * to the maximum request size recorded, this completion latency -+ * implies that the request was certainly served at a very low -+ * rate (less than 1M sectors/sec), then the whole observation -+ * interval that lasts up to this time instant cannot be a -+ * valid time interval for computing a new peak rate. Invoke -+ * bfq_update_rate_reset to have the following three steps -+ * taken: -+ * - close the observation interval at the last (previous) -+ * request dispatch or completion -+ * - compute rate, if possible, for that observation interval -+ * - reset to zero samples, which will trigger a proper -+ * re-initialization of the observation interval on next -+ * dispatch -+ */ -+ if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC && -+ (bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us < -+ 1UL<<(BFQ_RATE_SHIFT - 10)) -+ bfq_update_rate_reset(bfqd, NULL); -+ bfqd->last_completion = now_ns; -+ -+ /* -+ * If we are waiting to discover whether the request pattern -+ * of the task associated with the queue is actually -+ * isochronous, and both requisites for this condition to hold -+ * are now satisfied, then compute soft_rt_next_start (see the -+ * comments on the function bfq_bfqq_softrt_next_start()). We -+ * schedule this delayed check when bfqq expires, if it still -+ * has in-flight requests. 
-+ */ -+ if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 && -+ RB_EMPTY_ROOT(&bfqq->sort_list)) -+ bfqq->soft_rt_next_start = -+ bfq_bfqq_softrt_next_start(bfqd, bfqq); -+ -+ /* -+ * If this is the in-service queue, check if it needs to be expired, -+ * or if we want to idle in case it has no pending requests. -+ */ -+ if (bfqd->in_service_queue == bfqq) { -+ if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) { -+ bfq_arm_slice_timer(bfqd); -+ goto out; -+ } else if (bfq_may_expire_for_budg_timeout(bfqq)) -+ bfq_bfqq_expire(bfqd, bfqq, false, -+ BFQ_BFQQ_BUDGET_TIMEOUT); -+ else if (RB_EMPTY_ROOT(&bfqq->sort_list) && -+ (bfqq->dispatched == 0 || -+ !bfq_bfqq_may_idle(bfqq))) -+ bfq_bfqq_expire(bfqd, bfqq, false, -+ BFQ_BFQQ_NO_MORE_REQUESTS); -+ } -+ -+ if (!bfqd->rq_in_driver) -+ bfq_schedule_dispatch(bfqd); -+ -+out: -+ return; -+} -+ -+static int __bfq_may_queue(struct bfq_queue *bfqq) -+{ -+ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) { -+ bfq_clear_bfqq_must_alloc(bfqq); -+ return ELV_MQUEUE_MUST; -+ } -+ -+ return ELV_MQUEUE_MAY; -+} -+ -+static int bfq_may_queue(struct request_queue *q, unsigned int op) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct task_struct *tsk = current; -+ struct bfq_io_cq *bic; -+ struct bfq_queue *bfqq; -+ -+ /* -+ * Don't force setup of a queue from here, as a call to may_queue -+ * does not necessarily imply that a request actually will be -+ * queued. So just lookup a possibly existing queue, or return -+ * 'may queue' if that fails. -+ */ -+ bic = bfq_bic_lookup(bfqd, tsk->io_context); -+ if (!bic) -+ return ELV_MQUEUE_MAY; -+ -+ bfqq = bic_to_bfqq(bic, op_is_sync(op)); -+ if (bfqq) -+ return __bfq_may_queue(bfqq); -+ -+ return ELV_MQUEUE_MAY; -+} -+ -+/* -+ * Queue lock held here. 
-+ */ -+static void bfq_put_request(struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ -+ if (bfqq) { -+ const int rw = rq_data_dir(rq); -+ -+ BUG_ON(!bfqq->allocated[rw]); -+ bfqq->allocated[rw]--; -+ -+ rq->elv.priv[0] = NULL; -+ rq->elv.priv[1] = NULL; -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", -+ bfqq, bfqq->ref); -+ bfq_put_queue(bfqq); -+ } -+} -+ -+/* -+ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this -+ * was the last process referring to that bfqq. -+ */ -+static struct bfq_queue * -+bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) -+{ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); -+ -+ put_io_context(bic->icq.ioc); -+ -+ if (bfqq_process_refs(bfqq) == 1) { -+ bfqq->pid = current->pid; -+ bfq_clear_bfqq_coop(bfqq); -+ bfq_clear_bfqq_split_coop(bfqq); -+ return bfqq; -+ } -+ -+ bic_set_bfqq(bic, NULL, 1); -+ -+ bfq_put_cooperator(bfqq); -+ -+ bfq_put_queue(bfqq); -+ return NULL; -+} -+ -+/* -+ * Allocate bfq data structures associated with this request. 
-+ */ -+static int bfq_set_request(struct request_queue *q, struct request *rq, -+ struct bio *bio, gfp_t gfp_mask) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq); -+ const int rw = rq_data_dir(rq); -+ const int is_sync = rq_is_sync(rq); -+ struct bfq_queue *bfqq; -+ unsigned long flags; -+ bool bfqq_already_existing = false, split = false; -+ -+ spin_lock_irqsave(q->queue_lock, flags); -+ -+ if (!bic) -+ goto queue_fail; -+ -+ bfq_check_ioprio_change(bic, bio); -+ -+ bfq_bic_update_cgroup(bic, bio); -+ -+new_queue: -+ bfqq = bic_to_bfqq(bic, is_sync); -+ if (!bfqq || bfqq == &bfqd->oom_bfqq) { -+ if (bfqq) -+ bfq_put_queue(bfqq); -+ bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); -+ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); -+ -+ bic_set_bfqq(bic, bfqq, is_sync); -+ if (split && is_sync) { -+ bfq_log_bfqq(bfqd, bfqq, -+ "set_request: was_in_list %d " -+ "was_in_large_burst %d " -+ "large burst in progress %d", -+ bic->was_in_burst_list, -+ bic->saved_in_large_burst, -+ bfqd->large_burst); -+ -+ if ((bic->was_in_burst_list && bfqd->large_burst) || -+ bic->saved_in_large_burst) { -+ bfq_log_bfqq(bfqd, bfqq, -+ "set_request: marking in " -+ "large burst"); -+ bfq_mark_bfqq_in_large_burst(bfqq); -+ } else { -+ bfq_log_bfqq(bfqd, bfqq, -+ "set_request: clearing in " -+ "large burst"); -+ bfq_clear_bfqq_in_large_burst(bfqq); -+ if (bic->was_in_burst_list) -+ hlist_add_head(&bfqq->burst_list_node, -+ &bfqd->burst_list); -+ } -+ bfqq->split_time = jiffies; -+ } -+ } else { -+ /* If the queue was seeky for too long, break it apart. 
*/ -+ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { -+ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); -+ -+ /* Update bic before losing reference to bfqq */ -+ if (bfq_bfqq_in_large_burst(bfqq)) -+ bic->saved_in_large_burst = true; -+ -+ bfqq = bfq_split_bfqq(bic, bfqq); -+ split = true; -+ if (!bfqq) -+ goto new_queue; -+ else -+ bfqq_already_existing = true; -+ } -+ } -+ -+ bfqq->allocated[rw]++; -+ bfqq->ref++; -+ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, bfqq->ref); -+ -+ rq->elv.priv[0] = bic; -+ rq->elv.priv[1] = bfqq; -+ -+ /* -+ * If a bfq_queue has only one process reference, it is owned -+ * by only one bfq_io_cq: we can set the bic field of the -+ * bfq_queue to the address of that structure. Also, if the -+ * queue has just been split, mark a flag so that the -+ * information is available to the other scheduler hooks. -+ */ -+ if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { -+ bfqq->bic = bic; -+ if (split) { -+ /* -+ * If the queue has just been split from a shared -+ * queue, restore the idle window and the possible -+ * weight raising period. -+ */ -+ bfq_bfqq_resume_state(bfqq, bfqd, bic, -+ bfqq_already_existing); -+ } -+ } -+ -+ if (unlikely(bfq_bfqq_just_created(bfqq))) -+ bfq_handle_burst(bfqd, bfqq); -+ -+ spin_unlock_irqrestore(q->queue_lock, flags); -+ -+ return 0; -+ -+queue_fail: -+ bfq_schedule_dispatch(bfqd); -+ spin_unlock_irqrestore(q->queue_lock, flags); -+ -+ return 1; -+} -+ -+static void bfq_kick_queue(struct work_struct *work) -+{ -+ struct bfq_data *bfqd = -+ container_of(work, struct bfq_data, unplug_work); -+ struct request_queue *q = bfqd->queue; -+ -+ spin_lock_irq(q->queue_lock); -+ __blk_run_queue(q); -+ spin_unlock_irq(q->queue_lock); -+} -+ -+/* -+ * Handler of the expiration of the timer running if the in-service queue -+ * is idling inside its time slice. 
-+ */ -+static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) -+{ -+ struct bfq_data *bfqd = container_of(timer, struct bfq_data, -+ idle_slice_timer); -+ struct bfq_queue *bfqq; -+ unsigned long flags; -+ enum bfqq_expiration reason; -+ -+ spin_lock_irqsave(bfqd->queue->queue_lock, flags); -+ -+ bfqq = bfqd->in_service_queue; -+ /* -+ * Theoretical race here: the in-service queue can be NULL or -+ * different from the queue that was idling if the timer handler -+ * spins on the queue_lock and a new request arrives for the -+ * current queue and there is a full dispatch cycle that changes -+ * the in-service queue. This can hardly happen, but in the worst -+ * case we just expire a queue too early. -+ */ -+ if (bfqq) { -+ bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); -+ bfq_clear_bfqq_wait_request(bfqq); -+ -+ if (bfq_bfqq_budget_timeout(bfqq)) -+ /* -+ * Also here the queue can be safely expired -+ * for budget timeout without wasting -+ * guarantees -+ */ -+ reason = BFQ_BFQQ_BUDGET_TIMEOUT; -+ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) -+ /* -+ * The queue may not be empty upon timer expiration, -+ * because we may not disable the timer when the -+ * first request of the in-service queue arrives -+ * during disk idling. 
-+ */ -+ reason = BFQ_BFQQ_TOO_IDLE; -+ else -+ goto schedule_dispatch; -+ -+ bfq_bfqq_expire(bfqd, bfqq, true, reason); -+ } -+ -+schedule_dispatch: -+ bfq_schedule_dispatch(bfqd); -+ -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); -+ return HRTIMER_NORESTART; -+} -+ -+static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) -+{ -+ hrtimer_cancel(&bfqd->idle_slice_timer); -+ cancel_work_sync(&bfqd->unplug_work); -+} -+ -+static void __bfq_put_async_bfqq(struct bfq_data *bfqd, -+ struct bfq_queue **bfqq_ptr) -+{ -+ struct bfq_group *root_group = bfqd->root_group; -+ struct bfq_queue *bfqq = *bfqq_ptr; -+ -+ bfq_log(bfqd, "put_async_bfqq: %p", bfqq); -+ if (bfqq) { -+ bfq_bfqq_move(bfqd, bfqq, root_group); -+ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", -+ bfqq, bfqq->ref); -+ bfq_put_queue(bfqq); -+ *bfqq_ptr = NULL; -+ } -+} -+ -+/* -+ * Release all the bfqg references to its async queues. If we are -+ * deallocating the group these queues may still contain requests, so -+ * we reparent them to the root cgroup (i.e., the only one that will -+ * exist for sure until all the requests on a device are gone). 
-+ */
-+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
-+{
-+	int i, j;
-+
-+	/* Walk the 2 x IOPRIO_BE_NR async_bfqq table ... */
-+	for (i = 0; i < 2; i++)
-+		for (j = 0; j < IOPRIO_BE_NR; j++)
-+			__bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
-+
-+	/* ... then the single async_idle_bfqq slot. */
-+	__bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
-+}
-+
-+/*
-+ * Elevator exit hook (installed as .elevator_exit_fn): deactivate every
-+ * queue still on the idle list while holding the queue lock, then
-+ * release the root group and the per-device bfq_data itself.
-+ */
-+static void bfq_exit_queue(struct elevator_queue *e)
-+{
-+	struct bfq_data *bfqd = e->elevator_data;
-+	struct request_queue *q = bfqd->queue;
-+	struct bfq_queue *bfqq, *n;
-+
-+	bfq_shutdown_timer_wq(bfqd);
-+
-+	spin_lock_irq(q->queue_lock);
-+
-+	BUG_ON(bfqd->in_service_queue);
-+	list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
-+		bfq_deactivate_bfqq(bfqd, bfqq, false, false);
-+
-+	spin_unlock_irq(q->queue_lock);
-+
-+	/*
-+	 * NOTE(review): the shutdown helper is deliberately invoked a
-+	 * second time after the locked section; presumably to catch a
-+	 * timer/work item re-armed in between -- confirm.
-+	 */
-+	bfq_shutdown_timer_wq(bfqd);
-+
-+	BUG_ON(hrtimer_active(&bfqd->idle_slice_timer));
-+
-+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED
-+	blkcg_deactivate_policy(q, &blkcg_policy_bfq);
-+#else
-+	bfq_put_async_queues(bfqd, bfqd->root_group);
-+	kfree(bfqd->root_group);
-+#endif
-+
-+	kfree(bfqd);
-+}
-+
-+/*
-+ * Initialize the scheduling state of @root_group: empty request-position
-+ * tree, one freshly initialized service tree per ioprio class, and the
-+ * idle-class service timestamp set to the current jiffies. With group
-+ * scheduling enabled the group is also detached from any parent and
-+ * bound to @bfqd.
-+ */
-+static void bfq_init_root_group(struct bfq_group *root_group,
-+				struct bfq_data *bfqd)
-+{
-+	int i;
-+
-+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED
-+	root_group->entity.parent = NULL;
-+	root_group->my_entity = NULL;
-+	root_group->bfqd = bfqd;
-+#endif
-+	root_group->rq_pos_tree = RB_ROOT;
-+	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
-+		root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
-+	root_group->sched_data.bfq_class_idle_last_service = jiffies;
-+}
-+
-+/*
-+ * Elevator init hook (installed as .elevator_init_fn): allocate the
-+ * elevator queue and the per-device bfq_data for @q, set up the fallback
-+ * oom_bfqq, create the group hierarchy and load the default tunables.
-+ *
-+ * Returns 0 on success, -ENOMEM if any allocation fails.
-+ */
-+static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
-+{
-+	struct bfq_data *bfqd;
-+	struct elevator_queue *eq;
-+
-+	eq = elevator_alloc(q, e);
-+	if (!eq)
-+		return -ENOMEM;
-+
-+	bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
-+	if (!bfqd) {
-+		kobject_put(&eq->kobj);
-+		return -ENOMEM;
-+	}
-+	eq->elevator_data = bfqd;
-+
-+	/*
-+	 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
-+	 * Grab a permanent reference to it, so that the normal code flow
-+	 * will not attempt to free it.
-+	 */
-+	bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0);
-+	bfqd->oom_bfqq.ref++;
-+	bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
-+	bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE;
-+	bfqd->oom_bfqq.entity.new_weight =
-+		bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio);
-+
-+	/* oom_bfqq does not participate to bursts */
-+	bfq_clear_bfqq_just_created(&bfqd->oom_bfqq);
-+	/*
-+	 * Trigger weight initialization, according to ioprio, at the
-+	 * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio
-+	 * class won't be changed any more.
-+	 */
-+	bfqd->oom_bfqq.entity.prio_changed = 1;
-+
-+	bfqd->queue = q;
-+
-+	spin_lock_irq(q->queue_lock);
-+	q->elevator = eq;
-+	spin_unlock_irq(q->queue_lock);
-+
-+	bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node);
-+	if (!bfqd->root_group)
-+		goto out_free;
-+	bfq_init_root_group(bfqd->root_group, bfqd);
-+	bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);
-+
-+	hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC,
-+		     HRTIMER_MODE_REL);
-+	bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
-+
-+	bfqd->queue_weights_tree = RB_ROOT;
-+	bfqd->group_weights_tree = RB_ROOT;
-+
-+	INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);
-+
-+	INIT_LIST_HEAD(&bfqd->active_list);
-+	INIT_LIST_HEAD(&bfqd->idle_list);
-+	INIT_HLIST_HEAD(&bfqd->burst_list);
-+
-+	bfqd->hw_tag = -1;
-+
-+	bfqd->bfq_max_budget = bfq_default_max_budget;
-+
-+	bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
-+	bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
-+	bfqd->bfq_back_max = bfq_back_max;
-+	bfqd->bfq_back_penalty = bfq_back_penalty;
-+	bfqd->bfq_slice_idle = bfq_slice_idle;
-+	bfqd->bfq_timeout = bfq_timeout;
-+
-+	bfqd->bfq_requests_within_timer = 120;
-+
-+	bfqd->bfq_large_burst_thresh = 8;
-+	bfqd->bfq_burst_interval = msecs_to_jiffies(180);
-+
-+	bfqd->low_latency = true;
-+
-+	/*
-+	 * Trade-off between responsiveness and fairness.
-+	 */
-+	bfqd->bfq_wr_coeff = 30;
-+	bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300);
-+	bfqd->bfq_wr_max_time = 0;
-+	bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000);
-+	bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500);
-+	/*
-+	 * Approximate rate required to playback or record a
-+	 * high-definition compressed video.
-+	 */
-+	bfqd->bfq_wr_max_softrt_rate = 7000;
-+	bfqd->wr_busy_queues = 0;
-+
-+	/*
-+	 * Begin by assuming, optimistically, that the device is a
-+	 * high-speed one, and that its peak rate is equal to 2/3 of
-+	 * the highest reference rate.
-+	 */
-+	bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] *
-+			T_fast[blk_queue_nonrot(bfqd->queue)];
-+	bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3;
-+	bfqd->device_speed = BFQ_BFQD_FAST;
-+
-+	return 0;
-+
-+out_free:
-+	/*
-+	 * NOTE(review): q->elevator was already pointed at eq above and
-+	 * is not reset here; this relies on the caller restoring or
-+	 * discarding q->elevator after a failed init -- confirm.
-+	 */
-+	kfree(bfqd);
-+	kobject_put(&eq->kobj);
-+	return -ENOMEM;
-+}
-+
-+/* Destroy the slab cache backing bfq_queue allocations. */
-+static void bfq_slab_kill(void)
-+{
-+	kmem_cache_destroy(bfq_pool);
-+}
-+
-+/* Create the bfq_queue slab cache; returns 0 or -ENOMEM. */
-+static int __init bfq_slab_setup(void)
-+{
-+	bfq_pool = KMEM_CACHE(bfq_queue, 0);
-+	if (!bfq_pool)
-+		return -ENOMEM;
-+	return 0;
-+}
-+
-+/* Format @var as an unsigned decimal followed by a newline into @page. */
-+static ssize_t bfq_var_show(unsigned int var, char *page)
-+{
-+	return sprintf(page, "%u\n", var);
-+}
-+
-+/*
-+ * Parse an unsigned decimal from @page into *@var.
-+ *
-+ * Returns @count on success. On malformed or out-of-range input *@var
-+ * is left untouched and the negative error code from kstrtoul() is
-+ * returned, instead of silently reporting success as before.
-+ */
-+static ssize_t bfq_var_store(unsigned long *var, const char *page,
-+			     size_t count)
-+{
-+	unsigned long new_val;
-+	int ret = kstrtoul(page, 10, &new_val);
-+
-+	if (ret)
-+		return ret;
-+
-+	*var = new_val;
-+	return count;
-+}
-+
-+/*
-+ * Show the maximum weight-raising duration in milliseconds: the
-+ * user-set bfq_wr_max_time when it is non-zero, otherwise the value
-+ * computed by bfq_wr_duration().
-+ */
-+static ssize_t bfq_wr_max_time_show(struct elevator_queue *e, char *page)
-+{
-+	struct bfq_data *bfqd = e->elevator_data;
-+
-+	/* jiffies_to_msecs() yields an unsigned int, hence %u. */
-+	return sprintf(page, "%u\n", bfqd->bfq_wr_max_time > 0 ?
-+		       jiffies_to_msecs(bfqd->bfq_wr_max_time) :
-+		       jiffies_to_msecs(bfq_wr_duration(bfqd)));
-+}
-+
-+/*
-+ * Dump the number of queued requests plus one line per queue on the
-+ * active and idle lists. The number of queues is not bounded, so every
-+ * write is limited to what is left of the PAGE_SIZE sysfs buffer;
-+ * output that does not fit is dropped rather than written past the end
-+ * of @page.
-+ */
-+static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)
-+{
-+	struct bfq_queue *bfqq;
-+	struct bfq_data *bfqd = e->elevator_data;
-+	ssize_t num_char = 0;
-+
-+	spin_lock_irq(bfqd->queue->queue_lock);
-+
-+	num_char += scnprintf(page + num_char, PAGE_SIZE - num_char,
-+			      "Tot reqs queued %d\n\n", bfqd->queued);
-+
-+	num_char += scnprintf(page + num_char, PAGE_SIZE - num_char,
-+			      "Active:\n");
-+	list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) {
-+		/* entity.weight is an unsigned int: print it in full. */
-+		num_char += scnprintf(page + num_char, PAGE_SIZE - num_char,
-+				      "pid%d: weight %u, nr_queued %d %d, ",
-+				      bfqq->pid,
-+				      bfqq->entity.weight,
-+				      bfqq->queued[0],
-+				      bfqq->queued[1]);
-+		num_char += scnprintf(page + num_char, PAGE_SIZE - num_char,
-+				      "dur %u/%u\n",
-+				      jiffies_to_msecs(
-+					jiffies -
-+					bfqq->last_wr_start_finish),
-+				      jiffies_to_msecs(bfqq->wr_cur_max_time));
-+	}
-+
-+	num_char += scnprintf(page + num_char, PAGE_SIZE - num_char,
-+			      "Idle:\n");
-+	list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) {
-+		num_char += scnprintf(page + num_char, PAGE_SIZE - num_char,
-+				      "pid%d: weight %u, dur %u/%u\n",
-+				      bfqq->pid,
-+				      bfqq->entity.weight,
-+				      jiffies_to_msecs(jiffies -
-+					bfqq->last_wr_start_finish),
-+				      jiffies_to_msecs(bfqq->wr_cur_max_time));
-+	}
-+
-+	spin_unlock_irq(bfqd->queue->queue_lock);
-+
-+	return num_char;
-+}
-+
-+/*
-+ * Generate a sysfs show handler for __VAR. __CONV selects the unit
-+ * conversion applied before printing: 1 = jiffies to ms, 2 = ns to ms,
-+ * anything else = raw value. The value is printed through
-+ * bfq_var_show(), i.e. as an unsigned int.
-+ */
-+#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
-+static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
-+{									\
-+	struct bfq_data *bfqd = e->elevator_data;			\
-+	u64 __data = __VAR;						\
-+	if (__CONV == 1)						\
-+		__data = jiffies_to_msecs(__data);			\
-+	else if (__CONV == 2)						\
-+		__data = div_u64(__data, NSEC_PER_MSEC);		\
-+	return bfq_var_show(__data, (page));				\
-+}
-+SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2);
-+SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2);
-+SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
-+SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
-+SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2);
-+SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); -+SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1); -+SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0); -+SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); -+SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0); -+SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1); -+SHOW_FUNCTION(bfq_wr_min_idle_time_show, bfqd->bfq_wr_min_idle_time, 1); -+SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async, -+ 1); -+SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0); -+#undef SHOW_FUNCTION -+ -+#define USEC_SHOW_FUNCTION(__FUNC, __VAR) \ -+static ssize_t __FUNC(struct elevator_queue *e, char *page) \ -+{ \ -+ struct bfq_data *bfqd = e->elevator_data; \ -+ u64 __data = __VAR; \ -+ __data = div_u64(__data, NSEC_PER_USEC); \ -+ return bfq_var_show(__data, (page)); \ -+} -+USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle); -+#undef USEC_SHOW_FUNCTION -+ -+#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ -+static ssize_t \ -+__FUNC(struct elevator_queue *e, const char *page, size_t count) \ -+{ \ -+ struct bfq_data *bfqd = e->elevator_data; \ -+ unsigned long uninitialized_var(__data); \ -+ int ret = bfq_var_store(&__data, (page), count); \ -+ if (__data < (MIN)) \ -+ __data = (MIN); \ -+ else if (__data > (MAX)) \ -+ __data = (MAX); \ -+ if (__CONV == 1) \ -+ *(__PTR) = msecs_to_jiffies(__data); \ -+ else if (__CONV == 2) \ -+ *(__PTR) = (u64)__data * NSEC_PER_MSEC; \ -+ else \ -+ *(__PTR) = __data; \ -+ return ret; \ -+} -+STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, -+ INT_MAX, 2); -+STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, -+ INT_MAX, 2); -+STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); -+STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, -+ INT_MAX, 0); 
-+STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2); -+STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0); -+STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1); -+STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX, -+ 1); -+STORE_FUNCTION(bfq_wr_min_idle_time_store, &bfqd->bfq_wr_min_idle_time, 0, -+ INT_MAX, 1); -+STORE_FUNCTION(bfq_wr_min_inter_arr_async_store, -+ &bfqd->bfq_wr_min_inter_arr_async, 0, INT_MAX, 1); -+STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0, -+ INT_MAX, 0); -+#undef STORE_FUNCTION -+ -+#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ -+static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\ -+{ \ -+ struct bfq_data *bfqd = e->elevator_data; \ -+ unsigned long uninitialized_var(__data); \ -+ int ret = bfq_var_store(&__data, (page), count); \ -+ if (__data < (MIN)) \ -+ __data = (MIN); \ -+ else if (__data > (MAX)) \ -+ __data = (MAX); \ -+ *(__PTR) = (u64)__data * NSEC_PER_USEC; \ -+ return ret; \ -+} -+USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0, -+ UINT_MAX); -+#undef USEC_STORE_FUNCTION -+ -+/* do nothing for the moment */ -+static ssize_t bfq_weights_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ return count; -+} -+ -+static ssize_t bfq_max_budget_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ unsigned long uninitialized_var(__data); -+ int ret = bfq_var_store(&__data, (page), count); -+ -+ if (__data == 0) -+ bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); -+ else { -+ if (__data > INT_MAX) -+ __data = INT_MAX; -+ bfqd->bfq_max_budget = __data; -+ } -+ -+ bfqd->bfq_user_max_budget = __data; -+ -+ return ret; -+} -+ -+/* -+ * Leaving this name to preserve name compatibility with cfq -+ * parameters, but this timeout is used for both sync and async. 
-+ */ -+static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ unsigned long uninitialized_var(__data); -+ int ret = bfq_var_store(&__data, (page), count); -+ -+ if (__data < 1) -+ __data = 1; -+ else if (__data > INT_MAX) -+ __data = INT_MAX; -+ -+ bfqd->bfq_timeout = msecs_to_jiffies(__data); -+ if (bfqd->bfq_user_max_budget == 0) -+ bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); -+ -+ return ret; -+} -+ -+static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ unsigned long uninitialized_var(__data); -+ int ret = bfq_var_store(&__data, (page), count); -+ -+ if (__data > 1) -+ __data = 1; -+ if (!bfqd->strict_guarantees && __data == 1 -+ && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC) -+ bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC; -+ -+ bfqd->strict_guarantees = __data; -+ -+ return ret; -+} -+ -+static ssize_t bfq_low_latency_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ unsigned long uninitialized_var(__data); -+ int ret = bfq_var_store(&__data, (page), count); -+ -+ if (__data > 1) -+ __data = 1; -+ if (__data == 0 && bfqd->low_latency != 0) -+ bfq_end_wr(bfqd); -+ bfqd->low_latency = __data; -+ -+ return ret; -+} -+ -+#define BFQ_ATTR(name) \ -+ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store) -+ -+static struct elv_fs_entry bfq_attrs[] = { -+ BFQ_ATTR(fifo_expire_sync), -+ BFQ_ATTR(fifo_expire_async), -+ BFQ_ATTR(back_seek_max), -+ BFQ_ATTR(back_seek_penalty), -+ BFQ_ATTR(slice_idle), -+ BFQ_ATTR(slice_idle_us), -+ BFQ_ATTR(max_budget), -+ BFQ_ATTR(timeout_sync), -+ BFQ_ATTR(strict_guarantees), -+ BFQ_ATTR(low_latency), -+ BFQ_ATTR(wr_coeff), -+ BFQ_ATTR(wr_max_time), -+ BFQ_ATTR(wr_rt_max_time), -+ BFQ_ATTR(wr_min_idle_time), -+ BFQ_ATTR(wr_min_inter_arr_async), -+ 
BFQ_ATTR(wr_max_softrt_rate), -+ BFQ_ATTR(weights), -+ __ATTR_NULL -+}; -+ -+static struct elevator_type iosched_bfq = { -+ .ops.sq = { -+ .elevator_merge_fn = bfq_merge, -+ .elevator_merged_fn = bfq_merged_request, -+ .elevator_merge_req_fn = bfq_merged_requests, -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ .elevator_bio_merged_fn = bfq_bio_merged, -+#endif -+ .elevator_allow_bio_merge_fn = bfq_allow_bio_merge, -+ .elevator_allow_rq_merge_fn = bfq_allow_rq_merge, -+ .elevator_dispatch_fn = bfq_dispatch_requests, -+ .elevator_add_req_fn = bfq_insert_request, -+ .elevator_activate_req_fn = bfq_activate_request, -+ .elevator_deactivate_req_fn = bfq_deactivate_request, -+ .elevator_completed_req_fn = bfq_completed_request, -+ .elevator_former_req_fn = elv_rb_former_request, -+ .elevator_latter_req_fn = elv_rb_latter_request, -+ .elevator_init_icq_fn = bfq_init_icq, -+ .elevator_exit_icq_fn = bfq_exit_icq, -+ .elevator_set_req_fn = bfq_set_request, -+ .elevator_put_req_fn = bfq_put_request, -+ .elevator_may_queue_fn = bfq_may_queue, -+ .elevator_init_fn = bfq_init_queue, -+ .elevator_exit_fn = bfq_exit_queue, -+ }, -+ .icq_size = sizeof(struct bfq_io_cq), -+ .icq_align = __alignof__(struct bfq_io_cq), -+ .elevator_attrs = bfq_attrs, -+ .elevator_name = "bfq-sq", -+ .elevator_owner = THIS_MODULE, -+}; -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+static struct blkcg_policy blkcg_policy_bfq = { -+ .dfl_cftypes = bfq_blkg_files, -+ .legacy_cftypes = bfq_blkcg_legacy_files, -+ -+ .cpd_alloc_fn = bfq_cpd_alloc, -+ .cpd_init_fn = bfq_cpd_init, -+ .cpd_bind_fn = bfq_cpd_init, -+ .cpd_free_fn = bfq_cpd_free, -+ -+ .pd_alloc_fn = bfq_pd_alloc, -+ .pd_init_fn = bfq_pd_init, -+ .pd_offline_fn = bfq_pd_offline, -+ .pd_free_fn = bfq_pd_free, -+ .pd_reset_stats_fn = bfq_pd_reset_stats, -+}; -+#endif -+ -+static int __init bfq_init(void) -+{ -+ int ret; -+ char msg[60] = "BFQ I/O-scheduler: v8r12"; -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ ret = blkcg_policy_register(&blkcg_policy_bfq); -+ 
if (ret) -+ return ret; -+#endif -+ -+ ret = -ENOMEM; -+ if (bfq_slab_setup()) -+ goto err_pol_unreg; -+ -+ /* -+ * Times to load large popular applications for the typical -+ * systems installed on the reference devices (see the -+ * comments before the definitions of the next two -+ * arrays). Actually, we use slightly slower values, as the -+ * estimated peak rate tends to be smaller than the actual -+ * peak rate. The reason for this last fact is that estimates -+ * are computed over much shorter time intervals than the long -+ * intervals typically used for benchmarking. Why? First, to -+ * adapt more quickly to variations. Second, because an I/O -+ * scheduler cannot rely on a peak-rate-evaluation workload to -+ * be run for a long time. -+ */ -+ T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */ -+ T_slow[1] = msecs_to_jiffies(6000); /* actually 6.5 sec */ -+ T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */ -+ T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */ -+ -+ /* -+ * Thresholds that determine the switch between speed classes -+ * (see the comments before the definition of the array -+ * device_speed_thresh). These thresholds are biased towards -+ * transitions to the fast class. This is safer than the -+ * opposite bias. In fact, a wrong transition to the slow -+ * class results in short weight-raising periods, because the -+ * speed of the device then tends to be higher that the -+ * reference peak rate. On the opposite end, a wrong -+ * transition to the fast class tends to increase -+ * weight-raising periods, because of the opposite reason. 
-+ */ -+ device_speed_thresh[0] = (4 * R_slow[0]) / 3; -+ device_speed_thresh[1] = (4 * R_slow[1]) / 3; -+ -+ ret = elv_register(&iosched_bfq); -+ if (ret) -+ goto err_pol_unreg; -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ strcat(msg, " (with cgroups support)"); -+#endif -+ pr_info("%s", msg); -+ -+ return 0; -+ -+err_pol_unreg: -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ blkcg_policy_unregister(&blkcg_policy_bfq); -+#endif -+ return ret; -+} -+ -+static void __exit bfq_exit(void) -+{ -+ elv_unregister(&iosched_bfq); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ blkcg_policy_unregister(&blkcg_policy_bfq); -+#endif -+ bfq_slab_kill(); -+} -+ -+module_init(bfq_init); -+module_exit(bfq_exit); -+ -+MODULE_AUTHOR("Arianna Avanzini, Fabio Checconi, Paolo Valente"); -+MODULE_LICENSE("GPL"); -diff --git a/block/bfq.h b/block/bfq.h -new file mode 100644 -index 000000000000..f5751ea59d98 ---- /dev/null -+++ b/block/bfq.h -@@ -0,0 +1,948 @@ -+/* -+ * BFQ v8r12 for 4.11.0: data structures and common functions prototypes. -+ * -+ * Based on ideas and code from CFQ: -+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> -+ * -+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> -+ * Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org> -+ */ -+ -+#ifndef _BFQ_H -+#define _BFQ_H -+ -+#include <linux/blktrace_api.h> -+#include <linux/hrtimer.h> -+#include <linux/blk-cgroup.h> -+ -+#define BFQ_IOPRIO_CLASSES 3 -+#define BFQ_CL_IDLE_TIMEOUT (HZ/5) -+ -+#define BFQ_MIN_WEIGHT 1 -+#define BFQ_MAX_WEIGHT 1000 -+#define BFQ_WEIGHT_CONVERSION_COEFF 10 -+ -+#define BFQ_DEFAULT_QUEUE_IOPRIO 4 -+ -+#define BFQ_WEIGHT_LEGACY_DFL 100 -+#define BFQ_DEFAULT_GRP_IOPRIO 0 -+#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE -+ -+/* -+ * Soft real-time applications are extremely more latency sensitive -+ * than interactive ones. 
Over-raise the weight of the former to
-+ * privilege them against the latter.
-+ */
-+#define BFQ_SOFTRT_WEIGHT_FACTOR	100
-+
-+struct bfq_entity;
-+
-+/**
-+ * struct bfq_service_tree - per ioprio_class service tree.
-+ *
-+ * Each service tree represents a B-WF2Q+ scheduler on its own. Each
-+ * ioprio_class has its own independent scheduler, and so its own
-+ * bfq_service_tree. All the fields are protected by the queue lock
-+ * of the containing bfqd.
-+ */
-+struct bfq_service_tree {
-+	/* tree for active entities (i.e., those backlogged) */
-+	struct rb_root active;
-+	/* tree for idle entities (i.e., not backlogged, with V <= F_i) */
-+	struct rb_root idle;
-+
-+	struct bfq_entity *first_idle;	/* idle entity with minimum F_i */
-+	struct bfq_entity *last_idle;	/* idle entity with maximum F_i */
-+
-+	u64 vtime; /* scheduler virtual time */
-+	/* scheduler weight sum; active and idle entities contribute to it */
-+	unsigned long wsum;
-+};
-+
-+/**
-+ * struct bfq_sched_data - multi-class scheduler.
-+ *
-+ * bfq_sched_data is the basic scheduler queue. It supports three
-+ * ioprio_classes, and can be used either as a toplevel queue or as an
-+ * intermediate queue on a hierarchical setup. @next_in_service
-+ * points to the active entity of the sched_data service trees that
-+ * will be scheduled next. It is used to reduce the number of steps
-+ * needed for each hierarchical-schedule update.
-+ *
-+ * The supported ioprio_classes are the same as in CFQ, in descending
-+ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
-+ * Requests from higher priority queues are served before all the
-+ * requests from lower priority queues; among requests of the same
-+ * queue requests are served according to B-WF2Q+.
-+ * All the fields are protected by the queue lock of the containing bfqd.
-+ */
-+struct bfq_sched_data {
-+	struct bfq_entity *in_service_entity;  /* entity in service */
-+	/* head-of-the-line entity in the scheduler (see comments above) */
-+	struct bfq_entity *next_in_service;
-+	/* array of service trees, one per ioprio_class */
-+	struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
-+	/* last time CLASS_IDLE was served */
-+	unsigned long bfq_class_idle_last_service;
-+
-+};
-+
-+/**
-+ * struct bfq_weight_counter - counter of the number of all active entities
-+ *                             with a given weight.
-+ */
-+struct bfq_weight_counter {
-+	unsigned int weight; /* weight of the entities this counter refers to */
-+	unsigned int num_active; /* nr of active entities with this weight */
-+	/*
-+	 * Weights tree member (see bfq_data's @queue_weights_tree and
-+	 * @group_weights_tree)
-+	 */
-+	struct rb_node weights_node;
-+};
-+
-+/**
-+ * struct bfq_entity - schedulable entity.
-+ *
-+ * A bfq_entity is used to represent either a bfq_queue (leaf node in the
-+ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
-+ * entity belongs to the sched_data of the parent group in the cgroup
-+ * hierarchy. Non-leaf entities also have their own sched_data, stored
-+ * in @my_sched_data.
-+ *
-+ * Each entity stores independently its priority values; this would
-+ * allow different weights on different devices, but this
-+ * functionality is not exported to userspace for now. Priorities and
-+ * weights are updated lazily, first storing the new values into the
-+ * new_* fields, then setting the @prio_changed flag. As soon as
-+ * there is a transition in the entity state that allows the priority
-+ * update to take place the effective and the requested priority
-+ * values are synchronized.
-+ *
-+ * Unless cgroups are used, the weight value is calculated from the
-+ * ioprio to export the same interface as CFQ.
When dealing with
-+ * ``well-behaved'' queues (i.e., queues that do not spend too much
-+ * time to consume their budget and have true sequential behavior, and
-+ * when there are no external factors breaking anticipation) the
-+ * relative weights at each level of the cgroups hierarchy should be
-+ * guaranteed. All the fields are protected by the queue lock of the
-+ * containing bfqd.
-+ */
-+struct bfq_entity {
-+	struct rb_node rb_node; /* service_tree member */
-+	/* pointer to the weight counter associated with this entity */
-+	struct bfq_weight_counter *weight_counter;
-+
-+	/*
-+	 * Flag, true if the entity is on a tree (either the active or
-+	 * the idle one of its service_tree) or is in service.
-+	 */
-+	bool on_st;
-+
-+	u64 finish; /* B-WF2Q+ finish timestamp (aka F_i) */
-+	u64 start;  /* B-WF2Q+ start timestamp (aka S_i) */
-+
-+	/* tree the entity is enqueued into; %NULL if not on a tree */
-+	struct rb_root *tree;
-+
-+	/*
-+	 * minimum start time of the (active) subtree rooted at this
-+	 * entity; used for O(log N) lookups into active trees
-+	 */
-+	u64 min_start;
-+
-+	/* amount of service received during the last service slot */
-+	int service;
-+
-+	/* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */
-+	int budget;
-+
-+	unsigned int weight;	 /* weight of the queue */
-+	unsigned int new_weight; /* next weight if a change is in progress */
-+
-+	/* original weight, used to implement weight boosting */
-+	unsigned int orig_weight;
-+
-+	/* parent entity, for hierarchical scheduling */
-+	struct bfq_entity *parent;
-+
-+	/*
-+	 * For non-leaf nodes in the hierarchy, the associated
-+	 * scheduler queue, %NULL on leaf nodes.
-+	 */
-+	struct bfq_sched_data *my_sched_data;
-+	/* the scheduler queue this entity belongs to */
-+	struct bfq_sched_data *sched_data;
-+
-+	/* flag, set to request a weight, ioprio or ioprio_class change */
-+	int prio_changed;
-+};
-+
-+struct bfq_group;
-+
-+/**
-+ * struct bfq_queue - leaf schedulable entity.
-+ *
-+ * A bfq_queue is a leaf request queue; it can be associated with an
-+ * io_context or more, if it is async or shared between cooperating
-+ * processes. @cgroup holds a reference to the cgroup, to be sure that it
-+ * does not disappear while a bfqq still references it (mostly to avoid
-+ * races between request issuing and task migration followed by cgroup
-+ * destruction).
-+ * NOTE(review): no member named cgroup is declared in this struct; the
-+ * sentence about @cgroup looks stale -- confirm.
-+ * All the fields are protected by the queue lock of the containing bfqd.
-+ */
-+struct bfq_queue {
-+	/* reference counter */
-+	int ref;
-+	/* parent bfq_data */
-+	struct bfq_data *bfqd;
-+
-+	/* current ioprio and ioprio class */
-+	unsigned short ioprio, ioprio_class;
-+	/* next ioprio and ioprio class if a change is in progress */
-+	unsigned short new_ioprio, new_ioprio_class;
-+
-+	/*
-+	 * Shared bfq_queue if queue is cooperating with one or more
-+	 * other queues.
-+	 */
-+	struct bfq_queue *new_bfqq;
-+	/* request-position tree member (see bfq_group's @rq_pos_tree) */
-+	struct rb_node pos_node;
-+	/* request-position tree root (see bfq_group's @rq_pos_tree) */
-+	struct rb_root *pos_root;
-+
-+	/* sorted list of pending requests */
-+	struct rb_root sort_list;
-+	/* if fifo isn't expired, next request to serve */
-+	struct request *next_rq;
-+	/* number of sync and async requests queued */
-+	int queued[2];
-+	/* number of sync and async requests currently allocated */
-+	int allocated[2];
-+	/* number of pending metadata requests */
-+	int meta_pending;
-+	/* fifo list of requests in sort_list */
-+	struct list_head fifo;
-+
-+	/* entity representing this queue in the scheduler */
-+	struct bfq_entity entity;
-+
-+	/* maximum budget allowed from the feedback mechanism */
-+	int max_budget;
-+	/* budget expiration (in jiffies) */
-+	unsigned long budget_timeout;
-+
-+	/* number of requests on the dispatch list or inside driver */
-+	int dispatched;
-+
-+	unsigned int flags; /* status flags */
-+
-+	/* node for active/idle bfqq list inside parent bfqd */
-+	struct list_head bfqq_list;
-+
-+	/* bit vector: a 1 for each seeky request in history */
-+	u32 seek_history;
-+
-+	/* node for the device's burst list */
-+	struct hlist_node burst_list_node;
-+
-+	/* position of the last request enqueued */
-+	sector_t last_request_pos;
-+
-+	/*
-+	 * Number of consecutive pairs of request completion and
-+	 * arrival, such that the queue becomes idle after the
-+	 * completion, but the next request arrives within an idle
-+	 * time slice; used only if the queue's IO_bound flag has been
-+	 * cleared.
-+	 */
-+	unsigned int requests_within_timer;
-+
-+	/* pid of the process owning the queue, used for logging purposes */
-+	pid_t pid;
-+
-+	/*
-+	 * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL
-+	 * if the queue is shared.
-+	 */
-+	struct bfq_io_cq *bic;
-+
-+	/* current maximum weight-raising time for this queue */
-+	unsigned long wr_cur_max_time;
-+	/*
-+	 * Minimum time instant such that, only if a new request is
-+	 * enqueued after this time instant in an idle @bfq_queue with
-+	 * no outstanding requests, then the task associated with the
-+	 * queue is deemed as soft real-time (see the comments on
-+	 * the function bfq_bfqq_softrt_next_start())
-+	 */
-+	unsigned long soft_rt_next_start;
-+	/*
-+	 * Start time of the current weight-raising period if
-+	 * the @bfq_queue is being weight-raised, otherwise
-+	 * finish time of the last weight-raising period.
-+	 */
-+	unsigned long last_wr_start_finish;
-+	/* factor by which the weight of this queue is multiplied */
-+	unsigned int wr_coeff;
-+	/*
-+	 * Time of the last transition of the @bfq_queue from idle to
-+	 * backlogged.
-+	 */
-+	unsigned long last_idle_bklogged;
-+	/*
-+	 * Cumulative service received from the @bfq_queue since the
-+	 * last transition from idle to backlogged.
-+	 */
-+	unsigned long service_from_backlogged;
-+	/*
-+	 * Value of wr start time when switching to soft rt
-+	 */
-+	unsigned long wr_start_at_switch_to_srt;
-+
-+	unsigned long split_time; /* time of last split */
-+};
-+
-+/**
-+ * struct bfq_ttime - per process thinktime stats.
-+ */
-+struct bfq_ttime {
-+	u64 last_end_request; /* completion time of last request */
-+
-+	u64 ttime_total; /* total process thinktime */
-+	unsigned long ttime_samples; /* number of thinktime samples */
-+	u64 ttime_mean; /* average process thinktime */
-+
-+};
-+
-+/**
-+ * struct bfq_io_cq - per (request_queue, io_context) structure.
-+ */ -+struct bfq_io_cq { -+ /* associated io_cq structure */ -+ struct io_cq icq; /* must be the first member */ -+ /* array of two process queues, the sync and the async */ -+ struct bfq_queue *bfqq[2]; -+ /* associated @bfq_ttime struct */ -+ struct bfq_ttime ttime; -+ /* per (request_queue, blkcg) ioprio */ -+ int ioprio; -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ uint64_t blkcg_serial_nr; /* the current blkcg serial */ -+#endif -+ -+ /* -+ * Snapshot of the idle window before merging; taken to -+ * remember this value while the queue is merged, so as to be -+ * able to restore it in case of split. -+ */ -+ bool saved_idle_window; -+ /* -+ * Same purpose as the previous two fields for the I/O bound -+ * classification of a queue. -+ */ -+ bool saved_IO_bound; -+ -+ /* -+ * Same purpose as the previous fields for the value of the -+ * field keeping the queue's belonging to a large burst -+ */ -+ bool saved_in_large_burst; -+ /* -+ * True if the queue belonged to a burst list before its merge -+ * with another cooperating queue. -+ */ -+ bool was_in_burst_list; -+ -+ /* -+ * Similar to previous fields: save wr information. -+ */ -+ unsigned long saved_wr_coeff; -+ unsigned long saved_last_wr_start_finish; -+ unsigned long saved_wr_start_at_switch_to_srt; -+ unsigned int saved_wr_cur_max_time; -+}; -+ -+enum bfq_device_speed { -+ BFQ_BFQD_FAST, -+ BFQ_BFQD_SLOW, -+}; -+ -+/** -+ * struct bfq_data - per-device data structure. -+ * -+ * All the fields are protected by the @queue lock. -+ */ -+struct bfq_data { -+ /* request queue for the device */ -+ struct request_queue *queue; -+ -+ /* root bfq_group for the device */ -+ struct bfq_group *root_group; -+ -+ /* -+ * rbtree of weight counters of @bfq_queues, sorted by -+ * weight. Used to keep track of whether all @bfq_queues have -+ * the same weight. 
The tree contains one counter for each -+ * distinct weight associated to some active and not -+ * weight-raised @bfq_queue (see the comments to the functions -+ * bfq_weights_tree_[add|remove] for further details). -+ */ -+ struct rb_root queue_weights_tree; -+ /* -+ * rbtree of non-queue @bfq_entity weight counters, sorted by -+ * weight. Used to keep track of whether all @bfq_groups have -+ * the same weight. The tree contains one counter for each -+ * distinct weight associated to some active @bfq_group (see -+ * the comments to the functions bfq_weights_tree_[add|remove] -+ * for further details). -+ */ -+ struct rb_root group_weights_tree; -+ -+ /* -+ * Number of bfq_queues containing requests (including the -+ * queue in service, even if it is idling). -+ */ -+ int busy_queues; -+ /* number of weight-raised busy @bfq_queues */ -+ int wr_busy_queues; -+ /* number of queued requests */ -+ int queued; -+ /* number of requests dispatched and waiting for completion */ -+ int rq_in_driver; -+ -+ /* -+ * Maximum number of requests in driver in the last -+ * @hw_tag_samples completed requests. -+ */ -+ int max_rq_in_driver; -+ /* number of samples used to calculate hw_tag */ -+ int hw_tag_samples; -+ /* flag set to one if the driver is showing a queueing behavior */ -+ int hw_tag; -+ -+ /* number of budgets assigned */ -+ int budgets_assigned; -+ -+ /* -+ * Timer set when idling (waiting) for the next request from -+ * the queue in service. 
-+ */ -+ struct hrtimer idle_slice_timer; -+ /* delayed work to restart dispatching on the request queue */ -+ struct work_struct unplug_work; -+ -+ /* bfq_queue in service */ -+ struct bfq_queue *in_service_queue; -+ /* bfq_io_cq (bic) associated with the @in_service_queue */ -+ struct bfq_io_cq *in_service_bic; -+ -+ /* on-disk position of the last served request */ -+ sector_t last_position; -+ -+ /* time of last request completion (ns) */ -+ u64 last_completion; -+ -+ /* time of first rq dispatch in current observation interval (ns) */ -+ u64 first_dispatch; -+ /* time of last rq dispatch in current observation interval (ns) */ -+ u64 last_dispatch; -+ -+ /* beginning of the last budget */ -+ ktime_t last_budget_start; -+ /* beginning of the last idle slice */ -+ ktime_t last_idling_start; -+ -+ /* number of samples in current observation interval */ -+ int peak_rate_samples; -+ /* num of samples of seq dispatches in current observation interval */ -+ u32 sequential_samples; -+ /* total num of sectors transferred in current observation interval */ -+ u64 tot_sectors_dispatched; -+ /* max rq size seen during current observation interval (sectors) */ -+ u32 last_rq_max_size; -+ /* time elapsed from first dispatch in current observ. interval (us) */ -+ u64 delta_from_first; -+ /* current estimate of device peak rate */ -+ u32 peak_rate; -+ -+ /* maximum budget allotted to a bfq_queue before rescheduling */ -+ int bfq_max_budget; -+ -+ /* list of all the bfq_queues active on the device */ -+ struct list_head active_list; -+ /* list of all the bfq_queues idle on the device */ -+ struct list_head idle_list; -+ -+ /* -+ * Timeout for async/sync requests; when it fires, requests -+ * are served in fifo order. 
-+ */ -+ u64 bfq_fifo_expire[2]; -+ /* weight of backward seeks wrt forward ones */ -+ unsigned int bfq_back_penalty; -+ /* maximum allowed backward seek */ -+ unsigned int bfq_back_max; -+ /* maximum idling time */ -+ u32 bfq_slice_idle; -+ -+ /* user-configured max budget value (0 for auto-tuning) */ -+ int bfq_user_max_budget; -+ /* -+ * Timeout for bfq_queues to consume their budget; used to -+ * prevent seeky queues from imposing long latencies to -+ * sequential or quasi-sequential ones (this also implies that -+ * seeky queues cannot receive guarantees in the service -+ * domain; after a timeout they are charged for the time they -+ * have been in service, to preserve fairness among them, but -+ * without service-domain guarantees). -+ */ -+ unsigned int bfq_timeout; -+ -+ /* -+ * Number of consecutive requests that must be issued within -+ * the idle time slice to set again idling to a queue which -+ * was marked as non-I/O-bound (see the definition of the -+ * IO_bound flag for further details). -+ */ -+ unsigned int bfq_requests_within_timer; -+ -+ /* -+ * Force device idling whenever needed to provide accurate -+ * service guarantees, without caring about throughput -+ * issues. CAVEAT: this may even increase latencies, in case -+ * of useless idling for processes that did stop doing I/O. -+ */ -+ bool strict_guarantees; -+ -+ /* -+ * Last time at which a queue entered the current burst of -+ * queues being activated shortly after each other; for more -+ * details about this and the following parameters related to -+ * a burst of activations, see the comments on the function -+ * bfq_handle_burst. -+ */ -+ unsigned long last_ins_in_burst; -+ /* -+ * Reference time interval used to decide whether a queue has -+ * been activated shortly after @last_ins_in_burst. 
-+ */ -+ unsigned long bfq_burst_interval; -+ /* number of queues in the current burst of queue activations */ -+ int burst_size; -+ -+ /* common parent entity for the queues in the burst */ -+ struct bfq_entity *burst_parent_entity; -+ /* Maximum burst size above which the current queue-activation -+ * burst is deemed as 'large'. -+ */ -+ unsigned long bfq_large_burst_thresh; -+ /* true if a large queue-activation burst is in progress */ -+ bool large_burst; -+ /* -+ * Head of the burst list (as for the above fields, more -+ * details in the comments on the function bfq_handle_burst). -+ */ -+ struct hlist_head burst_list; -+ -+ /* if set to true, low-latency heuristics are enabled */ -+ bool low_latency; -+ /* -+ * Maximum factor by which the weight of a weight-raised queue -+ * is multiplied. -+ */ -+ unsigned int bfq_wr_coeff; -+ /* maximum duration of a weight-raising period (jiffies) */ -+ unsigned int bfq_wr_max_time; -+ -+ /* Maximum weight-raising duration for soft real-time processes */ -+ unsigned int bfq_wr_rt_max_time; -+ /* -+ * Minimum idle period after which weight-raising may be -+ * reactivated for a queue (in jiffies). -+ */ -+ unsigned int bfq_wr_min_idle_time; -+ /* -+ * Minimum period between request arrivals after which -+ * weight-raising may be reactivated for an already busy async -+ * queue (in jiffies). -+ */ -+ unsigned long bfq_wr_min_inter_arr_async; -+ -+ /* Max service-rate for a soft real-time queue, in sectors/sec */ -+ unsigned int bfq_wr_max_softrt_rate; -+ /* -+ * Cached value of the product R*T, used for computing the -+ * maximum duration of weight raising automatically. 
-+ */ -+ u64 RT_prod; -+ /* device-speed class for the low-latency heuristic */ -+ enum bfq_device_speed device_speed; -+ -+ /* fallback dummy bfqq for extreme OOM conditions */ -+ struct bfq_queue oom_bfqq; -+}; -+ -+enum bfqq_state_flags { -+ BFQ_BFQQ_FLAG_just_created = 0, /* queue just allocated */ -+ BFQ_BFQQ_FLAG_busy, /* has requests or is in service */ -+ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ -+ BFQ_BFQQ_FLAG_non_blocking_wait_rq, /* -+ * waiting for a request -+ * without idling the device -+ */ -+ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ -+ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ -+ BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */ -+ BFQ_BFQQ_FLAG_sync, /* synchronous queue */ -+ BFQ_BFQQ_FLAG_IO_bound, /* -+ * bfqq has timed-out at least once -+ * having consumed at most 2/10 of -+ * its budget -+ */ -+ BFQ_BFQQ_FLAG_in_large_burst, /* -+ * bfqq activated in a large burst, -+ * see comments to bfq_handle_burst. -+ */ -+ BFQ_BFQQ_FLAG_softrt_update, /* -+ * may need softrt-next-start -+ * update -+ */ -+ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ -+ BFQ_BFQQ_FLAG_split_coop /* shared bfqq will be split */ -+}; -+ -+#define BFQ_BFQQ_FNS(name) \ -+static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ -+{ \ -+ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \ -+} \ -+static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ -+{ \ -+ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \ -+} \ -+static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ -+{ \ -+ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ -+} -+ -+BFQ_BFQQ_FNS(just_created); -+BFQ_BFQQ_FNS(busy); -+BFQ_BFQQ_FNS(wait_request); -+BFQ_BFQQ_FNS(non_blocking_wait_rq); -+BFQ_BFQQ_FNS(must_alloc); -+BFQ_BFQQ_FNS(fifo_expire); -+BFQ_BFQQ_FNS(idle_window); -+BFQ_BFQQ_FNS(sync); -+BFQ_BFQQ_FNS(IO_bound); -+BFQ_BFQQ_FNS(in_large_burst); -+BFQ_BFQQ_FNS(coop); -+BFQ_BFQQ_FNS(split_coop); -+BFQ_BFQQ_FNS(softrt_update); -+#undef 
BFQ_BFQQ_FNS -+ -+/* Logging facilities. */ -+#ifdef CONFIG_BFQ_REDIRECT_TO_CONSOLE -+ -+static const char *checked_dev_name(const struct device *dev) -+{ -+ static const char nodev[] = "nodev"; -+ -+ if (dev) -+ return dev_name(dev); -+ -+ return nodev; -+} -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); -+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ -+ char __pbuf[128]; \ -+ \ -+ assert_spin_locked((bfqd)->queue->queue_lock); \ -+ blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ -+ pr_crit("%s bfq%d%c %s " fmt "\n", \ -+ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -+ (bfqq)->pid, \ -+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ __pbuf, ##args); \ -+} while (0) -+ -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ -+ char __pbuf[128]; \ -+ \ -+ blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ -+ pr_crit("%s %s " fmt "\n", \ -+ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -+ __pbuf, ##args); \ -+} while (0) -+ -+#else /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ -+ pr_crit("%s bfq%d%c " fmt "\n", \ -+ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -+ (bfqq)->pid, bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ ##args) -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) -+ -+#endif /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ -+ -+#define bfq_log(bfqd, fmt, args...) \ -+ pr_crit("%s bfq " fmt "\n", \ -+ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -+ ##args) -+ -+#else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); -+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) 
do { \ -+ char __pbuf[128]; \ -+ \ -+ assert_spin_locked((bfqd)->queue->queue_lock); \ -+ blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ -+ blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, \ -+ (bfqq)->pid, \ -+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ __pbuf, ##args); \ -+} while (0) -+ -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ -+ char __pbuf[128]; \ -+ \ -+ blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ -+ blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \ -+} while (0) -+ -+#else /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ -+ blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \ -+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ ##args) -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) -+ -+#endif /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ -+ -+#define bfq_log(bfqd, fmt, args...) \ -+ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) -+#endif /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ -+ -+/* Expiration reasons. 
*/ -+enum bfqq_expiration { -+ BFQ_BFQQ_TOO_IDLE = 0, /* -+ * queue has been idling for -+ * too long -+ */ -+ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ -+ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ -+ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ -+ BFQ_BFQQ_PREEMPTED /* preemption in progress */ -+}; -+ -+ -+struct bfqg_stats { -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ /* number of ios merged */ -+ struct blkg_rwstat merged; -+ /* total time spent on device in ns, may not be accurate w/ queueing */ -+ struct blkg_rwstat service_time; -+ /* total time spent waiting in scheduler queue in ns */ -+ struct blkg_rwstat wait_time; -+ /* number of IOs queued up */ -+ struct blkg_rwstat queued; -+ /* total disk time and nr sectors dispatched by this group */ -+ struct blkg_stat time; -+ /* sum of number of ios queued across all samples */ -+ struct blkg_stat avg_queue_size_sum; -+ /* count of samples taken for average */ -+ struct blkg_stat avg_queue_size_samples; -+ /* how many times this group has been removed from service tree */ -+ struct blkg_stat dequeue; -+ /* total time spent waiting for it to be assigned a timeslice. */ -+ struct blkg_stat group_wait_time; -+ /* time spent idling for this blkcg_gq */ -+ struct blkg_stat idle_time; -+ /* total time with empty current active q with other requests queued */ -+ struct blkg_stat empty_time; -+ /* fields after this shouldn't be cleared on stat reset */ -+ uint64_t start_group_wait_time; -+ uint64_t start_idle_time; -+ uint64_t start_empty_time; -+ uint16_t flags; -+#endif -+}; -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+/* -+ * struct bfq_group_data - per-blkcg storage for the blkio subsystem. 
-+ * -+ * @ps: @blkcg_policy_storage that this structure inherits -+ * @weight: weight of the bfq_group -+ */ -+struct bfq_group_data { -+ /* must be the first member */ -+ struct blkcg_policy_data pd; -+ -+ unsigned int weight; -+}; -+ -+/** -+ * struct bfq_group - per (device, cgroup) data structure. -+ * @entity: schedulable entity to insert into the parent group sched_data. -+ * @sched_data: own sched_data, to contain child entities (they may be -+ * both bfq_queues and bfq_groups). -+ * @bfqd: the bfq_data for the device this group acts upon. -+ * @async_bfqq: array of async queues for all the tasks belonging to -+ * the group, one queue per ioprio value per ioprio_class, -+ * except for the idle class that has only one queue. -+ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). -+ * @my_entity: pointer to @entity, %NULL for the toplevel group; used -+ * to avoid too many special cases during group creation/ -+ * migration. -+ * @active_entities: number of active entities belonging to the group; -+ * unused for the root group. Used to know whether there -+ * are groups with more than one active @bfq_entity -+ * (see the comments to the function -+ * bfq_bfqq_may_idle()). -+ * @rq_pos_tree: rbtree sorted by next_request position, used when -+ * determining if two or more queues have interleaving -+ * requests (see bfq_find_close_cooperator()). -+ * -+ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup -+ * there is a set of bfq_groups, each one collecting the lower-level -+ * entities belonging to the group that are acting on the same device. -+ * -+ * Locking works as follows: -+ * o @bfqd is protected by the queue lock, RCU is used to access it -+ * from the readers. -+ * o All the other fields are protected by the @bfqd queue lock. 
-+ */ -+struct bfq_group { -+ /* must be the first member */ -+ struct blkg_policy_data pd; -+ -+ struct bfq_entity entity; -+ struct bfq_sched_data sched_data; -+ -+ void *bfqd; -+ -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; -+ struct bfq_queue *async_idle_bfqq; -+ -+ struct bfq_entity *my_entity; -+ -+ int active_entities; -+ -+ struct rb_root rq_pos_tree; -+ -+ struct bfqg_stats stats; -+}; -+ -+#else -+struct bfq_group { -+ struct bfq_sched_data sched_data; -+ -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; -+ struct bfq_queue *async_idle_bfqq; -+ -+ struct rb_root rq_pos_tree; -+}; -+#endif -+ -+static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); -+ -+static unsigned int bfq_class_idx(struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ return bfqq ? bfqq->ioprio_class - 1 : -+ BFQ_DEFAULT_GRP_CLASS - 1; -+} -+ -+static struct bfq_service_tree * -+bfq_entity_service_tree(struct bfq_entity *entity) -+{ -+ struct bfq_sched_data *sched_data = entity->sched_data; -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ unsigned int idx = bfq_class_idx(entity); -+ -+ BUG_ON(idx >= BFQ_IOPRIO_CLASSES); -+ BUG_ON(sched_data == NULL); -+ -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "entity_service_tree %p %d", -+ sched_data->service_tree + idx, idx); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "entity_service_tree %p %d", -+ sched_data->service_tree + idx, idx); -+ } -+#endif -+ return sched_data->service_tree + idx; -+} -+ -+static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) -+{ -+ return bic->bfqq[is_sync]; -+} -+ -+static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, -+ bool is_sync) -+{ -+ bic->bfqq[is_sync] = bfqq; -+} -+ -+static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) -+{ -+ return 
bic->icq.q->elevator->elevator_data; -+} -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ -+static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *group_entity = bfqq->entity.parent; -+ -+ if (!group_entity) -+ group_entity = &bfqq->bfqd->root_group->entity; -+ -+ return container_of(group_entity, struct bfq_group, entity); -+} -+ -+#else -+ -+static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) -+{ -+ return bfqq->bfqd->root_group; -+} -+ -+#endif -+ -+static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); -+static void bfq_put_queue(struct bfq_queue *bfqq); -+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); -+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, -+ struct bio *bio, bool is_sync, -+ struct bfq_io_cq *bic); -+static void bfq_end_wr_async_queues(struct bfq_data *bfqd, -+ struct bfq_group *bfqg); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); -+#endif -+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); -+ -+#endif /* _BFQ_H */ -diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h -index 8da66379f7ea..bf000c58644b 100644 ---- a/include/linux/blkdev.h -+++ b/include/linux/blkdev.h -@@ -54,7 +54,7 @@ struct blk_stat_callback; - * Maximum number of blkcg policies allowed to be registered concurrently. - * Defined here to simplify include dependency. 
- */ --#define BLKCG_MAX_POLS 3 -+#define BLKCG_MAX_POLS 4 - - typedef void (rq_end_io_fn)(struct request *, blk_status_t); - - -From 9916fed6c89c61a2b26053be04501784570bbec8 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Thu, 20 Jul 2017 10:46:39 +0200 -Subject: [PATCH 02/51] Add extra checks related to entity scheduling - -- extra checks related to ioprio-class changes -- specific check on st->idle in __bfq_requeue_entity - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-sched.c | 9 ++++++++- - 1 file changed, 8 insertions(+), 1 deletion(-) - -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index ac8991bca9fa..5ddf9af4261e 100644 ---- a/block/bfq-sched.c -+++ b/block/bfq-sched.c -@@ -812,6 +812,7 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, - } - #endif - -+ BUG_ON(entity->tree && update_class_too); - BUG_ON(old_st->wsum < entity->weight); - old_st->wsum -= entity->weight; - -@@ -883,8 +884,10 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, - - new_st->wsum += entity->weight; - -- if (new_st != old_st) -+ if (new_st != old_st) { -+ BUG_ON(!update_class_too); - entity->start = new_st->vtime; -+ } - } - - return new_st; -@@ -993,6 +996,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, - * tree, then it is safe to invoke next function with the last - * parameter set (see the comments on the function). - */ -+ BUG_ON(entity->tree); - st = __bfq_entity_update_weight_prio(st, entity, true); - bfq_calc_finish(entity, entity->budget); - -@@ -1113,9 +1117,11 @@ static void __bfq_activate_entity(struct bfq_entity *entity, - * check for that. - */ - bfq_idle_extract(st, entity); -+ BUG_ON(entity->tree); - entity->start = bfq_gt(min_vstart, entity->finish) ? 
- min_vstart : entity->finish; - } else { -+ BUG_ON(entity->tree); - /* - * The finish time of the entity may be invalid, and - * it is in the past for sure, otherwise the queue -@@ -1203,6 +1209,7 @@ static void __bfq_requeue_entity(struct bfq_entity *entity) - */ - bfq_calc_finish(entity, entity->service); - entity->start = entity->finish; -+ BUG_ON(entity->tree && entity->tree == &st->idle); - BUG_ON(entity->tree && entity->tree != &st->active); - /* - * In addition, if the entity had more than one child - -From 8f5b2c25dcbe31dda524e85b921b3aa1fe11d111 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Fri, 21 Jul 2017 12:08:57 +0200 -Subject: [PATCH 03/51] block, bfq: reset in_service_entity if it becomes idle - -BFQ implements hierarchical scheduling by representing each group of -queues with a generic parent entity. For each parent entity, BFQ -maintains an in_service_entity pointer: if one of the child entities -happens to be in service, in_service_entity points to it. The -resetting of these pointers happens only on queue expirations: when -the in-service queue is expired, i.e., stops to be the queue in -service, BFQ resets all in_service_entity pointers along the -parent-entity path from this queue to the root entity. - -Functions handling the scheduling of entities assume, naturally, that -in-service entities are active, i.e., have pending I/O requests (or, -as a special case, even if they have no pending requests, they are -expected to receive a new request very soon, with the scheduler idling -the storage device while waiting for such an event). Unfortunately, -the above resetting scheme of the in_service_entity pointers may cause -this assumption to be violated. For example, the in-service queue may -happen to remain without requests because of a request merge. In this -case the queue does become idle, and all related data structures are -updated accordingly. 
But in_service_entity still points to the queue -in the parent entity. This inconsistency may even propagate to -higher-level parent entities, if they happen to become idle as well, -as a consequence of the leaf queue becoming idle. For this queue and -parent entities, scheduling functions have an undefined behaviour, -and, as reported, may easily lead to kernel crashes or hangs. - -This commit addresses this issue by simply resetting the -in_service_entity field also when it is detected to point to an entity -becoming idle (regardless of why the entity becomes idle). - -Reported-by: Laurentiu Nicola <lnicola@dend.ro> -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> -Tested-by: Laurentiu Nicola <lnicola@dend.ro> ---- - block/bfq-sched.c | 4 +++- - 1 file changed, 3 insertions(+), 1 deletion(-) - -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index 5ddf9af4261e..a07a06eb5c72 100644 ---- a/block/bfq-sched.c -+++ b/block/bfq-sched.c -@@ -1336,8 +1336,10 @@ static bool __bfq_deactivate_entity(struct bfq_entity *entity, - - BUG_ON(is_in_service && entity->tree && entity->tree != &st->active); - -- if (is_in_service) -+ if (is_in_service) { - bfq_calc_finish(entity, entity->service); -+ sd->in_service_entity = NULL; -+ } - - if (entity->tree == &st->active) - bfq_active_extract(st, entity); - -From 600ea668e2d340c95724bcf981d88812d6900342 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Fri, 28 Jul 2017 21:09:51 +0200 -Subject: [PATCH 04/51] block, bfq: consider also in_service_entity to state - whether an entity is active - -Groups of BFQ queues are represented by generic entities in BFQ. When -a queue belonging to a parent entity is deactivated, the parent entity -may need to be deactivated too, in case the deactivated queue was the -only active queue for the parent entity. This deactivation may need to -be propagated upwards if the entity belongs, in its turn, to a further -higher-level entity, and so on. 
In particular, the upward propagation -of deactivation stops at the first parent entity that remains active -even if one of its child entities has been deactivated. - -To decide whether the last non-deactivation condition holds for a -parent entity, BFQ checks whether the field next_in_service is still -not NULL for the parent entity, after the deactivation of one of its -child entities. If it is not NULL, then there are certainly other active -entities in the parent entity, and deactivations can stop. - -Unfortunately, this check misses a corner case: if in_service_entity -is not NULL, then next_in_service may happen to be NULL, although the -parent entity is evidently active. This happens if: 1) the entity -pointed to by in_service_entity is the only active entity in the parent -entity, and 2) according to the definition of next_in_service, the -in_service_entity cannot be considered as next_in_service. See the -comments on the definition of next_in_service for details on this -second point. - -Hitting the above corner case causes crashes. 
- -To address this issue, this commit: -1) Extends the above check on only next_in_service to controlling both -next_in_service and in_service_entity (if any of them is not NULL, -then no further deactivation is performed) -2) Improves the (important) comments on how next_in_service is defined -and updated; in particular it fixes a few rather obscure paragraphs - -Reported-by: Eric Wheeler <bfq-sched@lists.ewheeler.net> -Reported-by: Rick Yiu <rick_yiu@htc.com> -Reported-by: Tom X Nguyen <tom81094@gmail.com> -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> -Tested-by: Eric Wheeler <bfq-sched@lists.ewheeler.net> -Tested-by: Rick Yiu <rick_yiu@htc.com> -Tested-by: Laurentiu Nicola <lnicola@dend.ro> -Tested-by: Tom X Nguyen <tom81094@gmail.com> ---- - block/bfq-sched.c | 140 ++++++++++++++++++++++++++++++------------------------ - block/bfq.h | 23 +++++++-- - 2 files changed, 95 insertions(+), 68 deletions(-) - -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index a07a06eb5c72..5c0f9290a79c 100644 ---- a/block/bfq-sched.c -+++ b/block/bfq-sched.c -@@ -196,21 +196,23 @@ static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) - - /* - * This function tells whether entity stops being a candidate for next -- * service, according to the following logic. -+ * service, according to the restrictive definition of the field -+ * next_in_service. In particular, this function is invoked for an -+ * entity that is about to be set in service. - * -- * This function is invoked for an entity that is about to be set in -- * service. If such an entity is a queue, then the entity is no longer -- * a candidate for next service (i.e, a candidate entity to serve -- * after the in-service entity is expired). The function then returns -- * true. -+ * If entity is a queue, then the entity is no longer a candidate for -+ * next service according to the that definition, because entity is -+ * about to become the in-service queue. 
This function then returns -+ * true if entity is a queue. - * -- * In contrast, the entity could stil be a candidate for next service -- * if it is not a queue, and has more than one child. In fact, even if -- * one of its children is about to be set in service, other children -- * may still be the next to serve. As a consequence, a non-queue -- * entity is not a candidate for next-service only if it has only one -- * child. And only if this condition holds, then the function returns -- * true for a non-queue entity. -+ * In contrast, entity could still be a candidate for next service if -+ * it is not a queue, and has more than one active child. In fact, -+ * even if one of its children is about to be set in service, other -+ * active children may still be the next to serve, for the parent -+ * entity, even according to the above definition. As a consequence, a -+ * non-queue entity is not a candidate for next-service only if it has -+ * only one active child. And only if this condition holds, then this -+ * function returns true for a non-queue entity. - */ - static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) - { -@@ -223,6 +225,18 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) - - BUG_ON(bfqg == ((struct bfq_data *)(bfqg->bfqd))->root_group); - BUG_ON(bfqg->active_entities == 0); -+ /* -+ * The field active_entities does not always contain the -+ * actual number of active children entities: it happens to -+ * not account for the in-service entity in case the latter is -+ * removed from its active tree (which may get done after -+ * invoking the function bfq_no_longer_next_in_service in -+ * bfq_get_next_queue). Fortunately, here, i.e., while -+ * bfq_no_longer_next_in_service is not yet completed in -+ * bfq_get_next_queue, bfq_active_extract has not yet been -+ * invoked, and thus active_entities still coincides with the -+ * actual number of active entities. 
-+ */ - if (bfqg->active_entities == 1) - return true; - -@@ -1089,7 +1103,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, - * one of its children receives a new request. - * - * Basically, this function updates the timestamps of entity and -- * inserts entity into its active tree, ater possible extracting it -+ * inserts entity into its active tree, ater possibly extracting it - * from its idle tree. - */ - static void __bfq_activate_entity(struct bfq_entity *entity, -@@ -1213,7 +1227,7 @@ static void __bfq_requeue_entity(struct bfq_entity *entity) - BUG_ON(entity->tree && entity->tree != &st->active); - /* - * In addition, if the entity had more than one child -- * when set in service, then was not extracted from -+ * when set in service, then it was not extracted from - * the active tree. This implies that the position of - * the entity in the active tree may need to be - * changed now, because we have just updated the start -@@ -1221,9 +1235,8 @@ static void __bfq_requeue_entity(struct bfq_entity *entity) - * time in a moment (the requeueing is then, more - * precisely, a repositioning in this case). To - * implement this repositioning, we: 1) dequeue the -- * entity here, 2) update the finish time and -- * requeue the entity according to the new -- * timestamps below. -+ * entity here, 2) update the finish time and requeue -+ * the entity according to the new timestamps below. - */ - if (entity->tree) - bfq_active_extract(st, entity); -@@ -1270,9 +1283,9 @@ static void __bfq_activate_requeue_entity(struct bfq_entity *entity, - - - /** -- * bfq_activate_entity - activate or requeue an entity representing a bfq_queue, -- * and activate, requeue or reposition all ancestors -- * for which such an update becomes necessary. -+ * bfq_activate_requeue_entity - activate or requeue an entity representing a bfq_queue, -+ * and activate, requeue or reposition all ancestors -+ * for which such an update becomes necessary. 
- * @entity: the entity to activate. - * @non_blocking_wait_rq: true if this entity was waiting for a request - * @requeue: true if this is a requeue, which implies that bfqq is -@@ -1308,9 +1321,9 @@ static void bfq_activate_requeue_entity(struct bfq_entity *entity, - * @ins_into_idle_tree: if false, the entity will not be put into the - * idle tree. - * -- * Deactivates an entity, independently from its previous state. Must -+ * Deactivates an entity, independently of its previous state. Must - * be invoked only if entity is on a service tree. Extracts the entity -- * from that tree, and if necessary and allowed, puts it on the idle -+ * from that tree, and if necessary and allowed, puts it into the idle - * tree. - */ - static bool __bfq_deactivate_entity(struct bfq_entity *entity, -@@ -1359,7 +1372,7 @@ static bool __bfq_deactivate_entity(struct bfq_entity *entity, - /** - * bfq_deactivate_entity - deactivate an entity representing a bfq_queue. - * @entity: the entity to deactivate. -- * @ins_into_idle_tree: true if the entity can be put on the idle tree -+ * @ins_into_idle_tree: true if the entity can be put into the idle tree - */ - static void bfq_deactivate_entity(struct bfq_entity *entity, - bool ins_into_idle_tree, -@@ -1406,16 +1419,29 @@ static void bfq_deactivate_entity(struct bfq_entity *entity, - */ - bfq_update_next_in_service(sd, NULL); - -- if (sd->next_in_service) { -+ if (sd->next_in_service || sd->in_service_entity) { - /* -- * The parent entity is still backlogged, -- * because next_in_service is not NULL. So, no -- * further upwards deactivation must be -- * performed. Yet, next_in_service has -- * changed. Then the schedule does need to be -- * updated upwards. -+ * The parent entity is still active, because -+ * either next_in_service or in_service_entity -+ * is not NULL. So, no further upwards -+ * deactivation must be performed. Yet, -+ * next_in_service has changed. Then the -+ * schedule does need to be updated upwards. 
-+ * -+ * NOTE If in_service_entity is not NULL, then -+ * next_in_service may happen to be NULL, -+ * although the parent entity is evidently -+ * active. This happens if 1) the entity -+ * pointed by in_service_entity is the only -+ * active entity in the parent entity, and 2) -+ * according to the definition of -+ * next_in_service, the in_service_entity -+ * cannot be considered as -+ * next_in_service. See the comments on the -+ * definition of next_in_service for details. - */ - BUG_ON(sd->next_in_service == entity); -+ BUG_ON(sd->in_service_entity == entity); - break; - } - -@@ -1806,45 +1832,33 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) - - /* - * If entity is no longer a candidate for next -- * service, then we extract it from its active tree, -- * for the following reason. To further boost the -- * throughput in some special case, BFQ needs to know -- * which is the next candidate entity to serve, while -- * there is already an entity in service. In this -- * respect, to make it easy to compute/update the next -- * candidate entity to serve after the current -- * candidate has been set in service, there is a case -- * where it is necessary to extract the current -- * candidate from its service tree. Such a case is -- * when the entity just set in service cannot be also -- * a candidate for next service. Details about when -- * this conditions holds are reported in the comments -- * on the function bfq_no_longer_next_in_service() -- * invoked below. -+ * service, then it must be extracted from its active -+ * tree, so as to make sure that it won't be -+ * considered when computing next_in_service. See the -+ * comments on the function -+ * bfq_no_longer_next_in_service() for details. 
- */ - if (bfq_no_longer_next_in_service(entity)) - bfq_active_extract(bfq_entity_service_tree(entity), - entity); - - /* -- * For the same reason why we may have just extracted -- * entity from its active tree, we may need to update -- * next_in_service for the sched_data of entity too, -- * regardless of whether entity has been extracted. -- * In fact, even if entity has not been extracted, a -- * descendant entity may get extracted. Such an event -- * would cause a change in next_in_service for the -- * level of the descendant entity, and thus possibly -- * back to upper levels. -+ * Even if entity is not to be extracted according to -+ * the above check, a descendant entity may get -+ * extracted in one of the next iterations of this -+ * loop. Such an event could cause a change in -+ * next_in_service for the level of the descendant -+ * entity, and thus possibly back to this level. - * -- * We cannot perform the resulting needed update -- * before the end of this loop, because, to know which -- * is the correct next-to-serve candidate entity for -- * each level, we need first to find the leaf entity -- * to set in service. In fact, only after we know -- * which is the next-to-serve leaf entity, we can -- * discover whether the parent entity of the leaf -- * entity becomes the next-to-serve, and so on. -+ * However, we cannot perform the resulting needed -+ * update of next_in_service for this level before the -+ * end of the whole loop, because, to know which is -+ * the correct next-to-serve candidate entity for each -+ * level, we need first to find the leaf entity to set -+ * in service. In fact, only after we know which is -+ * the next-to-serve leaf entity, we can discover -+ * whether the parent entity of the leaf entity -+ * becomes the next-to-serve, and so on. 
- */ - - /* Log some information */ -diff --git a/block/bfq.h b/block/bfq.h -index f5751ea59d98..ebd9688b9f61 100644 ---- a/block/bfq.h -+++ b/block/bfq.h -@@ -68,17 +68,30 @@ struct bfq_service_tree { - * - * bfq_sched_data is the basic scheduler queue. It supports three - * ioprio_classes, and can be used either as a toplevel queue or as an -- * intermediate queue on a hierarchical setup. @next_in_service -- * points to the active entity of the sched_data service trees that -- * will be scheduled next. It is used to reduce the number of steps -- * needed for each hierarchical-schedule update. -+ * intermediate queue in a hierarchical setup. - * - * The supported ioprio_classes are the same as in CFQ, in descending - * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. - * Requests from higher priority queues are served before all the - * requests from lower priority queues; among requests of the same - * queue requests are served according to B-WF2Q+. -- * All the fields are protected by the queue lock of the containing bfqd. -+ * -+ * The schedule is implemented by the service trees, plus the field -+ * @next_in_service, which points to the entity on the active trees -+ * that will be served next, if 1) no changes in the schedule occurs -+ * before the current in-service entity is expired, 2) the in-service -+ * queue becomes idle when it expires, and 3) if the entity pointed by -+ * in_service_entity is not a queue, then the in-service child entity -+ * of the entity pointed by in_service_entity becomes idle on -+ * expiration. This peculiar definition allows for the following -+ * optimization, not yet exploited: while a given entity is still in -+ * service, we already know which is the best candidate for next -+ * service among the other active entitities in the same parent -+ * entity. We can then quickly compare the timestamps of the -+ * in-service entity with those of such best candidate. 
-+ * -+ * All the fields are protected by the queue lock of the containing -+ * bfqd. - */ - struct bfq_sched_data { - struct bfq_entity *in_service_entity; /* entity in service */ - -From 6b5effd10bc6711a862e7cbd7cd2dd0146defa01 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Thu, 4 May 2017 10:53:43 +0200 -Subject: [PATCH 05/51] block, bfq: improve and refactor throughput-boosting - logic - -When a queue associated with a process remains empty, there are cases -where throughput gets boosted if the device is idled to await the -arrival of a new I/O request for that queue. Currently, BFQ assumes -that one of these cases is when the device has no internal queueing -(regardless of the properties of the I/O being served). Unfortunately, -this condition has proved to be too general. So, this commit refines it -as "the device has no internal queueing and is rotational". - -This refinement provides a significant throughput boost with random -I/O, on flash-based storage without internal queueing. For example, on -a HiKey board, throughput increases by up to 125%, growing, e.g., from -6.9MB/s to 15.6MB/s with two or three random readers in parallel. - -This commit also refactors the code related to device idling, for the -following reason. Finding the change that provides the above large -improvement has been slightly more difficult than it had to be, -because the logic that decides whether to idle the device is still -scattered across three functions. Almost all of the logic is in the -function bfq_bfqq_may_idle, but (1) part of the decision is made in -bfq_update_idle_window, and (2) the function bfq_bfqq_must_idle may -switch off idling regardless of the output of bfq_bfqq_may_idle. In -addition, both bfq_update_idle_window and bfq_bfqq_must_idle make -their decisions as a function of parameters that are used, for similar -purposes, also in bfq_bfqq_may_idle. 
This commit addresses this issue -by moving all the logic into bfq_bfqq_may_idle. - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> -Signed-off-by: Luca Miccio <lucmiccio@gmail.com> ---- - block/bfq-sq-iosched.c | 141 +++++++++++++++++++++++++++---------------------- - block/bfq.h | 12 ++--- - 2 files changed, 83 insertions(+), 70 deletions(-) - -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index 65e7c7e77f3c..30d019fc67e0 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -684,10 +684,10 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, - unsigned int old_wr_coeff; - bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq); - -- if (bic->saved_idle_window) -- bfq_mark_bfqq_idle_window(bfqq); -+ if (bic->saved_has_short_ttime) -+ bfq_mark_bfqq_has_short_ttime(bfqq); - else -- bfq_clear_bfqq_idle_window(bfqq); -+ bfq_clear_bfqq_has_short_ttime(bfqq); - - if (bic->saved_IO_bound) - bfq_mark_bfqq_IO_bound(bfqq); -@@ -2047,7 +2047,7 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) - if (!bic) - return; - -- bic->saved_idle_window = bfq_bfqq_idle_window(bfqq); -+ bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq); - bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); - bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); - bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); -@@ -3214,9 +3214,9 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, - } - - bfq_log_bfqq(bfqd, bfqq, -- "expire (%d, slow %d, num_disp %d, idle_win %d, weight %d)", -+ "expire (%d, slow %d, num_disp %d, short_ttime %d, weight %d)", - reason, slow, bfqq->dispatched, -- bfq_bfqq_idle_window(bfqq), entity->weight); -+ bfq_bfqq_has_short_ttime(bfqq), entity->weight); - - /* - * Increase, decrease or leave budget unchanged according to -@@ -3298,7 +3298,10 @@ static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) - static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) - { - struct 
bfq_data *bfqd = bfqq->bfqd; -- bool idling_boosts_thr, idling_boosts_thr_without_issues, -+ bool rot_without_queueing = -+ !blk_queue_nonrot(bfqd->queue) && !bfqd->hw_tag, -+ bfqq_sequential_and_IO_bound, -+ idling_boosts_thr, idling_boosts_thr_without_issues, - idling_needed_for_service_guarantees, - asymmetric_scenario; - -@@ -3306,27 +3309,44 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) - return true; - - /* -+ * Idling is performed only if slice_idle > 0. In addition, we -+ * do not idle if -+ * (a) bfqq is async -+ * (b) bfqq is in the idle io prio class: in this case we do -+ * not idle because we want to minimize the bandwidth that -+ * queues in this class can steal to higher-priority queues -+ */ -+ if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) || -+ bfq_class_idle(bfqq)) -+ return false; -+ -+ bfqq_sequential_and_IO_bound = !BFQQ_SEEKY(bfqq) && -+ bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_has_short_ttime(bfqq); -+ /* - * The next variable takes into account the cases where idling - * boosts the throughput. - * - * The value of the variable is computed considering, first, that - * idling is virtually always beneficial for the throughput if: -- * (a) the device is not NCQ-capable, or -- * (b) regardless of the presence of NCQ, the device is rotational -- * and the request pattern for bfqq is I/O-bound and sequential. -+ * (a) the device is not NCQ-capable and rotational, or -+ * (b) regardless of the presence of NCQ, the device is rotational and -+ * the request pattern for bfqq is I/O-bound and sequential, or -+ * (c) regardless of whether it is rotational, the device is -+ * not NCQ-capable and the request pattern for bfqq is -+ * I/O-bound and sequential. - * - * Secondly, and in contrast to the above item (b), idling an - * NCQ-capable flash-based device would not boost the - * throughput even with sequential I/O; rather it would lower - * the throughput in proportion to how fast the device - * is. 
Accordingly, the next variable is true if any of the -- * above conditions (a) and (b) is true, and, in particular, -- * happens to be false if bfqd is an NCQ-capable flash-based -- * device. -+ * above conditions (a), (b) or (c) is true, and, in -+ * particular, happens to be false if bfqd is an NCQ-capable -+ * flash-based device. - */ -- idling_boosts_thr = !bfqd->hw_tag || -- (!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq) && -- bfq_bfqq_idle_window(bfqq)); -+ idling_boosts_thr = rot_without_queueing || -+ ((!blk_queue_nonrot(bfqd->queue) || !bfqd->hw_tag) && -+ bfqq_sequential_and_IO_bound); - - /* - * The value of the next variable, -@@ -3497,12 +3517,10 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) - asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq); - - /* -- * We have now all the components we need to compute the return -- * value of the function, which is true only if both the following -- * conditions hold: -- * 1) bfqq is sync, because idling make sense only for sync queues; -- * 2) idling either boosts the throughput (without issues), or -- * is necessary to preserve service guarantees. -+ * We have now all the components we need to compute the -+ * return value of the function, which is true only if idling -+ * either boosts the throughput (without issues), or is -+ * necessary to preserve service guarantees. 
- */ - bfq_log_bfqq(bfqd, bfqq, "may_idle: sync %d idling_boosts_thr %d", - bfq_bfqq_sync(bfqq), idling_boosts_thr); -@@ -3514,9 +3532,8 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) - bfq_bfqq_IO_bound(bfqq), - idling_needed_for_service_guarantees); - -- return bfq_bfqq_sync(bfqq) && -- (idling_boosts_thr_without_issues || -- idling_needed_for_service_guarantees); -+ return idling_boosts_thr_without_issues || -+ idling_needed_for_service_guarantees; - } - - /* -@@ -3532,10 +3549,7 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) - */ - static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) - { -- struct bfq_data *bfqd = bfqq->bfqd; -- -- return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 && -- bfq_bfqq_may_idle(bfqq); -+ return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_may_idle(bfqq); - } - - /* -@@ -3994,7 +4008,6 @@ static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, - case IOPRIO_CLASS_IDLE: - bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE; - bfqq->new_ioprio = 7; -- bfq_clear_bfqq_idle_window(bfqq); - break; - } - -@@ -4058,8 +4071,14 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bfq_set_next_ioprio_data(bfqq, bic); - - if (is_sync) { -+ /* -+ * No need to mark as has_short_ttime if in -+ * idle_class, because no device idling is performed -+ * for queues in idle class -+ */ - if (!bfq_class_idle(bfqq)) -- bfq_mark_bfqq_idle_window(bfqq); -+ /* tentatively mark as has_short_ttime */ -+ bfq_mark_bfqq_has_short_ttime(bfqq); - bfq_mark_bfqq_sync(bfqq); - bfq_mark_bfqq_just_created(bfqq); - } else -@@ -4195,18 +4214,19 @@ bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, - blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT); - } - --/* -- * Disable idle window if the process thinks too long or seeks so much that -- * it doesn't matter. 
-- */ --static void bfq_update_idle_window(struct bfq_data *bfqd, -- struct bfq_queue *bfqq, -- struct bfq_io_cq *bic) -+static void bfq_update_has_short_ttime(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ struct bfq_io_cq *bic) - { -- int enable_idle; -+ bool has_short_ttime = true; - -- /* Don't idle for async or idle io prio class. */ -- if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) -+ /* -+ * No need to update has_short_ttime if bfqq is async or in -+ * idle io prio class, or if bfq_slice_idle is zero, because -+ * no device idling is performed for bfqq in this case. -+ */ -+ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq) || -+ bfqd->bfq_slice_idle == 0) - return; - - /* Idle window just restored, statistics are meaningless. */ -@@ -4214,27 +4234,22 @@ static void bfq_update_idle_window(struct bfq_data *bfqd, - bfqd->bfq_wr_min_idle_time)) - return; - -- enable_idle = bfq_bfqq_idle_window(bfqq); -- -+ /* Think time is infinite if no process is linked to -+ * bfqq. Otherwise check average think time to -+ * decide whether to mark as has_short_ttime -+ */ - if (atomic_read(&bic->icq.ioc->active_ref) == 0 || -- bfqd->bfq_slice_idle == 0 || -- (bfqd->hw_tag && BFQQ_SEEKY(bfqq) && -- bfqq->wr_coeff == 1)) -- enable_idle = 0; -- else if (bfq_sample_valid(bic->ttime.ttime_samples)) { -- if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle && -- bfqq->wr_coeff == 1) -- enable_idle = 0; -- else -- enable_idle = 1; -- } -- bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d", -- enable_idle); -+ (bfq_sample_valid(bic->ttime.ttime_samples) && -+ bic->ttime.ttime_mean > bfqd->bfq_slice_idle)) -+ has_short_ttime = false; -+ -+ bfq_log_bfqq(bfqd, bfqq, "update_has_short_ttime: has_short_ttime %d", -+ has_short_ttime); - -- if (enable_idle) -- bfq_mark_bfqq_idle_window(bfqq); -+ if (has_short_ttime) -+ bfq_mark_bfqq_has_short_ttime(bfqq); - else -- bfq_clear_bfqq_idle_window(bfqq); -+ bfq_clear_bfqq_has_short_ttime(bfqq); - } - - /* -@@ -4250,14 +4265,12 @@ 
static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bfqq->meta_pending++; - - bfq_update_io_thinktime(bfqd, bic); -+ bfq_update_has_short_ttime(bfqd, bfqq, bic); - bfq_update_io_seektime(bfqd, bfqq, rq); -- if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || -- !BFQQ_SEEKY(bfqq)) -- bfq_update_idle_window(bfqd, bfqq, bic); - - bfq_log_bfqq(bfqd, bfqq, -- "rq_enqueued: idle_window=%d (seeky %d)", -- bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq)); -+ "rq_enqueued: has_short_ttime=%d (seeky %d)", -+ bfq_bfqq_has_short_ttime(bfqq), BFQQ_SEEKY(bfqq)); - - bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); - -diff --git a/block/bfq.h b/block/bfq.h -index ebd9688b9f61..34fc4697fd89 100644 ---- a/block/bfq.h -+++ b/block/bfq.h -@@ -349,11 +349,11 @@ struct bfq_io_cq { - #endif - - /* -- * Snapshot of the idle window before merging; taken to -- * remember this value while the queue is merged, so as to be -- * able to restore it in case of split. -+ * Snapshot of the has_short_time flag before merging; taken -+ * to remember its value while the queue is merged, so as to -+ * be able to restore it in case of split. - */ -- bool saved_idle_window; -+ bool saved_has_short_ttime; - /* - * Same purpose as the previous two fields for the I/O bound - * classification of a queue. 
-@@ -610,7 +610,7 @@ enum bfqq_state_flags { - */ - BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ - BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ -- BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */ -+ BFQ_BFQQ_FLAG_has_short_ttime, /* queue has a short think time */ - BFQ_BFQQ_FLAG_sync, /* synchronous queue */ - BFQ_BFQQ_FLAG_IO_bound, /* - * bfqq has timed-out at least once -@@ -649,7 +649,7 @@ BFQ_BFQQ_FNS(wait_request); - BFQ_BFQQ_FNS(non_blocking_wait_rq); - BFQ_BFQQ_FNS(must_alloc); - BFQ_BFQQ_FNS(fifo_expire); --BFQ_BFQQ_FNS(idle_window); -+BFQ_BFQQ_FNS(has_short_ttime); - BFQ_BFQQ_FNS(sync); - BFQ_BFQQ_FNS(IO_bound); - BFQ_BFQQ_FNS(in_large_burst); - -From b5e746fa99d961a5642cffb27c19a77e8b638007 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Mon, 19 Dec 2016 16:59:33 +0100 -Subject: [PATCH 06/51] FIRST BFQ-MQ COMMIT: Copy bfq-sq-iosched.c as - bfq-mq-iosched.c - -This commit introduces bfq-mq-iosched.c, the main source file that -will contain the code of bfq for blk-mq. I name tentatively -bfq-mq this version of bfq. - -For the moment, the file bfq-mq-iosched.c is just a copy of -bfq-sq-iosched.c, i.e, of the main source file of bfq for blk. - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 5392 ++++++++++++++++++++++++++++++++++++++++++++++++ - 1 file changed, 5392 insertions(+) - create mode 100644 block/bfq-mq-iosched.c - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -new file mode 100644 -index 000000000000..30d019fc67e0 ---- /dev/null -+++ b/block/bfq-mq-iosched.c -@@ -0,0 +1,5392 @@ -+/* -+ * Budget Fair Queueing (BFQ) I/O scheduler. 
-+ * -+ * Based on ideas and code from CFQ: -+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> -+ * -+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> -+ * Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org> -+ * -+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ -+ * file. -+ * -+ * BFQ is a proportional-share I/O scheduler, with some extra -+ * low-latency capabilities. BFQ also supports full hierarchical -+ * scheduling through cgroups. Next paragraphs provide an introduction -+ * on BFQ inner workings. Details on BFQ benefits and usage can be -+ * found in Documentation/block/bfq-iosched.txt. -+ * -+ * BFQ is a proportional-share storage-I/O scheduling algorithm based -+ * on the slice-by-slice service scheme of CFQ. But BFQ assigns -+ * budgets, measured in number of sectors, to processes instead of -+ * time slices. The device is not granted to the in-service process -+ * for a given time slice, but until it has exhausted its assigned -+ * budget. This change from the time to the service domain enables BFQ -+ * to distribute the device throughput among processes as desired, -+ * without any distortion due to throughput fluctuations, or to device -+ * internal queueing. BFQ uses an ad hoc internal scheduler, called -+ * B-WF2Q+, to schedule processes according to their budgets. More -+ * precisely, BFQ schedules queues associated with processes. Thanks to -+ * the accurate policy of B-WF2Q+, BFQ can afford to assign high -+ * budgets to I/O-bound processes issuing sequential requests (to -+ * boost the throughput), and yet guarantee a low latency to -+ * interactive and soft real-time applications. 
-+ * -+ * NOTE: if the main or only goal, with a given device, is to achieve -+ * the maximum-possible throughput at all times, then do switch off -+ * all low-latency heuristics for that device, by setting low_latency -+ * to 0. -+ * -+ * BFQ is described in [1], where also a reference to the initial, more -+ * theoretical paper on BFQ can be found. The interested reader can find -+ * in the latter paper full details on the main algorithm, as well as -+ * formulas of the guarantees and formal proofs of all the properties. -+ * With respect to the version of BFQ presented in these papers, this -+ * implementation adds a few more heuristics, such as the one that -+ * guarantees a low latency to soft real-time applications, and a -+ * hierarchical extension based on H-WF2Q+. -+ * -+ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with -+ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N) -+ * complexity derives from the one introduced with EEVDF in [3]. -+ * -+ * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O -+ * Scheduler", Proceedings of the First Workshop on Mobile System -+ * Technologies (MST-2015), May 2015. -+ * http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf -+ * -+ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf -+ * -+ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing -+ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689, -+ * Oct 1997. -+ * -+ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz -+ * -+ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline -+ * First: A Flexible and Accurate Mechanism for Proportional Share -+ * Resource Allocation,'' technical report. 
-+ * -+ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf -+ */ -+#include <linux/module.h> -+#include <linux/slab.h> -+#include <linux/blkdev.h> -+#include <linux/cgroup.h> -+#include <linux/elevator.h> -+#include <linux/jiffies.h> -+#include <linux/rbtree.h> -+#include <linux/ioprio.h> -+#include "blk.h" -+#include "bfq.h" -+ -+/* Expiration time of sync (0) and async (1) requests, in ns. */ -+static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; -+ -+/* Maximum backwards seek, in KiB. */ -+static const int bfq_back_max = (16 * 1024); -+ -+/* Penalty of a backwards seek, in number of sectors. */ -+static const int bfq_back_penalty = 2; -+ -+/* Idling period duration, in ns. */ -+static u32 bfq_slice_idle = (NSEC_PER_SEC / 125); -+ -+/* Minimum number of assigned budgets for which stats are safe to compute. */ -+static const int bfq_stats_min_budgets = 194; -+ -+/* Default maximum budget values, in sectors and number of requests. */ -+static const int bfq_default_max_budget = (16 * 1024); -+ -+/* -+ * Async to sync throughput distribution is controlled as follows: -+ * when an async request is served, the entity is charged the number -+ * of sectors of the request, multiplied by the factor below -+ */ -+static const int bfq_async_charge_factor = 10; -+ -+/* Default timeout values, in jiffies, approximating CFQ defaults. */ -+static const int bfq_timeout = (HZ / 8); -+ -+static struct kmem_cache *bfq_pool; -+ -+/* Below this threshold (in ns), we consider thinktime immediate. */ -+#define BFQ_MIN_TT (2 * NSEC_PER_MSEC) -+ -+/* hw_tag detection: parallel requests threshold and min samples needed. 
*/ -+#define BFQ_HW_QUEUE_THRESHOLD 4 -+#define BFQ_HW_QUEUE_SAMPLES 32 -+ -+#define BFQQ_SEEK_THR (sector_t)(8 * 100) -+#define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32) -+#define BFQQ_CLOSE_THR (sector_t)(8 * 1024) -+#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8) -+ -+/* Min number of samples required to perform peak-rate update */ -+#define BFQ_RATE_MIN_SAMPLES 32 -+/* Min observation time interval required to perform a peak-rate update (ns) */ -+#define BFQ_RATE_MIN_INTERVAL (300*NSEC_PER_MSEC) -+/* Target observation time interval for a peak-rate update (ns) */ -+#define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC -+ -+/* Shift used for peak rate fixed precision calculations. */ -+#define BFQ_RATE_SHIFT 16 -+ -+/* -+ * By default, BFQ computes the duration of the weight raising for -+ * interactive applications automatically, using the following formula: -+ * duration = (R / r) * T, where r is the peak rate of the device, and -+ * R and T are two reference parameters. -+ * In particular, R is the peak rate of the reference device (see below), -+ * and T is a reference time: given the systems that are likely to be -+ * installed on the reference device according to its speed class, T is -+ * about the maximum time needed, under BFQ and while reading two files in -+ * parallel, to load typical large applications on these systems. -+ * In practice, the slower/faster the device at hand is, the more/less it -+ * takes to load applications with respect to the reference device. -+ * Accordingly, the longer/shorter BFQ grants weight raising to interactive -+ * applications. -+ * -+ * BFQ uses four different reference pairs (R, T), depending on: -+ * . whether the device is rotational or non-rotational; -+ * . whether the device is slow, such as old or portable HDDs, as well as -+ * SD cards, or fast, such as newer HDDs and SSDs. 
-+ * -+ * The device's speed class is dynamically (re)detected in -+ * bfq_update_peak_rate() every time the estimated peak rate is updated. -+ * -+ * In the following definitions, R_slow[0]/R_fast[0] and -+ * T_slow[0]/T_fast[0] are the reference values for a slow/fast -+ * rotational device, whereas R_slow[1]/R_fast[1] and -+ * T_slow[1]/T_fast[1] are the reference values for a slow/fast -+ * non-rotational device. Finally, device_speed_thresh are the -+ * thresholds used to switch between speed classes. The reference -+ * rates are not the actual peak rates of the devices used as a -+ * reference, but slightly lower values. The reason for using these -+ * slightly lower values is that the peak-rate estimator tends to -+ * yield slightly lower values than the actual peak rate (it can yield -+ * the actual peak rate only if there is only one process doing I/O, -+ * and the process does sequential I/O). -+ * -+ * Both the reference peak rates and the thresholds are measured in -+ * sectors/usec, left-shifted by BFQ_RATE_SHIFT. -+ */ -+static int R_slow[2] = {1000, 10700}; -+static int R_fast[2] = {14000, 33000}; -+/* -+ * To improve readability, a conversion function is used to initialize the -+ * following arrays, which entails that they can be initialized only in a -+ * function. 
-+ */ -+static int T_slow[2]; -+static int T_fast[2]; -+static int device_speed_thresh[2]; -+ -+#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ -+ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) -+ -+#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0]) -+#define RQ_BFQQ(rq) ((rq)->elv.priv[1]) -+ -+static void bfq_schedule_dispatch(struct bfq_data *bfqd); -+ -+#include "bfq-ioc.c" -+#include "bfq-sched.c" -+#include "bfq-cgroup-included.c" -+ -+#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE) -+#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT) -+ -+#define bfq_sample_valid(samples) ((samples) > 80) -+ -+/* -+ * Scheduler run of queue, if there are requests pending and no one in the -+ * driver that will restart queueing. -+ */ -+static void bfq_schedule_dispatch(struct bfq_data *bfqd) -+{ -+ if (bfqd->queued != 0) { -+ bfq_log(bfqd, "schedule dispatch"); -+ kblockd_schedule_work(&bfqd->unplug_work); -+ } -+} -+ -+/* -+ * Lifted from AS - choose which of rq1 and rq2 that is best served now. -+ * We choose the request that is closesr to the head right now. Distance -+ * behind the head is penalized and only allowed to a certain extent. -+ */ -+static struct request *bfq_choose_req(struct bfq_data *bfqd, -+ struct request *rq1, -+ struct request *rq2, -+ sector_t last) -+{ -+ sector_t s1, s2, d1 = 0, d2 = 0; -+ unsigned long back_max; -+#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */ -+#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */ -+ unsigned int wrap = 0; /* bit mask: requests behind the disk head? 
*/ -+ -+ if (!rq1 || rq1 == rq2) -+ return rq2; -+ if (!rq2) -+ return rq1; -+ -+ if (rq_is_sync(rq1) && !rq_is_sync(rq2)) -+ return rq1; -+ else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) -+ return rq2; -+ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) -+ return rq1; -+ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META)) -+ return rq2; -+ -+ s1 = blk_rq_pos(rq1); -+ s2 = blk_rq_pos(rq2); -+ -+ /* -+ * By definition, 1KiB is 2 sectors. -+ */ -+ back_max = bfqd->bfq_back_max * 2; -+ -+ /* -+ * Strict one way elevator _except_ in the case where we allow -+ * short backward seeks which are biased as twice the cost of a -+ * similar forward seek. -+ */ -+ if (s1 >= last) -+ d1 = s1 - last; -+ else if (s1 + back_max >= last) -+ d1 = (last - s1) * bfqd->bfq_back_penalty; -+ else -+ wrap |= BFQ_RQ1_WRAP; -+ -+ if (s2 >= last) -+ d2 = s2 - last; -+ else if (s2 + back_max >= last) -+ d2 = (last - s2) * bfqd->bfq_back_penalty; -+ else -+ wrap |= BFQ_RQ2_WRAP; -+ -+ /* Found required data */ -+ -+ /* -+ * By doing switch() on the bit mask "wrap" we avoid having to -+ * check two variables for all permutations: --> faster! -+ */ -+ switch (wrap) { -+ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ -+ if (d1 < d2) -+ return rq1; -+ else if (d2 < d1) -+ return rq2; -+ -+ if (s1 >= s2) -+ return rq1; -+ else -+ return rq2; -+ -+ case BFQ_RQ2_WRAP: -+ return rq1; -+ case BFQ_RQ1_WRAP: -+ return rq2; -+ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */ -+ default: -+ /* -+ * Since both rqs are wrapped, -+ * start with the one that's further behind head -+ * (--> only *one* back seek required), -+ * since back seek takes more time than forward. 
-+ */ -+ if (s1 <= s2) -+ return rq1; -+ else -+ return rq2; -+ } -+} -+ -+static struct bfq_queue * -+bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, -+ sector_t sector, struct rb_node **ret_parent, -+ struct rb_node ***rb_link) -+{ -+ struct rb_node **p, *parent; -+ struct bfq_queue *bfqq = NULL; -+ -+ parent = NULL; -+ p = &root->rb_node; -+ while (*p) { -+ struct rb_node **n; -+ -+ parent = *p; -+ bfqq = rb_entry(parent, struct bfq_queue, pos_node); -+ -+ /* -+ * Sort strictly based on sector. Smallest to the left, -+ * largest to the right. -+ */ -+ if (sector > blk_rq_pos(bfqq->next_rq)) -+ n = &(*p)->rb_right; -+ else if (sector < blk_rq_pos(bfqq->next_rq)) -+ n = &(*p)->rb_left; -+ else -+ break; -+ p = n; -+ bfqq = NULL; -+ } -+ -+ *ret_parent = parent; -+ if (rb_link) -+ *rb_link = p; -+ -+ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d", -+ (unsigned long long) sector, -+ bfqq ? bfqq->pid : 0); -+ -+ return bfqq; -+} -+ -+static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct rb_node **p, *parent; -+ struct bfq_queue *__bfqq; -+ -+ if (bfqq->pos_root) { -+ rb_erase(&bfqq->pos_node, bfqq->pos_root); -+ bfqq->pos_root = NULL; -+ } -+ -+ if (bfq_class_idle(bfqq)) -+ return; -+ if (!bfqq->next_rq) -+ return; -+ -+ bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, -+ blk_rq_pos(bfqq->next_rq), &parent, &p); -+ if (!__bfqq) { -+ rb_link_node(&bfqq->pos_node, parent, p); -+ rb_insert_color(&bfqq->pos_node, bfqq->pos_root); -+ } else -+ bfqq->pos_root = NULL; -+} -+ -+/* -+ * Tell whether there are active queues or groups with differentiated weights. -+ */ -+static bool bfq_differentiated_weights(struct bfq_data *bfqd) -+{ -+ /* -+ * For weights to differ, at least one of the trees must contain -+ * at least two nodes. 
-+ */ -+ return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && -+ (bfqd->queue_weights_tree.rb_node->rb_left || -+ bfqd->queue_weights_tree.rb_node->rb_right) -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ ) || -+ (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) && -+ (bfqd->group_weights_tree.rb_node->rb_left || -+ bfqd->group_weights_tree.rb_node->rb_right) -+#endif -+ ); -+} -+ -+/* -+ * The following function returns true if every queue must receive the -+ * same share of the throughput (this condition is used when deciding -+ * whether idling may be disabled, see the comments in the function -+ * bfq_bfqq_may_idle()). -+ * -+ * Such a scenario occurs when: -+ * 1) all active queues have the same weight, -+ * 2) all active groups at the same level in the groups tree have the same -+ * weight, -+ * 3) all active groups at the same level in the groups tree have the same -+ * number of children. -+ * -+ * Unfortunately, keeping the necessary state for evaluating exactly the -+ * above symmetry conditions would be quite complex and time-consuming. -+ * Therefore this function evaluates, instead, the following stronger -+ * sub-conditions, for which it is much easier to maintain the needed -+ * state: -+ * 1) all active queues have the same weight, -+ * 2) all active groups have the same weight, -+ * 3) all active groups have at most one active child each. -+ * In particular, the last two conditions are always true if hierarchical -+ * support and the cgroups interface are not enabled, thus no state needs -+ * to be maintained in this case. -+ */ -+static bool bfq_symmetric_scenario(struct bfq_data *bfqd) -+{ -+ return !bfq_differentiated_weights(bfqd); -+} -+ -+/* -+ * If the weight-counter tree passed as input contains no counter for -+ * the weight of the input entity, then add that counter; otherwise just -+ * increment the existing counter. -+ * -+ * Note that weight-counter trees contain few nodes in mostly symmetric -+ * scenarios. 
For example, if all queues have the same weight, then the -+ * weight-counter tree for the queues may contain at most one node. -+ * This holds even if low_latency is on, because weight-raised queues -+ * are not inserted in the tree. -+ * In most scenarios, the rate at which nodes are created/destroyed -+ * should be low too. -+ */ -+static void bfq_weights_tree_add(struct bfq_data *bfqd, -+ struct bfq_entity *entity, -+ struct rb_root *root) -+{ -+ struct rb_node **new = &(root->rb_node), *parent = NULL; -+ -+ /* -+ * Do not insert if the entity is already associated with a -+ * counter, which happens if: -+ * 1) the entity is associated with a queue, -+ * 2) a request arrival has caused the queue to become both -+ * non-weight-raised, and hence change its weight, and -+ * backlogged; in this respect, each of the two events -+ * causes an invocation of this function, -+ * 3) this is the invocation of this function caused by the -+ * second event. This second invocation is actually useless, -+ * and we handle this fact by exiting immediately. More -+ * efficient or clearer solutions might possibly be adopted. -+ */ -+ if (entity->weight_counter) -+ return; -+ -+ while (*new) { -+ struct bfq_weight_counter *__counter = container_of(*new, -+ struct bfq_weight_counter, -+ weights_node); -+ parent = *new; -+ -+ if (entity->weight == __counter->weight) { -+ entity->weight_counter = __counter; -+ goto inc_counter; -+ } -+ if (entity->weight < __counter->weight) -+ new = &((*new)->rb_left); -+ else -+ new = &((*new)->rb_right); -+ } -+ -+ entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter), -+ GFP_ATOMIC); -+ -+ /* -+ * In the unlucky event of an allocation failure, we just -+ * exit. This will cause the weight of entity to not be -+ * considered in bfq_differentiated_weights, which, in its -+ * turn, causes the scenario to be deemed wrongly symmetric in -+ * case entity's weight would have been the only weight making -+ * the scenario asymmetric. 
On the bright side, no unbalance -+ * will however occur when entity becomes inactive again (the -+ * invocation of this function is triggered by an activation -+ * of entity). In fact, bfq_weights_tree_remove does nothing -+ * if !entity->weight_counter. -+ */ -+ if (unlikely(!entity->weight_counter)) -+ return; -+ -+ entity->weight_counter->weight = entity->weight; -+ rb_link_node(&entity->weight_counter->weights_node, parent, new); -+ rb_insert_color(&entity->weight_counter->weights_node, root); -+ -+inc_counter: -+ entity->weight_counter->num_active++; -+} -+ -+/* -+ * Decrement the weight counter associated with the entity, and, if the -+ * counter reaches 0, remove the counter from the tree. -+ * See the comments to the function bfq_weights_tree_add() for considerations -+ * about overhead. -+ */ -+static void bfq_weights_tree_remove(struct bfq_data *bfqd, -+ struct bfq_entity *entity, -+ struct rb_root *root) -+{ -+ if (!entity->weight_counter) -+ return; -+ -+ BUG_ON(RB_EMPTY_ROOT(root)); -+ BUG_ON(entity->weight_counter->weight != entity->weight); -+ -+ BUG_ON(!entity->weight_counter->num_active); -+ entity->weight_counter->num_active--; -+ if (entity->weight_counter->num_active > 0) -+ goto reset_entity_pointer; -+ -+ rb_erase(&entity->weight_counter->weights_node, root); -+ kfree(entity->weight_counter); -+ -+reset_entity_pointer: -+ entity->weight_counter = NULL; -+} -+ -+/* -+ * Return expired entry, or NULL to just start from scratch in rbtree. 
-+ */ -+static struct request *bfq_check_fifo(struct bfq_queue *bfqq, -+ struct request *last) -+{ -+ struct request *rq; -+ -+ if (bfq_bfqq_fifo_expire(bfqq)) -+ return NULL; -+ -+ bfq_mark_bfqq_fifo_expire(bfqq); -+ -+ rq = rq_entry_fifo(bfqq->fifo.next); -+ -+ if (rq == last || ktime_get_ns() < rq->fifo_time) -+ return NULL; -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq); -+ BUG_ON(RB_EMPTY_NODE(&rq->rb_node)); -+ return rq; -+} -+ -+static struct request *bfq_find_next_rq(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ struct request *last) -+{ -+ struct rb_node *rbnext = rb_next(&last->rb_node); -+ struct rb_node *rbprev = rb_prev(&last->rb_node); -+ struct request *next, *prev = NULL; -+ -+ BUG_ON(list_empty(&bfqq->fifo)); -+ -+ /* Follow expired path, else get first next available. */ -+ next = bfq_check_fifo(bfqq, last); -+ if (next) { -+ BUG_ON(next == last); -+ return next; -+ } -+ -+ BUG_ON(RB_EMPTY_NODE(&last->rb_node)); -+ -+ if (rbprev) -+ prev = rb_entry_rq(rbprev); -+ -+ if (rbnext) -+ next = rb_entry_rq(rbnext); -+ else { -+ rbnext = rb_first(&bfqq->sort_list); -+ if (rbnext && rbnext != &last->rb_node) -+ next = rb_entry_rq(rbnext); -+ } -+ -+ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last)); -+} -+ -+/* see the definition of bfq_async_charge_factor for details */ -+static unsigned long bfq_serv_to_charge(struct request *rq, -+ struct bfq_queue *bfqq) -+{ -+ if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1) -+ return blk_rq_sectors(rq); -+ -+ /* -+ * If there are no weight-raised queues, then amplify service -+ * by just the async charge factor; otherwise amplify service -+ * by twice the async charge factor, to further reduce latency -+ * for weight-raised queues. -+ */ -+ if (bfqq->bfqd->wr_busy_queues == 0) -+ return blk_rq_sectors(rq) * bfq_async_charge_factor; -+ -+ return blk_rq_sectors(rq) * 2 * bfq_async_charge_factor; -+} -+ -+/** -+ * bfq_updated_next_req - update the queue after a new next_rq selection. 
-+ * @bfqd: the device data the queue belongs to. -+ * @bfqq: the queue to update. -+ * -+ * If the first request of a queue changes we make sure that the queue -+ * has enough budget to serve at least its first request (if the -+ * request has grown). We do this because if the queue has not enough -+ * budget for its first request, it has to go through two dispatch -+ * rounds to actually get it dispatched. -+ */ -+static void bfq_updated_next_req(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity); -+ struct request *next_rq = bfqq->next_rq; -+ unsigned long new_budget; -+ -+ if (!next_rq) -+ return; -+ -+ if (bfqq == bfqd->in_service_queue) -+ /* -+ * In order not to break guarantees, budgets cannot be -+ * changed after an entity has been selected. -+ */ -+ return; -+ -+ BUG_ON(entity->tree != &st->active); -+ BUG_ON(entity == entity->sched_data->in_service_entity); -+ -+ new_budget = max_t(unsigned long, bfqq->max_budget, -+ bfq_serv_to_charge(next_rq, bfqq)); -+ if (entity->budget != new_budget) { -+ entity->budget = new_budget; -+ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", -+ new_budget); -+ bfq_requeue_bfqq(bfqd, bfqq); -+ } -+} -+ -+static unsigned int bfq_wr_duration(struct bfq_data *bfqd) -+{ -+ u64 dur; -+ -+ if (bfqd->bfq_wr_max_time > 0) -+ return bfqd->bfq_wr_max_time; -+ -+ dur = bfqd->RT_prod; -+ do_div(dur, bfqd->peak_rate); -+ -+ /* -+ * Limit duration between 3 and 13 seconds. Tests show that -+ * higher values than 13 seconds often yield the opposite of -+ * the desired result, i.e., worsen responsiveness by letting -+ * non-interactive and non-soft-real-time applications -+ * preserve weight raising for a too long time interval. -+ * -+ * On the other end, lower values than 3 seconds make it -+ * difficult for most interactive tasks to complete their jobs -+ * before weight-raising finishes. 
-+ */ -+ if (dur > msecs_to_jiffies(13000)) -+ dur = msecs_to_jiffies(13000); -+ else if (dur < msecs_to_jiffies(3000)) -+ dur = msecs_to_jiffies(3000); -+ -+ return dur; -+} -+ -+static void -+bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, -+ struct bfq_io_cq *bic, bool bfq_already_existing) -+{ -+ unsigned int old_wr_coeff; -+ bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq); -+ -+ if (bic->saved_has_short_ttime) -+ bfq_mark_bfqq_has_short_ttime(bfqq); -+ else -+ bfq_clear_bfqq_has_short_ttime(bfqq); -+ -+ if (bic->saved_IO_bound) -+ bfq_mark_bfqq_IO_bound(bfqq); -+ else -+ bfq_clear_bfqq_IO_bound(bfqq); -+ -+ if (unlikely(busy)) -+ old_wr_coeff = bfqq->wr_coeff; -+ -+ bfqq->wr_coeff = bic->saved_wr_coeff; -+ bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; -+ BUG_ON(time_is_after_jiffies(bfqq->wr_start_at_switch_to_srt)); -+ bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish; -+ bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time; -+ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); -+ -+ if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || -+ time_is_before_jiffies(bfqq->last_wr_start_finish + -+ bfqq->wr_cur_max_time))) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "resume state: switching off wr (%lu + %lu < %lu)", -+ bfqq->last_wr_start_finish, bfqq->wr_cur_max_time, -+ jiffies); -+ -+ bfqq->wr_coeff = 1; -+ } -+ -+ /* make sure weight will be updated, however we got here */ -+ bfqq->entity.prio_changed = 1; -+ -+ if (likely(!busy)) -+ return; -+ -+ if (old_wr_coeff == 1 && bfqq->wr_coeff > 1) { -+ bfqd->wr_busy_queues++; -+ BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues); -+ } else if (old_wr_coeff > 1 && bfqq->wr_coeff == 1) { -+ bfqd->wr_busy_queues--; -+ BUG_ON(bfqd->wr_busy_queues < 0); -+ } -+} -+ -+static int bfqq_process_refs(struct bfq_queue *bfqq) -+{ -+ int process_refs, io_refs; -+ -+ lockdep_assert_held(bfqq->bfqd->queue->queue_lock); -+ -+ io_refs = bfqq->allocated[READ] + 
bfqq->allocated[WRITE]; -+ process_refs = bfqq->ref - io_refs - bfqq->entity.on_st; -+ BUG_ON(process_refs < 0); -+ return process_refs; -+} -+ -+/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */ -+static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct bfq_queue *item; -+ struct hlist_node *n; -+ -+ hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node) -+ hlist_del_init(&item->burst_list_node); -+ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); -+ bfqd->burst_size = 1; -+ bfqd->burst_parent_entity = bfqq->entity.parent; -+} -+ -+/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */ -+static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ /* Increment burst size to take into account also bfqq */ -+ bfqd->burst_size++; -+ -+ bfq_log_bfqq(bfqd, bfqq, "add_to_burst %d", bfqd->burst_size); -+ -+ BUG_ON(bfqd->burst_size > bfqd->bfq_large_burst_thresh); -+ -+ if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) { -+ struct bfq_queue *pos, *bfqq_item; -+ struct hlist_node *n; -+ -+ /* -+ * Enough queues have been activated shortly after each -+ * other to consider this burst as large. -+ */ -+ bfqd->large_burst = true; -+ bfq_log_bfqq(bfqd, bfqq, "add_to_burst: large burst started"); -+ -+ /* -+ * We can now mark all queues in the burst list as -+ * belonging to a large burst. -+ */ -+ hlist_for_each_entry(bfqq_item, &bfqd->burst_list, -+ burst_list_node) { -+ bfq_mark_bfqq_in_large_burst(bfqq_item); -+ bfq_log_bfqq(bfqd, bfqq_item, "marked in large burst"); -+ } -+ bfq_mark_bfqq_in_large_burst(bfqq); -+ bfq_log_bfqq(bfqd, bfqq, "marked in large burst"); -+ -+ /* -+ * From now on, and until the current burst finishes, any -+ * new queue being activated shortly after the last queue -+ * was inserted in the burst can be immediately marked as -+ * belonging to a large burst. So the burst list is not -+ * needed any more. Remove it. 
-+ */ -+ hlist_for_each_entry_safe(pos, n, &bfqd->burst_list, -+ burst_list_node) -+ hlist_del_init(&pos->burst_list_node); -+ } else /* -+ * Burst not yet large: add bfqq to the burst list. Do -+ * not increment the ref counter for bfqq, because bfqq -+ * is removed from the burst list before freeing bfqq -+ * in put_queue. -+ */ -+ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); -+} -+ -+/* -+ * If many queues belonging to the same group happen to be created -+ * shortly after each other, then the processes associated with these -+ * queues have typically a common goal. In particular, bursts of queue -+ * creations are usually caused by services or applications that spawn -+ * many parallel threads/processes. Examples are systemd during boot, -+ * or git grep. To help these processes get their job done as soon as -+ * possible, it is usually better to not grant either weight-raising -+ * or device idling to their queues. -+ * -+ * In this comment we describe, firstly, the reasons why this fact -+ * holds, and, secondly, the next function, which implements the main -+ * steps needed to properly mark these queues so that they can then be -+ * treated in a different way. -+ * -+ * The above services or applications benefit mostly from a high -+ * throughput: the quicker the requests of the activated queues are -+ * cumulatively served, the sooner the target job of these queues gets -+ * completed. As a consequence, weight-raising any of these queues, -+ * which also implies idling the device for it, is almost always -+ * counterproductive. In most cases it just lowers throughput. -+ * -+ * On the other hand, a burst of queue creations may be caused also by -+ * the start of an application that does not consist of a lot of -+ * parallel I/O-bound threads. In fact, with a complex application, -+ * several short processes may need to be executed to start-up the -+ * application. 
In this respect, to start an application as quickly as -+ * possible, the best thing to do is in any case to privilege the I/O -+ * related to the application with respect to all other -+ * I/O. Therefore, the best strategy to start as quickly as possible -+ * an application that causes a burst of queue creations is to -+ * weight-raise all the queues created during the burst. This is the -+ * exact opposite of the best strategy for the other type of bursts. -+ * -+ * In the end, to take the best action for each of the two cases, the -+ * two types of bursts need to be distinguished. Fortunately, this -+ * seems relatively easy, by looking at the sizes of the bursts. In -+ * particular, we found a threshold such that only bursts with a -+ * larger size than that threshold are apparently caused by -+ * services or commands such as systemd or git grep. For brevity, -+ * hereafter we call just 'large' these bursts. BFQ *does not* -+ * weight-raise queues whose creation occurs in a large burst. In -+ * addition, for each of these queues BFQ performs or does not perform -+ * idling depending on which choice boosts the throughput more. The -+ * exact choice depends on the device and request pattern at -+ * hand. -+ * -+ * Unfortunately, false positives may occur while an interactive task -+ * is starting (e.g., an application is being started). The -+ * consequence is that the queues associated with the task do not -+ * enjoy weight raising as expected. Fortunately these false positives -+ * are very rare. They typically occur if some service happens to -+ * start doing I/O exactly when the interactive task starts. -+ * -+ * Turning back to the next function, it implements all the steps -+ * needed to detect the occurrence of a large burst and to properly -+ * mark all the queues belonging to it (so that they can then be -+ * treated in a different way). 
This goal is achieved by maintaining a -+ * "burst list" that holds, temporarily, the queues that belong to the -+ * burst in progress. The list is then used to mark these queues as -+ * belonging to a large burst if the burst does become large. The main -+ * steps are the following. -+ * -+ * . when the very first queue is created, the queue is inserted into the -+ * list (as it could be the first queue in a possible burst) -+ * -+ * . if the current burst has not yet become large, and a queue Q that does -+ * not yet belong to the burst is activated shortly after the last time -+ * at which a new queue entered the burst list, then the function appends -+ * Q to the burst list -+ * -+ * . if, as a consequence of the previous step, the burst size reaches -+ * the large-burst threshold, then -+ * -+ * . all the queues in the burst list are marked as belonging to a -+ * large burst -+ * -+ * . the burst list is deleted; in fact, the burst list already served -+ * its purpose (keeping temporarily track of the queues in a burst, -+ * so as to be able to mark them as belonging to a large burst in the -+ * previous sub-step), and now is not needed any more -+ * -+ * . the device enters a large-burst mode -+ * -+ * . if a queue Q that does not belong to the burst is created while -+ * the device is in large-burst mode and shortly after the last time -+ * at which a queue either entered the burst list or was marked as -+ * belonging to the current large burst, then Q is immediately marked -+ * as belonging to a large burst. -+ * -+ * . if a queue Q that does not belong to the burst is created a while -+ * later, i.e., not shortly after, than the last time at which a queue -+ * either entered the burst list or was marked as belonging to the -+ * current large burst, then the current burst is deemed as finished and: -+ * -+ * . the large-burst mode is reset if set -+ * -+ * . the burst list is emptied -+ * -+ * . 
Q is inserted in the burst list, as Q may be the first queue -+ * in a possible new burst (then the burst list contains just Q -+ * after this step). -+ */ -+static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ /* -+ * If bfqq is already in the burst list or is part of a large -+ * burst, or finally has just been split, then there is -+ * nothing else to do. -+ */ -+ if (!hlist_unhashed(&bfqq->burst_list_node) || -+ bfq_bfqq_in_large_burst(bfqq) || -+ time_is_after_eq_jiffies(bfqq->split_time + -+ msecs_to_jiffies(10))) -+ return; -+ -+ /* -+ * If bfqq's creation happens late enough, or bfqq belongs to -+ * a different group than the burst group, then the current -+ * burst is finished, and related data structures must be -+ * reset. -+ * -+ * In this respect, consider the special case where bfqq is -+ * the very first queue created after BFQ is selected for this -+ * device. In this case, last_ins_in_burst and -+ * burst_parent_entity are not yet significant when we get -+ * here. But it is easy to verify that, whether or not the -+ * following condition is true, bfqq will end up being -+ * inserted into the burst list. In particular the list will -+ * happen to contain only bfqq. And this is exactly what has -+ * to happen, as bfqq may be the first queue of the first -+ * burst. -+ */ -+ if (time_is_before_jiffies(bfqd->last_ins_in_burst + -+ bfqd->bfq_burst_interval) || -+ bfqq->entity.parent != bfqd->burst_parent_entity) { -+ bfqd->large_burst = false; -+ bfq_reset_burst_list(bfqd, bfqq); -+ bfq_log_bfqq(bfqd, bfqq, -+ "handle_burst: late activation or different group"); -+ goto end; -+ } -+ -+ /* -+ * If we get here, then bfqq is being activated shortly after the -+ * last queue. So, if the current burst is also large, we can mark -+ * bfqq as belonging to this large burst immediately. 
-+ */ -+ if (bfqd->large_burst) { -+ bfq_log_bfqq(bfqd, bfqq, "handle_burst: marked in burst"); -+ bfq_mark_bfqq_in_large_burst(bfqq); -+ goto end; -+ } -+ -+ /* -+ * If we get here, then a large-burst state has not yet been -+ * reached, but bfqq is being activated shortly after the last -+ * queue. Then we add bfqq to the burst. -+ */ -+ bfq_add_to_burst(bfqd, bfqq); -+end: -+ /* -+ * At this point, bfqq either has been added to the current -+ * burst or has caused the current burst to terminate and a -+ * possible new burst to start. In particular, in the second -+ * case, bfqq has become the first queue in the possible new -+ * burst. In both cases last_ins_in_burst needs to be moved -+ * forward. -+ */ -+ bfqd->last_ins_in_burst = jiffies; -+ -+} -+ -+static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ return entity->budget - entity->service; -+} -+ -+/* -+ * If enough samples have been computed, return the current max budget -+ * stored in bfqd, which is dynamically updated according to the -+ * estimated disk peak rate; otherwise return the default max budget -+ */ -+static int bfq_max_budget(struct bfq_data *bfqd) -+{ -+ if (bfqd->budgets_assigned < bfq_stats_min_budgets) -+ return bfq_default_max_budget; -+ else -+ return bfqd->bfq_max_budget; -+} -+ -+/* -+ * Return min budget, which is a fraction of the current or default -+ * max budget (trying with 1/32) -+ */ -+static int bfq_min_budget(struct bfq_data *bfqd) -+{ -+ if (bfqd->budgets_assigned < bfq_stats_min_budgets) -+ return bfq_default_max_budget / 32; -+ else -+ return bfqd->bfq_max_budget / 32; -+} -+ -+static void bfq_bfqq_expire(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ bool compensate, -+ enum bfqq_expiration reason); -+ -+/* -+ * The next function, invoked after the input queue bfqq switches from -+ * idle to busy, updates the budget of bfqq. 
The function also tells -+ * whether the in-service queue should be expired, by returning -+ * true. The purpose of expiring the in-service queue is to give bfqq -+ * the chance to possibly preempt the in-service queue, and the reason -+ * for preempting the in-service queue is to achieve one of the two -+ * goals below. -+ * -+ * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has -+ * expired because it has remained idle. In particular, bfqq may have -+ * expired for one of the following two reasons: -+ * -+ * - BFQ_BFQQ_NO_MORE_REQUEST bfqq did not enjoy any device idling and -+ * did not make it to issue a new request before its last request -+ * was served; -+ * -+ * - BFQ_BFQQ_TOO_IDLE bfqq did enjoy device idling, but did not issue -+ * a new request before the expiration of the idling-time. -+ * -+ * Even if bfqq has expired for one of the above reasons, the process -+ * associated with the queue may be however issuing requests greedily, -+ * and thus be sensitive to the bandwidth it receives (bfqq may have -+ * remained idle for other reasons: CPU high load, bfqq not enjoying -+ * idling, I/O throttling somewhere in the path from the process to -+ * the I/O scheduler, ...). But if, after every expiration for one of -+ * the above two reasons, bfqq has to wait for the service of at least -+ * one full budget of another queue before being served again, then -+ * bfqq is likely to get a much lower bandwidth or resource time than -+ * its reserved ones. To address this issue, two countermeasures need -+ * to be taken. -+ * -+ * First, the budget and the timestamps of bfqq need to be updated in -+ * a special way on bfqq reactivation: they need to be updated as if -+ * bfqq did not remain idle and did not expire. 
In fact, if they are -+ * computed as if bfqq expired and remained idle until reactivation, -+ * then the process associated with bfqq is treated as if, instead of -+ * being greedy, it stopped issuing requests when bfqq remained idle, -+ * and restarts issuing requests only on this reactivation. In other -+ * words, the scheduler does not help the process recover the "service -+ * hole" between bfqq expiration and reactivation. As a consequence, -+ * the process receives a lower bandwidth than its reserved one. In -+ * contrast, to recover this hole, the budget must be updated as if -+ * bfqq was not expired at all before this reactivation, i.e., it must -+ * be set to the value of the remaining budget when bfqq was -+ * expired. Along the same line, timestamps need to be assigned the -+ * value they had the last time bfqq was selected for service, i.e., -+ * before last expiration. Thus timestamps need to be back-shifted -+ * with respect to their normal computation (see [1] for more details -+ * on this tricky aspect). -+ * -+ * Secondly, to allow the process to recover the hole, the in-service -+ * queue must be expired too, to give bfqq the chance to preempt it -+ * immediately. In fact, if bfqq has to wait for a full budget of the -+ * in-service queue to be completed, then it may become impossible to -+ * let the process recover the hole, even if the back-shifted -+ * timestamps of bfqq are lower than those of the in-service queue. If -+ * this happens for most or all of the holes, then the process may not -+ * receive its reserved bandwidth. In this respect, it is worth noting -+ * that, being the service of outstanding requests unpreemptible, a -+ * little fraction of the holes may however be unrecoverable, thereby -+ * causing a little loss of bandwidth. -+ * -+ * The last important point is detecting whether bfqq does need this -+ * bandwidth recovery. 
In this respect, the next function deems the -+ * process associated with bfqq greedy, and thus allows it to recover -+ * the hole, if: 1) the process is waiting for the arrival of a new -+ * request (which implies that bfqq expired for one of the above two -+ * reasons), and 2) such a request has arrived soon. The first -+ * condition is controlled through the flag non_blocking_wait_rq, -+ * while the second through the flag arrived_in_time. If both -+ * conditions hold, then the function computes the budget in the -+ * above-described special way, and signals that the in-service queue -+ * should be expired. Timestamp back-shifting is done later in -+ * __bfq_activate_entity. -+ * -+ * 2. Reduce latency. Even if timestamps are not backshifted to let -+ * the process associated with bfqq recover a service hole, bfqq may -+ * however happen to have, after being (re)activated, a lower finish -+ * timestamp than the in-service queue. That is, the next budget of -+ * bfqq may have to be completed before the one of the in-service -+ * queue. If this is the case, then preempting the in-service queue -+ * allows this goal to be achieved, apart from the unpreemptible, -+ * outstanding requests mentioned above. -+ * -+ * Unfortunately, regardless of which of the above two goals one wants -+ * to achieve, service trees need first to be updated to know whether -+ * the in-service queue must be preempted. To have service trees -+ * correctly updated, the in-service queue must be expired and -+ * rescheduled, and bfqq must be scheduled too. This is one of the -+ * most costly operations (in future versions, the scheduling -+ * mechanism may be re-designed in such a way to make it possible to -+ * know whether preemption is needed without needing to update service -+ * trees). In addition, queue preemptions almost always cause random -+ * I/O, and thus loss of throughput. 
Because of these facts, the next -+ * function adopts the following simple scheme to avoid both costly -+ * operations and too frequent preemptions: it requests the expiration -+ * of the in-service queue (unconditionally) only for queues that need -+ * to recover a hole, or that either are weight-raised or deserve to -+ * be weight-raised. -+ */ -+static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ bool arrived_in_time, -+ bool wr_or_deserves_wr) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) { -+ /* -+ * We do not clear the flag non_blocking_wait_rq here, as -+ * the latter is used in bfq_activate_bfqq to signal -+ * that timestamps need to be back-shifted (and is -+ * cleared right after). -+ */ -+ -+ /* -+ * In next assignment we rely on that either -+ * entity->service or entity->budget are not updated -+ * on expiration if bfqq is empty (see -+ * __bfq_bfqq_recalc_budget). Thus both quantities -+ * remain unchanged after such an expiration, and the -+ * following statement therefore assigns to -+ * entity->budget the remaining budget on such an -+ * expiration. For clarity, entity->service is not -+ * updated on expiration in any case, and, in normal -+ * operation, is reset only when bfqq is selected for -+ * service (see bfq_get_next_queue). 
-+ */ -+ BUG_ON(bfqq->max_budget < 0); -+ entity->budget = min_t(unsigned long, -+ bfq_bfqq_budget_left(bfqq), -+ bfqq->max_budget); -+ -+ BUG_ON(entity->budget < 0); -+ return true; -+ } -+ -+ BUG_ON(bfqq->max_budget < 0); -+ entity->budget = max_t(unsigned long, bfqq->max_budget, -+ bfq_serv_to_charge(bfqq->next_rq, bfqq)); -+ BUG_ON(entity->budget < 0); -+ -+ bfq_clear_bfqq_non_blocking_wait_rq(bfqq); -+ return wr_or_deserves_wr; -+} -+ -+static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ unsigned int old_wr_coeff, -+ bool wr_or_deserves_wr, -+ bool interactive, -+ bool in_burst, -+ bool soft_rt) -+{ -+ if (old_wr_coeff == 1 && wr_or_deserves_wr) { -+ /* start a weight-raising period */ -+ if (interactive) { -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ } else { -+ bfqq->wr_start_at_switch_to_srt = jiffies; -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff * -+ BFQ_SOFTRT_WEIGHT_FACTOR; -+ bfqq->wr_cur_max_time = -+ bfqd->bfq_wr_rt_max_time; -+ } -+ /* -+ * If needed, further reduce budget to make sure it is -+ * close to bfqq's backlog, so as to reduce the -+ * scheduling-error component due to a too large -+ * budget. Do not care about throughput consequences, -+ * but only about latency. Finally, do not assign a -+ * too small budget either, to avoid increasing -+ * latency by causing too frequent expirations. 
-+ */ -+ bfqq->entity.budget = min_t(unsigned long, -+ bfqq->entity.budget, -+ 2 * bfq_min_budget(bfqd)); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "wrais starting at %lu, rais_max_time %u", -+ jiffies, -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } else if (old_wr_coeff > 1) { -+ if (interactive) { /* update wr coeff and duration */ -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ } else if (in_burst) { -+ bfqq->wr_coeff = 1; -+ bfq_log_bfqq(bfqd, bfqq, -+ "wrais ending at %lu, rais_max_time %u", -+ jiffies, -+ jiffies_to_msecs(bfqq-> -+ wr_cur_max_time)); -+ } else if (soft_rt) { -+ /* -+ * The application is now or still meeting the -+ * requirements for being deemed soft rt. We -+ * can then correctly and safely (re)charge -+ * the weight-raising duration for the -+ * application with the weight-raising -+ * duration for soft rt applications. -+ * -+ * In particular, doing this recharge now, i.e., -+ * before the weight-raising period for the -+ * application finishes, reduces the probability -+ * of the following negative scenario: -+ * 1) the weight of a soft rt application is -+ * raised at startup (as for any newly -+ * created application), -+ * 2) since the application is not interactive, -+ * at a certain time weight-raising is -+ * stopped for the application, -+ * 3) at that time the application happens to -+ * still have pending requests, and hence -+ * is destined to not have a chance to be -+ * deemed soft rt before these requests are -+ * completed (see the comments to the -+ * function bfq_bfqq_softrt_next_start() -+ * for details on soft rt detection), -+ * 4) these pending requests experience a high -+ * latency because the application is not -+ * weight-raised while they are pending. 
-+ */ -+ if (bfqq->wr_cur_max_time != -+ bfqd->bfq_wr_rt_max_time) { -+ bfqq->wr_start_at_switch_to_srt = -+ bfqq->last_wr_start_finish; -+ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); -+ -+ bfqq->wr_cur_max_time = -+ bfqd->bfq_wr_rt_max_time; -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff * -+ BFQ_SOFTRT_WEIGHT_FACTOR; -+ bfq_log_bfqq(bfqd, bfqq, -+ "switching to soft_rt wr"); -+ } else -+ bfq_log_bfqq(bfqd, bfqq, -+ "moving forward soft_rt wr duration"); -+ bfqq->last_wr_start_finish = jiffies; -+ } -+ } -+} -+ -+static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ return bfqq->dispatched == 0 && -+ time_is_before_jiffies( -+ bfqq->budget_timeout + -+ bfqd->bfq_wr_min_idle_time); -+} -+ -+static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ int old_wr_coeff, -+ struct request *rq, -+ bool *interactive) -+{ -+ bool soft_rt, in_burst, wr_or_deserves_wr, -+ bfqq_wants_to_preempt, -+ idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq), -+ /* -+ * See the comments on -+ * bfq_bfqq_update_budg_for_activation for -+ * details on the usage of the next variable. 
-+ */ -+ arrived_in_time = ktime_get_ns() <= -+ RQ_BIC(rq)->ttime.last_end_request + -+ bfqd->bfq_slice_idle * 3; -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "bfq_add_request non-busy: " -+ "jiffies %lu, in_time %d, idle_long %d busyw %d " -+ "wr_coeff %u", -+ jiffies, arrived_in_time, -+ idle_for_long_time, -+ bfq_bfqq_non_blocking_wait_rq(bfqq), -+ old_wr_coeff); -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ -+ BUG_ON(bfqq == bfqd->in_service_queue); -+ bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, rq->cmd_flags); -+ -+ /* -+ * bfqq deserves to be weight-raised if: -+ * - it is sync, -+ * - it does not belong to a large burst, -+ * - it has been idle for enough time or is soft real-time, -+ * - is linked to a bfq_io_cq (it is not shared in any sense) -+ */ -+ in_burst = bfq_bfqq_in_large_burst(bfqq); -+ soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && -+ !in_burst && -+ time_is_before_jiffies(bfqq->soft_rt_next_start); -+ *interactive = -+ !in_burst && -+ idle_for_long_time; -+ wr_or_deserves_wr = bfqd->low_latency && -+ (bfqq->wr_coeff > 1 || -+ (bfq_bfqq_sync(bfqq) && -+ bfqq->bic && (*interactive || soft_rt))); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "bfq_add_request: " -+ "in_burst %d, " -+ "soft_rt %d (next %lu), inter %d, bic %p", -+ bfq_bfqq_in_large_burst(bfqq), soft_rt, -+ bfqq->soft_rt_next_start, -+ *interactive, -+ bfqq->bic); -+ -+ /* -+ * Using the last flag, update budget and check whether bfqq -+ * may want to preempt the in-service queue. -+ */ -+ bfqq_wants_to_preempt = -+ bfq_bfqq_update_budg_for_activation(bfqd, bfqq, -+ arrived_in_time, -+ wr_or_deserves_wr); -+ -+ /* -+ * If bfqq happened to be activated in a burst, but has been -+ * idle for much more than an interactive queue, then we -+ * assume that, in the overall I/O initiated in the burst, the -+ * I/O associated with bfqq is finished. So bfqq does not need -+ * to be treated as a queue belonging to a burst -+ * anymore. 
Accordingly, we reset bfqq's in_large_burst flag -+ * if set, and remove bfqq from the burst list if it's -+ * there. We do not decrement burst_size, because the fact -+ * that bfqq does not need to belong to the burst list any -+ * more does not invalidate the fact that bfqq was created in -+ * a burst. -+ */ -+ if (likely(!bfq_bfqq_just_created(bfqq)) && -+ idle_for_long_time && -+ time_is_before_jiffies( -+ bfqq->budget_timeout + -+ msecs_to_jiffies(10000))) { -+ hlist_del_init(&bfqq->burst_list_node); -+ bfq_clear_bfqq_in_large_burst(bfqq); -+ } -+ -+ bfq_clear_bfqq_just_created(bfqq); -+ -+ if (!bfq_bfqq_IO_bound(bfqq)) { -+ if (arrived_in_time) { -+ bfqq->requests_within_timer++; -+ if (bfqq->requests_within_timer >= -+ bfqd->bfq_requests_within_timer) -+ bfq_mark_bfqq_IO_bound(bfqq); -+ } else -+ bfqq->requests_within_timer = 0; -+ bfq_log_bfqq(bfqd, bfqq, "requests in time %d", -+ bfqq->requests_within_timer); -+ } -+ -+ if (bfqd->low_latency) { -+ if (unlikely(time_is_after_jiffies(bfqq->split_time))) -+ /* wraparound */ -+ bfqq->split_time = -+ jiffies - bfqd->bfq_wr_min_idle_time - 1; -+ -+ if (time_is_before_jiffies(bfqq->split_time + -+ bfqd->bfq_wr_min_idle_time)) { -+ bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq, -+ old_wr_coeff, -+ wr_or_deserves_wr, -+ *interactive, -+ in_burst, -+ soft_rt); -+ -+ if (old_wr_coeff != bfqq->wr_coeff) -+ bfqq->entity.prio_changed = 1; -+ } -+ } -+ -+ bfqq->last_idle_bklogged = jiffies; -+ bfqq->service_from_backlogged = 0; -+ bfq_clear_bfqq_softrt_update(bfqq); -+ -+ bfq_add_bfqq_busy(bfqd, bfqq); -+ -+ /* -+ * Expire in-service queue only if preemption may be needed -+ * for guarantees. In this respect, the function -+ * next_queue_may_preempt just checks a simple, necessary -+ * condition, and not a sufficient condition based on -+ * timestamps. 
In fact, for the latter condition to be -+ * evaluated, timestamps would need first to be updated, and -+ * this operation is quite costly (see the comments on the -+ * function bfq_bfqq_update_budg_for_activation). -+ */ -+ if (bfqd->in_service_queue && bfqq_wants_to_preempt && -+ bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff && -+ next_queue_may_preempt(bfqd)) { -+ struct bfq_queue *in_serv = -+ bfqd->in_service_queue; -+ BUG_ON(in_serv == bfqq); -+ -+ bfq_bfqq_expire(bfqd, bfqd->in_service_queue, -+ false, BFQ_BFQQ_PREEMPTED); -+ } -+} -+ -+static void bfq_add_request(struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ struct bfq_data *bfqd = bfqq->bfqd; -+ struct request *next_rq, *prev; -+ unsigned int old_wr_coeff = bfqq->wr_coeff; -+ bool interactive = false; -+ -+ bfq_log_bfqq(bfqd, bfqq, "add_request: size %u %s", -+ blk_rq_sectors(rq), rq_is_sync(rq) ? "S" : "A"); -+ -+ if (bfqq->wr_coeff > 1) /* queue is being weight-raised */ -+ bfq_log_bfqq(bfqd, bfqq, -+ "raising period dur %u/%u msec, old coeff %u, w %d(%d)", -+ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), -+ jiffies_to_msecs(bfqq->wr_cur_max_time), -+ bfqq->wr_coeff, -+ bfqq->entity.weight, bfqq->entity.orig_weight); -+ -+ bfqq->queued[rq_is_sync(rq)]++; -+ bfqd->queued++; -+ -+ elv_rb_add(&bfqq->sort_list, rq); -+ -+ /* -+ * Check if this request is a better next-to-serve candidate. -+ */ -+ prev = bfqq->next_rq; -+ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); -+ BUG_ON(!next_rq); -+ bfqq->next_rq = next_rq; -+ -+ /* -+ * Adjust priority tree position, if next_rq changes. -+ */ -+ if (prev != bfqq->next_rq) -+ bfq_pos_tree_add_move(bfqd, bfqq); -+ -+ if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... 
*/ -+ bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff, -+ rq, &interactive); -+ else { -+ if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) && -+ time_is_before_jiffies( -+ bfqq->last_wr_start_finish + -+ bfqd->bfq_wr_min_inter_arr_async)) { -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ -+ bfqd->wr_busy_queues++; -+ BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues); -+ bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqd, bfqq, -+ "non-idle wrais starting, " -+ "wr_max_time %u wr_busy %d", -+ jiffies_to_msecs(bfqq->wr_cur_max_time), -+ bfqd->wr_busy_queues); -+ } -+ if (prev != bfqq->next_rq) -+ bfq_updated_next_req(bfqd, bfqq); -+ } -+ -+ /* -+ * Assign jiffies to last_wr_start_finish in the following -+ * cases: -+ * -+ * . if bfqq is not going to be weight-raised, because, for -+ * non weight-raised queues, last_wr_start_finish stores the -+ * arrival time of the last request; as of now, this piece -+ * of information is used only for deciding whether to -+ * weight-raise async queues -+ * -+ * . if bfqq is not weight-raised, because, if bfqq is now -+ * switching to weight-raised, then last_wr_start_finish -+ * stores the time when weight-raising starts -+ * -+ * . if bfqq is interactive, because, regardless of whether -+ * bfqq is currently weight-raised, the weight-raising -+ * period must start or restart (this case is considered -+ * separately because it is not detected by the above -+ * conditions, if bfqq is already weight-raised) -+ * -+ * last_wr_start_finish has to be updated also if bfqq is soft -+ * real-time, because the weight-raising period is constantly -+ * restarted on idle-to-busy transitions for these queues, but -+ * this is already done in bfq_bfqq_handle_idle_busy_switch if -+ * needed. 
-+ */ -+ if (bfqd->low_latency && -+ (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive)) -+ bfqq->last_wr_start_finish = jiffies; -+} -+ -+static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, -+ struct bio *bio) -+{ -+ struct task_struct *tsk = current; -+ struct bfq_io_cq *bic; -+ struct bfq_queue *bfqq; -+ -+ bic = bfq_bic_lookup(bfqd, tsk->io_context); -+ if (!bic) -+ return NULL; -+ -+ bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); -+ if (bfqq) -+ return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); -+ -+ return NULL; -+} -+ -+static sector_t get_sdist(sector_t last_pos, struct request *rq) -+{ -+ sector_t sdist = 0; -+ -+ if (last_pos) { -+ if (last_pos < blk_rq_pos(rq)) -+ sdist = blk_rq_pos(rq) - last_pos; -+ else -+ sdist = last_pos - blk_rq_pos(rq); -+ } -+ -+ return sdist; -+} -+ -+static void bfq_activate_request(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ bfqd->rq_in_driver++; -+} -+ -+static void bfq_deactivate_request(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ -+ BUG_ON(bfqd->rq_in_driver == 0); -+ bfqd->rq_in_driver--; -+} -+ -+static void bfq_remove_request(struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ struct bfq_data *bfqd = bfqq->bfqd; -+ const int sync = rq_is_sync(rq); -+ -+ BUG_ON(bfqq->entity.service > bfqq->entity.budget && -+ bfqq == bfqd->in_service_queue); -+ -+ if (bfqq->next_rq == rq) { -+ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); -+ bfq_updated_next_req(bfqd, bfqq); -+ } -+ -+ if (rq->queuelist.prev != &rq->queuelist) -+ list_del_init(&rq->queuelist); -+ BUG_ON(bfqq->queued[sync] == 0); -+ bfqq->queued[sync]--; -+ bfqd->queued--; -+ elv_rb_del(&bfqq->sort_list, rq); -+ -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { -+ bfqq->next_rq = NULL; -+ -+ BUG_ON(bfqq->entity.budget < 0); -+ -+ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) { -+ 
BUG_ON(bfqq->ref < 2); /* referred by rq and on tree */ -+ bfq_del_bfqq_busy(bfqd, bfqq, false); -+ /* -+ * bfqq emptied. In normal operation, when -+ * bfqq is empty, bfqq->entity.service and -+ * bfqq->entity.budget must contain, -+ * respectively, the service received and the -+ * budget used last time bfqq emptied. These -+ * facts do not hold in this case, as at least -+ * this last removal occurred while bfqq is -+ * not in service. To avoid inconsistencies, -+ * reset both bfqq->entity.service and -+ * bfqq->entity.budget, if bfqq has still a -+ * process that may issue I/O requests to it. -+ */ -+ bfqq->entity.budget = bfqq->entity.service = 0; -+ } -+ -+ /* -+ * Remove queue from request-position tree as it is empty. -+ */ -+ if (bfqq->pos_root) { -+ rb_erase(&bfqq->pos_node, bfqq->pos_root); -+ bfqq->pos_root = NULL; -+ } -+ } -+ -+ if (rq->cmd_flags & REQ_META) { -+ BUG_ON(bfqq->meta_pending == 0); -+ bfqq->meta_pending--; -+ } -+ bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); -+} -+ -+static enum elv_merge bfq_merge(struct request_queue *q, struct request **req, -+ struct bio *bio) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct request *__rq; -+ -+ __rq = bfq_find_rq_fmerge(bfqd, bio); -+ if (__rq && elv_bio_merge_ok(__rq, bio)) { -+ *req = __rq; -+ return ELEVATOR_FRONT_MERGE; -+ } -+ -+ return ELEVATOR_NO_MERGE; -+} -+ -+static void bfq_merged_request(struct request_queue *q, struct request *req, -+ enum elv_merge type) -+{ -+ if (type == ELEVATOR_FRONT_MERGE && -+ rb_prev(&req->rb_node) && -+ blk_rq_pos(req) < -+ blk_rq_pos(container_of(rb_prev(&req->rb_node), -+ struct request, rb_node))) { -+ struct bfq_queue *bfqq = RQ_BFQQ(req); -+ struct bfq_data *bfqd = bfqq->bfqd; -+ struct request *prev, *next_rq; -+ -+ /* Reposition request in its sort_list */ -+ elv_rb_del(&bfqq->sort_list, req); -+ elv_rb_add(&bfqq->sort_list, req); -+ /* Choose next request to be served for bfqq */ -+ prev = bfqq->next_rq; -+ next_rq = 
bfq_choose_req(bfqd, bfqq->next_rq, req, -+ bfqd->last_position); -+ BUG_ON(!next_rq); -+ bfqq->next_rq = next_rq; -+ /* -+ * If next_rq changes, update both the queue's budget to -+ * fit the new request and the queue's position in its -+ * rq_pos_tree. -+ */ -+ if (prev != bfqq->next_rq) { -+ bfq_updated_next_req(bfqd, bfqq); -+ bfq_pos_tree_add_move(bfqd, bfqq); -+ } -+ } -+} -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+static void bfq_bio_merged(struct request_queue *q, struct request *req, -+ struct bio *bio) -+{ -+ bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio->bi_opf); -+} -+#endif -+ -+static void bfq_merged_requests(struct request_queue *q, struct request *rq, -+ struct request *next) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next); -+ -+ /* -+ * If next and rq belong to the same bfq_queue and next is older -+ * than rq, then reposition rq in the fifo (by substituting next -+ * with rq). Otherwise, if next and rq belong to different -+ * bfq_queues, never reposition rq: in fact, we would have to -+ * reposition it with respect to next's position in its own fifo, -+ * which would most certainly be too expensive with respect to -+ * the benefits. 
-+ */ -+ if (bfqq == next_bfqq && -+ !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && -+ next->fifo_time < rq->fifo_time) { -+ list_del_init(&rq->queuelist); -+ list_replace_init(&next->queuelist, &rq->queuelist); -+ rq->fifo_time = next->fifo_time; -+ } -+ -+ if (bfqq->next_rq == next) -+ bfqq->next_rq = rq; -+ -+ bfq_remove_request(next); -+ bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); -+} -+ -+/* Must be called with bfqq != NULL */ -+static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) -+{ -+ BUG_ON(!bfqq); -+ -+ if (bfq_bfqq_busy(bfqq)) { -+ bfqq->bfqd->wr_busy_queues--; -+ BUG_ON(bfqq->bfqd->wr_busy_queues < 0); -+ } -+ bfqq->wr_coeff = 1; -+ bfqq->wr_cur_max_time = 0; -+ bfqq->last_wr_start_finish = jiffies; -+ /* -+ * Trigger a weight change on the next invocation of -+ * __bfq_entity_update_weight_prio. -+ */ -+ bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "end_wr: wrais ending at %lu, rais_max_time %u", -+ bfqq->last_wr_start_finish, -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "end_wr: wr_busy %d", -+ bfqq->bfqd->wr_busy_queues); -+} -+ -+static void bfq_end_wr_async_queues(struct bfq_data *bfqd, -+ struct bfq_group *bfqg) -+{ -+ int i, j; -+ -+ for (i = 0; i < 2; i++) -+ for (j = 0; j < IOPRIO_BE_NR; j++) -+ if (bfqg->async_bfqq[i][j]) -+ bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); -+ if (bfqg->async_idle_bfqq) -+ bfq_bfqq_end_wr(bfqg->async_idle_bfqq); -+} -+ -+static void bfq_end_wr(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq; -+ -+ spin_lock_irq(bfqd->queue->queue_lock); -+ -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) -+ bfq_bfqq_end_wr(bfqq); -+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) -+ bfq_bfqq_end_wr(bfqq); -+ bfq_end_wr_async(bfqd); -+ -+ spin_unlock_irq(bfqd->queue->queue_lock); -+} -+ -+static sector_t bfq_io_struct_pos(void *io_struct, bool request) -+{ -+ if (request) -+ return blk_rq_pos(io_struct); -+ else -+ return 
((struct bio *)io_struct)->bi_iter.bi_sector; -+} -+ -+static int bfq_rq_close_to_sector(void *io_struct, bool request, -+ sector_t sector) -+{ -+ return abs(bfq_io_struct_pos(io_struct, request) - sector) <= -+ BFQQ_CLOSE_THR; -+} -+ -+static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ sector_t sector) -+{ -+ struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; -+ struct rb_node *parent, *node; -+ struct bfq_queue *__bfqq; -+ -+ if (RB_EMPTY_ROOT(root)) -+ return NULL; -+ -+ /* -+ * First, if we find a request starting at the end of the last -+ * request, choose it. -+ */ -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL); -+ if (__bfqq) -+ return __bfqq; -+ -+ /* -+ * If the exact sector wasn't found, the parent of the NULL leaf -+ * will contain the closest sector (rq_pos_tree sorted by -+ * next_request position). -+ */ -+ __bfqq = rb_entry(parent, struct bfq_queue, pos_node); -+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) -+ return __bfqq; -+ -+ if (blk_rq_pos(__bfqq->next_rq) < sector) -+ node = rb_next(&__bfqq->pos_node); -+ else -+ node = rb_prev(&__bfqq->pos_node); -+ if (!node) -+ return NULL; -+ -+ __bfqq = rb_entry(node, struct bfq_queue, pos_node); -+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) -+ return __bfqq; -+ -+ return NULL; -+} -+ -+static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd, -+ struct bfq_queue *cur_bfqq, -+ sector_t sector) -+{ -+ struct bfq_queue *bfqq; -+ -+ /* -+ * We shall notice if some of the queues are cooperating, -+ * e.g., working closely on the same area of the device. In -+ * that case, we can group them together and: 1) don't waste -+ * time idling, and 2) serve the union of their requests in -+ * the best possible order for throughput. 
-+ */ -+ bfqq = bfqq_find_close(bfqd, cur_bfqq, sector); -+ if (!bfqq || bfqq == cur_bfqq) -+ return NULL; -+ -+ return bfqq; -+} -+ -+static struct bfq_queue * -+bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) -+{ -+ int process_refs, new_process_refs; -+ struct bfq_queue *__bfqq; -+ -+ /* -+ * If there are no process references on the new_bfqq, then it is -+ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain -+ * may have dropped their last reference (not just their last process -+ * reference). -+ */ -+ if (!bfqq_process_refs(new_bfqq)) -+ return NULL; -+ -+ /* Avoid a circular list and skip interim queue merges. */ -+ while ((__bfqq = new_bfqq->new_bfqq)) { -+ if (__bfqq == bfqq) -+ return NULL; -+ new_bfqq = __bfqq; -+ } -+ -+ process_refs = bfqq_process_refs(bfqq); -+ new_process_refs = bfqq_process_refs(new_bfqq); -+ /* -+ * If the process for the bfqq has gone away, there is no -+ * sense in merging the queues. -+ */ -+ if (process_refs == 0 || new_process_refs == 0) -+ return NULL; -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", -+ new_bfqq->pid); -+ -+ /* -+ * Merging is just a redirection: the requests of the process -+ * owning one of the two queues are redirected to the other queue. -+ * The latter queue, in its turn, is set as shared if this is the -+ * first time that the requests of some process are redirected to -+ * it. -+ * -+ * We redirect bfqq to new_bfqq and not the opposite, because we -+ * are in the context of the process owning bfqq, hence we have -+ * the io_cq of this process. So we can immediately configure this -+ * io_cq to redirect the requests of the process to new_bfqq. -+ * -+ * NOTE, even if new_bfqq coincides with the in-service queue, the -+ * io_cq of new_bfqq is not available, because, if the in-service -+ * queue is shared, bfqd->in_service_bic may not point to the -+ * io_cq of the in-service queue. 
-+ * Redirecting the requests of the process owning bfqq to the -+ * currently in-service queue is in any case the best option, as -+ * we feed the in-service queue with new requests close to the -+ * last request served and, by doing so, hopefully increase the -+ * throughput. -+ */ -+ bfqq->new_bfqq = new_bfqq; -+ new_bfqq->ref += process_refs; -+ return new_bfqq; -+} -+ -+static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, -+ struct bfq_queue *new_bfqq) -+{ -+ if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) || -+ (bfqq->ioprio_class != new_bfqq->ioprio_class)) -+ return false; -+ -+ /* -+ * If either of the queues has already been detected as seeky, -+ * then merging it with the other queue is unlikely to lead to -+ * sequential I/O. -+ */ -+ if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq)) -+ return false; -+ -+ /* -+ * Interleaved I/O is known to be done by (some) applications -+ * only for reads, so it does not make sense to merge async -+ * queues. -+ */ -+ if (!bfq_bfqq_sync(bfqq) || !bfq_bfqq_sync(new_bfqq)) -+ return false; -+ -+ return true; -+} -+ -+/* -+ * If this function returns true, then bfqq cannot be merged. The idea -+ * is that true cooperation happens very early after processes start -+ * to do I/O. Usually, late cooperations are just accidental false -+ * positives. In case bfqq is weight-raised, such false positives -+ * would evidently degrade latency guarantees for bfqq. -+ */ -+static bool wr_from_too_long(struct bfq_queue *bfqq) -+{ -+ return bfqq->wr_coeff > 1 && -+ time_is_before_jiffies(bfqq->last_wr_start_finish + -+ msecs_to_jiffies(100)); -+} -+ -+/* -+ * Attempt to schedule a merge of bfqq with the currently in-service -+ * queue or with a close queue among the scheduled queues. Return -+ * NULL if no merge was scheduled, a pointer to the shared bfq_queue -+ * structure otherwise. 
-+ * -+ * The OOM queue is not allowed to participate to cooperation: in fact, since -+ * the requests temporarily redirected to the OOM queue could be redirected -+ * again to dedicated queues at any time, the state needed to correctly -+ * handle merging with the OOM queue would be quite complex and expensive -+ * to maintain. Besides, in such a critical condition as an out of memory, -+ * the benefits of queue merging may be little relevant, or even negligible. -+ * -+ * Weight-raised queues can be merged only if their weight-raising -+ * period has just started. In fact cooperating processes are usually -+ * started together. Thus, with this filter we avoid false positives -+ * that would jeopardize low-latency guarantees. -+ * -+ * WARNING: queue merging may impair fairness among non-weight raised -+ * queues, for at least two reasons: 1) the original weight of a -+ * merged queue may change during the merged state, 2) even being the -+ * weight the same, a merged queue may be bloated with many more -+ * requests than the ones produced by its originally-associated -+ * process. -+ */ -+static struct bfq_queue * -+bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ void *io_struct, bool request) -+{ -+ struct bfq_queue *in_service_bfqq, *new_bfqq; -+ -+ if (bfqq->new_bfqq) -+ return bfqq->new_bfqq; -+ -+ if (io_struct && wr_from_too_long(bfqq) && -+ likely(bfqq != &bfqd->oom_bfqq)) -+ bfq_log_bfqq(bfqd, bfqq, -+ "would have looked for coop, but bfq%d wr", -+ bfqq->pid); -+ -+ if (!io_struct || -+ wr_from_too_long(bfqq) || -+ unlikely(bfqq == &bfqd->oom_bfqq)) -+ return NULL; -+ -+ /* If there is only one backlogged queue, don't search. 
*/ -+ if (bfqd->busy_queues == 1) -+ return NULL; -+ -+ in_service_bfqq = bfqd->in_service_queue; -+ -+ if (in_service_bfqq && in_service_bfqq != bfqq && -+ bfqd->in_service_bic && wr_from_too_long(in_service_bfqq) -+ && likely(in_service_bfqq == &bfqd->oom_bfqq)) -+ bfq_log_bfqq(bfqd, bfqq, -+ "would have tried merge with in-service-queue, but wr"); -+ -+ if (!in_service_bfqq || in_service_bfqq == bfqq || -+ !bfqd->in_service_bic || wr_from_too_long(in_service_bfqq) || -+ unlikely(in_service_bfqq == &bfqd->oom_bfqq)) -+ goto check_scheduled; -+ -+ if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && -+ bfqq->entity.parent == in_service_bfqq->entity.parent && -+ bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) { -+ new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); -+ if (new_bfqq) -+ return new_bfqq; -+ } -+ /* -+ * Check whether there is a cooperator among currently scheduled -+ * queues. The only thing we need is that the bio/request is not -+ * NULL, as we need it to establish whether a cooperator exists. -+ */ -+check_scheduled: -+ new_bfqq = bfq_find_close_cooperator(bfqd, bfqq, -+ bfq_io_struct_pos(io_struct, request)); -+ -+ BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent); -+ -+ if (new_bfqq && wr_from_too_long(new_bfqq) && -+ likely(new_bfqq != &bfqd->oom_bfqq) && -+ bfq_may_be_close_cooperator(bfqq, new_bfqq)) -+ bfq_log_bfqq(bfqd, bfqq, -+ "would have merged with bfq%d, but wr", -+ new_bfqq->pid); -+ -+ if (new_bfqq && !wr_from_too_long(new_bfqq) && -+ likely(new_bfqq != &bfqd->oom_bfqq) && -+ bfq_may_be_close_cooperator(bfqq, new_bfqq)) -+ return bfq_setup_merge(bfqq, new_bfqq); -+ -+ return NULL; -+} -+ -+static void bfq_bfqq_save_state(struct bfq_queue *bfqq) -+{ -+ struct bfq_io_cq *bic = bfqq->bic; -+ -+ /* -+ * If !bfqq->bic, the queue is already shared or its requests -+ * have already been redirected to a shared queue; both idle window -+ * and weight raising state have already been saved. Do nothing. 
-+ */ -+ if (!bic) -+ return; -+ -+ bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq); -+ bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); -+ bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); -+ bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); -+ bic->saved_wr_coeff = bfqq->wr_coeff; -+ bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; -+ bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; -+ bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; -+ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); -+} -+ -+static void bfq_get_bic_reference(struct bfq_queue *bfqq) -+{ -+ /* -+ * If bfqq->bic has a non-NULL value, the bic to which it belongs -+ * is about to begin using a shared bfq_queue. -+ */ -+ if (bfqq->bic) -+ atomic_long_inc(&bfqq->bic->icq.ioc->refcount); -+} -+ -+static void -+bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, -+ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) -+{ -+ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", -+ (unsigned long) new_bfqq->pid); -+ /* Save weight raising and idle window of the merged queues */ -+ bfq_bfqq_save_state(bfqq); -+ bfq_bfqq_save_state(new_bfqq); -+ if (bfq_bfqq_IO_bound(bfqq)) -+ bfq_mark_bfqq_IO_bound(new_bfqq); -+ bfq_clear_bfqq_IO_bound(bfqq); -+ -+ /* -+ * If bfqq is weight-raised, then let new_bfqq inherit -+ * weight-raising. To reduce false positives, neglect the case -+ * where bfqq has just been created, but has not yet made it -+ * to be weight-raised (which may happen because EQM may merge -+ * bfqq even before bfq_add_request is executed for the first -+ * time for bfqq). Handling this case would however be very -+ * easy, thanks to the flag just_created. 
-+ */ -+ if (new_bfqq->wr_coeff == 1 && bfqq->wr_coeff > 1) { -+ new_bfqq->wr_coeff = bfqq->wr_coeff; -+ new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time; -+ new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish; -+ new_bfqq->wr_start_at_switch_to_srt = -+ bfqq->wr_start_at_switch_to_srt; -+ if (bfq_bfqq_busy(new_bfqq)) { -+ bfqd->wr_busy_queues++; -+ BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues); -+ } -+ -+ new_bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqd, new_bfqq, -+ "wr start after merge with %d, rais_max_time %u", -+ bfqq->pid, -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } -+ -+ if (bfqq->wr_coeff > 1) { /* bfqq has given its wr to new_bfqq */ -+ bfqq->wr_coeff = 1; -+ bfqq->entity.prio_changed = 1; -+ if (bfq_bfqq_busy(bfqq)) { -+ bfqd->wr_busy_queues--; -+ BUG_ON(bfqd->wr_busy_queues < 0); -+ } -+ -+ } -+ -+ bfq_log_bfqq(bfqd, new_bfqq, "merge_bfqqs: wr_busy %d", -+ bfqd->wr_busy_queues); -+ -+ /* -+ * Grab a reference to the bic, to prevent it from being destroyed -+ * before being possibly touched by a bfq_split_bfqq(). -+ */ -+ bfq_get_bic_reference(bfqq); -+ bfq_get_bic_reference(new_bfqq); -+ /* -+ * Merge queues (that is, let bic redirect its requests to new_bfqq) -+ */ -+ bic_set_bfqq(bic, new_bfqq, 1); -+ bfq_mark_bfqq_coop(new_bfqq); -+ /* -+ * new_bfqq now belongs to at least two bics (it is a shared queue): -+ * set new_bfqq->bic to NULL. bfqq either: -+ * - does not belong to any bic any more, and hence bfqq->bic must -+ * be set to NULL, or -+ * - is a queue whose owning bics have already been redirected to a -+ * different queue, hence the queue is destined to not belong to -+ * any bic soon and bfqq->bic is already NULL (therefore the next -+ * assignment causes no harm). 
-+ */ -+ new_bfqq->bic = NULL; -+ bfqq->bic = NULL; -+ /* release process reference to bfqq */ -+ bfq_put_queue(bfqq); -+} -+ -+static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq, -+ struct bio *bio) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ bool is_sync = op_is_sync(bio->bi_opf); -+ struct bfq_io_cq *bic; -+ struct bfq_queue *bfqq, *new_bfqq; -+ -+ /* -+ * Disallow merge of a sync bio into an async request. -+ */ -+ if (is_sync && !rq_is_sync(rq)) -+ return false; -+ -+ /* -+ * Lookup the bfqq that this bio will be queued with. Allow -+ * merge only if rq is queued there. -+ * Queue lock is held here. -+ */ -+ bic = bfq_bic_lookup(bfqd, current->io_context); -+ if (!bic) -+ return false; -+ -+ bfqq = bic_to_bfqq(bic, is_sync); -+ /* -+ * We take advantage of this function to perform an early merge -+ * of the queues of possible cooperating processes. -+ */ -+ if (bfqq) { -+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); -+ if (new_bfqq) { -+ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); -+ /* -+ * If we get here, the bio will be queued in the -+ * shared queue, i.e., new_bfqq, so use new_bfqq -+ * to decide whether bio and rq can be merged. -+ */ -+ bfqq = new_bfqq; -+ } -+ } -+ -+ return bfqq == RQ_BFQQ(rq); -+} -+ -+static int bfq_allow_rq_merge(struct request_queue *q, struct request *rq, -+ struct request *next) -+{ -+ return RQ_BFQQ(rq) == RQ_BFQQ(next); -+} -+ -+/* -+ * Set the maximum time for the in-service queue to consume its -+ * budget. This prevents seeky processes from lowering the throughput. -+ * In practice, a time-slice service scheme is used with seeky -+ * processes. 
-+ */ -+static void bfq_set_budget_timeout(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ unsigned int timeout_coeff; -+ -+ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time) -+ timeout_coeff = 1; -+ else -+ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; -+ -+ bfqd->last_budget_start = ktime_get(); -+ -+ bfqq->budget_timeout = jiffies + -+ bfqd->bfq_timeout * timeout_coeff; -+ -+ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", -+ jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff)); -+} -+ -+static void __bfq_set_in_service_queue(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ if (bfqq) { -+ bfqg_stats_update_avg_queue_size(bfqq_group(bfqq)); -+ bfq_mark_bfqq_must_alloc(bfqq); -+ bfq_clear_bfqq_fifo_expire(bfqq); -+ -+ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; -+ -+ BUG_ON(bfqq == bfqd->in_service_queue); -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ -+ if (time_is_before_jiffies(bfqq->last_wr_start_finish) && -+ bfqq->wr_coeff > 1 && -+ bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && -+ time_is_before_jiffies(bfqq->budget_timeout)) { -+ /* -+ * For soft real-time queues, move the start -+ * of the weight-raising period forward by the -+ * time the queue has not received any -+ * service. Otherwise, a relatively long -+ * service delay is likely to cause the -+ * weight-raising period of the queue to end, -+ * because of the short duration of the -+ * weight-raising period of a soft real-time -+ * queue. It is worth noting that this move -+ * is not so dangerous for the other queues, -+ * because soft real-time queues are not -+ * greedy. -+ * -+ * To not add a further variable, we use the -+ * overloaded field budget_timeout to -+ * determine for how long the queue has not -+ * received service, i.e., how much time has -+ * elapsed since the queue expired. 
However, -+ * this is a little imprecise, because -+ * budget_timeout is set to jiffies if bfqq -+ * not only expires, but also remains with no -+ * request. -+ */ -+ if (time_after(bfqq->budget_timeout, -+ bfqq->last_wr_start_finish)) -+ bfqq->last_wr_start_finish += -+ jiffies - bfqq->budget_timeout; -+ else -+ bfqq->last_wr_start_finish = jiffies; -+ -+ if (time_is_after_jiffies(bfqq->last_wr_start_finish)) { -+ pr_crit( -+ "BFQ WARNING:last %lu budget %lu jiffies %lu", -+ bfqq->last_wr_start_finish, -+ bfqq->budget_timeout, -+ jiffies); -+ pr_crit("diff %lu", jiffies - -+ max_t(unsigned long, -+ bfqq->last_wr_start_finish, -+ bfqq->budget_timeout)); -+ bfqq->last_wr_start_finish = jiffies; -+ } -+ } -+ -+ bfq_set_budget_timeout(bfqd, bfqq); -+ bfq_log_bfqq(bfqd, bfqq, -+ "set_in_service_queue, cur-budget = %d", -+ bfqq->entity.budget); -+ } else -+ bfq_log(bfqd, "set_in_service_queue: NULL"); -+ -+ bfqd->in_service_queue = bfqq; -+} -+ -+/* -+ * Get and set a new queue for service. -+ */ -+static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd); -+ -+ __bfq_set_in_service_queue(bfqd, bfqq); -+ return bfqq; -+} -+ -+static void bfq_arm_slice_timer(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq = bfqd->in_service_queue; -+ struct bfq_io_cq *bic; -+ u32 sl; -+ -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); -+ -+ /* Processes have exited, don't wait. */ -+ bic = bfqd->in_service_bic; -+ if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0) -+ return; -+ -+ bfq_mark_bfqq_wait_request(bfqq); -+ -+ /* -+ * We don't want to idle for seeks, but we do want to allow -+ * fair distribution of slice time for a process doing back-to-back -+ * seeks. So allow a little bit of time for him to submit a new rq. 
-+ * -+ * To prevent processes with (partly) seeky workloads from -+ * being too ill-treated, grant them a small fraction of the -+ * assigned budget before reducing the waiting time to -+ * BFQ_MIN_TT. This happened to help reduce latency. -+ */ -+ sl = bfqd->bfq_slice_idle; -+ /* -+ * Unless the queue is being weight-raised or the scenario is -+ * asymmetric, grant only minimum idle time if the queue -+ * is seeky. A long idling is preserved for a weight-raised -+ * queue, or, more in general, in an asymemtric scenario, -+ * because a long idling is needed for guaranteeing to a queue -+ * its reserved share of the throughput (in particular, it is -+ * needed if the queue has a higher weight than some other -+ * queue). -+ */ -+ if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 && -+ bfq_symmetric_scenario(bfqd)) -+ sl = min_t(u32, sl, BFQ_MIN_TT); -+ -+ bfqd->last_idling_start = ktime_get(); -+ hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl), -+ HRTIMER_MODE_REL); -+ bfqg_stats_set_start_idle_time(bfqq_group(bfqq)); -+ bfq_log(bfqd, "arm idle: %ld/%ld ms", -+ sl / NSEC_PER_MSEC, bfqd->bfq_slice_idle / NSEC_PER_MSEC); -+} -+ -+/* -+ * In autotuning mode, max_budget is dynamically recomputed as the -+ * amount of sectors transferred in timeout at the estimated peak -+ * rate. This enables BFQ to utilize a full timeslice with a full -+ * budget, even if the in-service queue is served at peak rate. And -+ * this maximises throughput with sequential workloads. -+ */ -+static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd) -+{ -+ return (u64)bfqd->peak_rate * USEC_PER_MSEC * -+ jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT; -+} -+ -+/* -+ * Update parameters related to throughput and responsiveness, as a -+ * function of the estimated peak rate. See comments on -+ * bfq_calc_max_budget(), and on T_slow and T_fast arrays. 
-+ */ -+static void update_thr_responsiveness_params(struct bfq_data *bfqd) -+{ -+ int dev_type = blk_queue_nonrot(bfqd->queue); -+ -+ if (bfqd->bfq_user_max_budget == 0) { -+ bfqd->bfq_max_budget = -+ bfq_calc_max_budget(bfqd); -+ BUG_ON(bfqd->bfq_max_budget < 0); -+ bfq_log(bfqd, "new max_budget = %d", -+ bfqd->bfq_max_budget); -+ } -+ -+ if (bfqd->device_speed == BFQ_BFQD_FAST && -+ bfqd->peak_rate < device_speed_thresh[dev_type]) { -+ bfqd->device_speed = BFQ_BFQD_SLOW; -+ bfqd->RT_prod = R_slow[dev_type] * -+ T_slow[dev_type]; -+ } else if (bfqd->device_speed == BFQ_BFQD_SLOW && -+ bfqd->peak_rate > device_speed_thresh[dev_type]) { -+ bfqd->device_speed = BFQ_BFQD_FAST; -+ bfqd->RT_prod = R_fast[dev_type] * -+ T_fast[dev_type]; -+ } -+ -+ bfq_log(bfqd, -+"dev_type %s dev_speed_class = %s (%llu sects/sec), thresh %llu setcs/sec", -+ dev_type == 0 ? "ROT" : "NONROT", -+ bfqd->device_speed == BFQ_BFQD_FAST ? "FAST" : "SLOW", -+ bfqd->device_speed == BFQ_BFQD_FAST ? -+ (USEC_PER_SEC*(u64)R_fast[dev_type])>>BFQ_RATE_SHIFT : -+ (USEC_PER_SEC*(u64)R_slow[dev_type])>>BFQ_RATE_SHIFT, -+ (USEC_PER_SEC*(u64)device_speed_thresh[dev_type])>> -+ BFQ_RATE_SHIFT); -+} -+ -+static void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq) -+{ -+ if (rq != NULL) { /* new rq dispatch now, reset accordingly */ -+ bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns() ; -+ bfqd->peak_rate_samples = 1; -+ bfqd->sequential_samples = 0; -+ bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size = -+ blk_rq_sectors(rq); -+ } else /* no new rq dispatched, just reset the number of samples */ -+ bfqd->peak_rate_samples = 0; /* full re-init on next disp. 
*/ -+ -+ bfq_log(bfqd, -+ "reset_rate_computation at end, sample %u/%u tot_sects %llu", -+ bfqd->peak_rate_samples, bfqd->sequential_samples, -+ bfqd->tot_sectors_dispatched); -+} -+ -+static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) -+{ -+ u32 rate, weight, divisor; -+ -+ /* -+ * For the convergence property to hold (see comments on -+ * bfq_update_peak_rate()) and for the assessment to be -+ * reliable, a minimum number of samples must be present, and -+ * a minimum amount of time must have elapsed. If not so, do -+ * not compute new rate. Just reset parameters, to get ready -+ * for a new evaluation attempt. -+ */ -+ if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES || -+ bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL) { -+ bfq_log(bfqd, -+ "update_rate_reset: only resetting, delta_first %lluus samples %d", -+ bfqd->delta_from_first>>10, bfqd->peak_rate_samples); -+ goto reset_computation; -+ } -+ -+ /* -+ * If a new request completion has occurred after last -+ * dispatch, then, to approximate the rate at which requests -+ * have been served by the device, it is more precise to -+ * extend the observation interval to the last completion. -+ */ -+ bfqd->delta_from_first = -+ max_t(u64, bfqd->delta_from_first, -+ bfqd->last_completion - bfqd->first_dispatch); -+ -+ BUG_ON(bfqd->delta_from_first == 0); -+ /* -+ * Rate computed in sects/usec, and not sects/nsec, for -+ * precision issues. 
-+ */ -+ rate = div64_ul(bfqd->tot_sectors_dispatched<<BFQ_RATE_SHIFT, -+ div_u64(bfqd->delta_from_first, NSEC_PER_USEC)); -+ -+ bfq_log(bfqd, -+"update_rate_reset: tot_sects %llu delta_first %lluus rate %llu sects/s (%d)", -+ bfqd->tot_sectors_dispatched, bfqd->delta_from_first>>10, -+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), -+ rate > 20<<BFQ_RATE_SHIFT); -+ -+ /* -+ * Peak rate not updated if: -+ * - the percentage of sequential dispatches is below 3/4 of the -+ * total, and rate is below the current estimated peak rate -+ * - rate is unreasonably high (> 20M sectors/sec) -+ */ -+ if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 && -+ rate <= bfqd->peak_rate) || -+ rate > 20<<BFQ_RATE_SHIFT) { -+ bfq_log(bfqd, -+ "update_rate_reset: goto reset, samples %u/%u rate/peak %llu/%llu", -+ bfqd->peak_rate_samples, bfqd->sequential_samples, -+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), -+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); -+ goto reset_computation; -+ } else { -+ bfq_log(bfqd, -+ "update_rate_reset: do update, samples %u/%u rate/peak %llu/%llu", -+ bfqd->peak_rate_samples, bfqd->sequential_samples, -+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), -+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); -+ } -+ -+ /* -+ * We have to update the peak rate, at last! To this purpose, -+ * we use a low-pass filter. We compute the smoothing constant -+ * of the filter as a function of the 'weight' of the new -+ * measured rate. -+ * -+ * As can be seen in next formulas, we define this weight as a -+ * quantity proportional to how sequential the workload is, -+ * and to how long the observation time interval is. -+ * -+ * The weight runs from 0 to 8. The maximum value of the -+ * weight, 8, yields the minimum value for the smoothing -+ * constant. At this minimum value for the smoothing constant, -+ * the measured rate contributes for half of the next value of -+ * the estimated peak rate. 
-+ * -+ * So, the first step is to compute the weight as a function -+ * of how sequential the workload is. Note that the weight -+ * cannot reach 9, because bfqd->sequential_samples cannot -+ * become equal to bfqd->peak_rate_samples, which, in its -+ * turn, holds true because bfqd->sequential_samples is not -+ * incremented for the first sample. -+ */ -+ weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples; -+ -+ /* -+ * Second step: further refine the weight as a function of the -+ * duration of the observation interval. -+ */ -+ weight = min_t(u32, 8, -+ div_u64(weight * bfqd->delta_from_first, -+ BFQ_RATE_REF_INTERVAL)); -+ -+ /* -+ * Divisor ranging from 10, for minimum weight, to 2, for -+ * maximum weight. -+ */ -+ divisor = 10 - weight; -+ BUG_ON(divisor == 0); -+ -+ /* -+ * Finally, update peak rate: -+ * -+ * peak_rate = peak_rate * (divisor-1) / divisor + rate / divisor -+ */ -+ bfqd->peak_rate *= divisor-1; -+ bfqd->peak_rate /= divisor; -+ rate /= divisor; /* smoothing constant alpha = 1/divisor */ -+ -+ bfq_log(bfqd, -+ "update_rate_reset: divisor %d tmp_peak_rate %llu tmp_rate %u", -+ divisor, -+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT), -+ (u32)((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT)); -+ -+ BUG_ON(bfqd->peak_rate == 0); -+ BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT); -+ -+ bfqd->peak_rate += rate; -+ update_thr_responsiveness_params(bfqd); -+ BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT); -+ -+reset_computation: -+ bfq_reset_rate_computation(bfqd, rq); -+} -+ -+/* -+ * Update the read/write peak rate (the main quantity used for -+ * auto-tuning, see update_thr_responsiveness_params()). -+ * -+ * It is not trivial to estimate the peak rate (correctly): because of -+ * the presence of sw and hw queues between the scheduler and the -+ * device components that finally serve I/O requests, it is hard to -+ * say exactly when a given dispatched request is served inside the -+ * device, and for how long. 
As a consequence, it is hard to know -+ * precisely at what rate a given set of requests is actually served -+ * by the device. -+ * -+ * On the opposite end, the dispatch time of any request is trivially -+ * available, and, from this piece of information, the "dispatch rate" -+ * of requests can be immediately computed. So, the idea in the next -+ * function is to use what is known, namely request dispatch times -+ * (plus, when useful, request completion times), to estimate what is -+ * unknown, namely in-device request service rate. -+ * -+ * The main issue is that, because of the above facts, the rate at -+ * which a certain set of requests is dispatched over a certain time -+ * interval can vary greatly with respect to the rate at which the -+ * same requests are then served. But, since the size of any -+ * intermediate queue is limited, and the service scheme is lossless -+ * (no request is silently dropped), the following obvious convergence -+ * property holds: the number of requests dispatched MUST become -+ * closer and closer to the number of requests completed as the -+ * observation interval grows. This is the key property used in -+ * the next function to estimate the peak service rate as a function -+ * of the observed dispatch rate. The function assumes to be invoked -+ * on every request dispatch. -+ */ -+static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) -+{ -+ u64 now_ns = ktime_get_ns(); -+ -+ if (bfqd->peak_rate_samples == 0) { /* first dispatch */ -+ bfq_log(bfqd, -+ "update_peak_rate: goto reset, samples %d", -+ bfqd->peak_rate_samples) ; -+ bfq_reset_rate_computation(bfqd, rq); -+ goto update_last_values; /* will add one sample */ -+ } -+ -+ /* -+ * Device idle for very long: the observation interval lasting -+ * up to this dispatch cannot be a valid observation interval -+ * for computing a new peak rate (similarly to the late- -+ * completion event in bfq_completed_request()). 
Go to -+ * update_rate_and_reset to have the following three steps -+ * taken: -+ * - close the observation interval at the last (previous) -+ * request dispatch or completion -+ * - compute rate, if possible, for that observation interval -+ * - start a new observation interval with this dispatch -+ */ -+ if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC && -+ bfqd->rq_in_driver == 0) { -+ bfq_log(bfqd, -+"update_peak_rate: jumping to updating&resetting delta_last %lluus samples %d", -+ (now_ns - bfqd->last_dispatch)>>10, -+ bfqd->peak_rate_samples) ; -+ goto update_rate_and_reset; -+ } -+ -+ /* Update sampling information */ -+ bfqd->peak_rate_samples++; -+ -+ if ((bfqd->rq_in_driver > 0 || -+ now_ns - bfqd->last_completion < BFQ_MIN_TT) -+ && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR) -+ bfqd->sequential_samples++; -+ -+ bfqd->tot_sectors_dispatched += blk_rq_sectors(rq); -+ -+ /* Reset max observed rq size every 32 dispatches */ -+ if (likely(bfqd->peak_rate_samples % 32)) -+ bfqd->last_rq_max_size = -+ max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size); -+ else -+ bfqd->last_rq_max_size = blk_rq_sectors(rq); -+ -+ bfqd->delta_from_first = now_ns - bfqd->first_dispatch; -+ -+ bfq_log(bfqd, -+ "update_peak_rate: added samples %u/%u tot_sects %llu delta_first %lluus", -+ bfqd->peak_rate_samples, bfqd->sequential_samples, -+ bfqd->tot_sectors_dispatched, -+ bfqd->delta_from_first>>10); -+ -+ /* Target observation interval not yet reached, go on sampling */ -+ if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL) -+ goto update_last_values; -+ -+update_rate_and_reset: -+ bfq_update_rate_reset(bfqd, rq); -+update_last_values: -+ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); -+ bfqd->last_dispatch = now_ns; -+ -+ bfq_log(bfqd, -+ "update_peak_rate: delta_first %lluus last_pos %llu peak_rate %llu", -+ (now_ns - bfqd->first_dispatch)>>10, -+ (unsigned long long) bfqd->last_position, -+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); 
-+ bfq_log(bfqd, -+ "update_peak_rate: samples at end %d", bfqd->peak_rate_samples); -+} -+ -+/* -+ * Move request from internal lists to the dispatch list of the request queue -+ */ -+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ -+ /* -+ * For consistency, the next instruction should have been executed -+ * after removing the request from the queue and dispatching it. -+ * We instead execute this instruction before bfq_remove_request() -+ * (and hence introduce a temporary inconsistency), for efficiency. -+ * In fact, in a forced_dispatch, this prevents two counters related -+ * to bfqq->dispatched from being uselessly decremented if bfqq -+ * is not in service, and then incremented again after -+ * incrementing bfqq->dispatched. -+ */ -+ bfqq->dispatched++; -+ bfq_update_peak_rate(q->elevator->elevator_data, rq); -+ -+ bfq_remove_request(rq); -+ elv_dispatch_sort(q, rq); -+} -+ -+static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ BUG_ON(bfqq != bfqd->in_service_queue); -+ -+ /* -+ * If this bfqq is shared between multiple processes, check -+ * to make sure that those processes are still issuing I/Os -+ * within the mean seek distance. If not, it may be time to -+ * break the queues apart again. -+ */ -+ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq)) -+ bfq_mark_bfqq_split_coop(bfqq); -+ -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { -+ if (bfqq->dispatched == 0) -+ /* -+ * Overloading budget_timeout field to store -+ * the time at which the queue remains with no -+ * backlog and no outstanding request; used by -+ * the weight-raising mechanism. -+ */ -+ bfqq->budget_timeout = jiffies; -+ -+ bfq_del_bfqq_busy(bfqd, bfqq, true); -+ } else { -+ bfq_requeue_bfqq(bfqd, bfqq); -+ /* -+ * Resort priority tree of potential close cooperators.
-+ */ -+ bfq_pos_tree_add_move(bfqd, bfqq); -+ } -+ -+ /* -+ * All in-service entities must have been properly deactivated -+ * or requeued before executing the next function, which -+ * marks all in-service entities as no longer in service. -+ */ -+ __bfq_bfqd_reset_in_service(bfqd); -+} -+ -+/** -+ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior. -+ * @bfqd: device data. -+ * @bfqq: queue to update. -+ * @reason: reason for expiration. -+ * -+ * Handle the feedback on @bfqq budget at queue expiration. -+ * See the body for detailed comments. -+ */ -+static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ enum bfqq_expiration reason) -+{ -+ struct request *next_rq; -+ int budget, min_budget; -+ -+ BUG_ON(bfqq != bfqd->in_service_queue); -+ -+ min_budget = bfq_min_budget(bfqd); -+ -+ if (bfqq->wr_coeff == 1) -+ budget = bfqq->max_budget; -+ else /* -+ * Use a constant, low budget for weight-raised queues, -+ * to help achieve a low latency. Keep it slightly higher -+ * than the minimum possible budget, to cause slightly -+ * fewer expirations. -+ */ -+ budget = 2 * min_budget; -+ -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d", -+ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d", -+ budget, bfq_min_budget(bfqd)); -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", -+ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); -+ -+ if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) { -+ switch (reason) { -+ /* -+ * Caveat: in all the following cases we trade latency -+ * for throughput.
-+ */ -+ case BFQ_BFQQ_TOO_IDLE: -+ /* -+ * This is the only case where we may reduce -+ * the budget: if there is no request of the -+ * process still waiting for completion, then -+ * we assume (tentatively) that the timer has -+ * expired because the batch of requests of -+ * the process could have been served with a -+ * smaller budget. Hence, betting that -+ * process will behave in the same way when it -+ * becomes backlogged again, we reduce its -+ * next budget. As long as we guess right, -+ * this budget cut reduces the latency -+ * experienced by the process. -+ * -+ * However, if there are still outstanding -+ * requests, then the process may have not yet -+ * issued its next request just because it is -+ * still waiting for the completion of some of -+ * the still outstanding ones. So in this -+ * subcase we do not reduce its budget, on the -+ * contrary we increase it to possibly boost -+ * the throughput, as discussed in the -+ * comments to the BUDGET_TIMEOUT case. -+ */ -+ if (bfqq->dispatched > 0) /* still outstanding reqs */ -+ budget = min(budget * 2, bfqd->bfq_max_budget); -+ else { -+ if (budget > 5 * min_budget) -+ budget -= 4 * min_budget; -+ else -+ budget = min_budget; -+ } -+ break; -+ case BFQ_BFQQ_BUDGET_TIMEOUT: -+ /* -+ * We double the budget here because it gives -+ * the chance to boost the throughput if this -+ * is not a seeky process (and has bumped into -+ * this timeout because of, e.g., ZBR). -+ */ -+ budget = min(budget * 2, bfqd->bfq_max_budget); -+ break; -+ case BFQ_BFQQ_BUDGET_EXHAUSTED: -+ /* -+ * The process still has backlog, and did not -+ * let either the budget timeout or the disk -+ * idling timeout expire. Hence it is not -+ * seeky, has a short thinktime and may be -+ * happy with a higher budget too. So -+ * definitely increase the budget of this good -+ * candidate to boost the disk throughput. 
-+ */ -+ budget = min(budget * 4, bfqd->bfq_max_budget); -+ break; -+ case BFQ_BFQQ_NO_MORE_REQUESTS: -+ /* -+ * For queues that expire for this reason, it -+ * is particularly important to keep the -+ * budget close to the actual service they -+ * need. Doing so reduces the timestamp -+ * misalignment problem described in the -+ * comments in the body of -+ * __bfq_activate_entity. In fact, suppose -+ * that a queue systematically expires for -+ * BFQ_BFQQ_NO_MORE_REQUESTS and presents a -+ * new request in time to enjoy timestamp -+ * back-shifting. The larger the budget of the -+ * queue is with respect to the service the -+ * queue actually requests in each service -+ * slot, the more times the queue can be -+ * reactivated with the same virtual finish -+ * time. It follows that, even if this finish -+ * time is pushed to the system virtual time -+ * to reduce the consequent timestamp -+ * misalignment, the queue unjustly enjoys for -+ * many re-activations a lower finish time -+ * than all newly activated queues. -+ * -+ * The service needed by bfqq is measured -+ * quite precisely by bfqq->entity.service. -+ * Since bfqq does not enjoy device idling, -+ * bfqq->entity.service is equal to the number -+ * of sectors that the process associated with -+ * bfqq requested to read/write before waiting -+ * for request completions, or blocking for -+ * other reasons. -+ */ -+ budget = max_t(int, bfqq->entity.service, min_budget); -+ break; -+ default: -+ return; -+ } -+ } else if (!bfq_bfqq_sync(bfqq)) -+ /* -+ * Async queues get always the maximum possible -+ * budget, as for them we do not care about latency -+ * (in addition, their ability to dispatch is limited -+ * by the charging factor). 
-+ */ -+ budget = bfqd->bfq_max_budget; -+ -+ bfqq->max_budget = budget; -+ -+ if (bfqd->budgets_assigned >= bfq_stats_min_budgets && -+ !bfqd->bfq_user_max_budget) -+ bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget); -+ -+ /* -+ * If there is still backlog, then assign a new budget, making -+ * sure that it is large enough for the next request. Since -+ * the finish time of bfqq must be kept in sync with the -+ * budget, be sure to call __bfq_bfqq_expire() *after* this -+ * update. -+ * -+ * If there is no backlog, then no need to update the budget; -+ * it will be updated on the arrival of a new request. -+ */ -+ next_rq = bfqq->next_rq; -+ if (next_rq) { -+ BUG_ON(reason == BFQ_BFQQ_TOO_IDLE || -+ reason == BFQ_BFQQ_NO_MORE_REQUESTS); -+ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, -+ bfq_serv_to_charge(next_rq, bfqq)); -+ BUG_ON(!bfq_bfqq_busy(bfqq)); -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ } -+ -+ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d", -+ next_rq ? blk_rq_sectors(next_rq) : 0, -+ bfqq->entity.budget); -+} -+ -+/* -+ * Return true if the process associated with bfqq is "slow". The slow -+ * flag is used, in addition to the budget timeout, to reduce the -+ * amount of service provided to seeky processes, and thus reduce -+ * their chances to lower the throughput. More details in the comments -+ * on the function bfq_bfqq_expire(). -+ * -+ * An important observation is in order: as discussed in the comments -+ * on the function bfq_update_peak_rate(), with devices with internal -+ * queues, it is hard if ever possible to know when and for how long -+ * an I/O request is processed by the device (apart from the trivial -+ * I/O pattern where a new request is dispatched only after the -+ * previous one has been completed). This makes it hard to evaluate -+ * the real rate at which the I/O requests of each bfq_queue are -+ * served. 
In fact, for an I/O scheduler like BFQ, serving a -+ * bfq_queue means just dispatching its requests during its service -+ * slot (i.e., until the budget of the queue is exhausted, or the -+ * queue remains idle, or, finally, a timeout fires). But, during the -+ * service slot of a bfq_queue, around 100 ms at most, the device may -+ * be even still processing requests of bfq_queues served in previous -+ * service slots. On the opposite end, the requests of the in-service -+ * bfq_queue may be completed after the service slot of the queue -+ * finishes. -+ * -+ * Anyway, unless more sophisticated solutions are used -+ * (where possible), the sum of the sizes of the requests dispatched -+ * during the service slot of a bfq_queue is probably the only -+ * approximation available for the service received by the bfq_queue -+ * during its service slot. And this sum is the quantity used in this -+ * function to evaluate the I/O speed of a process. -+ */ -+static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ bool compensate, enum bfqq_expiration reason, -+ unsigned long *delta_ms) -+{ -+ ktime_t delta_ktime; -+ u32 delta_usecs; -+ bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */ -+ -+ if (!bfq_bfqq_sync(bfqq)) -+ return false; -+ -+ if (compensate) -+ delta_ktime = bfqd->last_idling_start; -+ else -+ delta_ktime = ktime_get(); -+ delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start); -+ delta_usecs = ktime_to_us(delta_ktime); -+ -+ /* don't use too short time intervals */ -+ if (delta_usecs < 1000) { -+ if (blk_queue_nonrot(bfqd->queue)) -+ /* -+ * give same worst-case guarantees as idling -+ * for seeky -+ */ -+ *delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC; -+ else /* charge at least one seek */ -+ *delta_ms = bfq_slice_idle / NSEC_PER_MSEC; -+ -+ bfq_log(bfqd, "bfq_bfqq_is_slow: too short %u", delta_usecs); -+ -+ return slow; -+ } -+ -+ *delta_ms = delta_usecs / USEC_PER_MSEC; -+ -+ /* -+ * Use only long (> 20ms) intervals 
to filter out excessive -+ * spikes in service rate estimation. -+ */ -+ if (delta_usecs > 20000) { -+ /* -+ * Caveat for rotational devices: processes doing I/O -+ * in the slower disk zones tend to be slow(er) even -+ * if not seeky. In this respect, the estimated peak -+ * rate is likely to be an average over the disk -+ * surface. Accordingly, to not be too harsh with -+ * unlucky processes, a process is deemed slow only if -+ * its rate has been lower than half of the estimated -+ * peak rate. -+ */ -+ slow = bfqq->entity.service < bfqd->bfq_max_budget / 2; -+ bfq_log(bfqd, "bfq_bfqq_is_slow: relative rate %d/%d", -+ bfqq->entity.service, bfqd->bfq_max_budget); -+ } -+ -+ bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow); -+ -+ return slow; -+} -+ -+/* -+ * To be deemed as soft real-time, an application must meet two -+ * requirements. First, the application must not require an average -+ * bandwidth higher than the approximate bandwidth required to playback or -+ * record a compressed high-definition video. -+ * The next function is invoked on the completion of the last request of a -+ * batch, to compute the next-start time instant, soft_rt_next_start, such -+ * that, if the next request of the application does not arrive before -+ * soft_rt_next_start, then the above requirement on the bandwidth is met. -+ * -+ * The second requirement is that the request pattern of the application is -+ * isochronous, i.e., that, after issuing a request or a batch of requests, -+ * the application stops issuing new requests until all its pending requests -+ * have been completed. After that, the application may issue a new batch, -+ * and so on. -+ * For this reason the next function is invoked to compute -+ * soft_rt_next_start only for applications that meet this requirement, -+ * whereas soft_rt_next_start is set to infinity for applications that do -+ * not. 
-+ * -+ * Unfortunately, even a greedy application may happen to behave in an -+ * isochronous way if the CPU load is high. In fact, the application may -+ * stop issuing requests while the CPUs are busy serving other processes, -+ * then restart, then stop again for a while, and so on. In addition, if -+ * the disk achieves a low enough throughput with the request pattern -+ * issued by the application (e.g., because the request pattern is random -+ * and/or the device is slow), then the application may meet the above -+ * bandwidth requirement too. To prevent such a greedy application from -+ * being deemed soft real-time, a further rule is used in the computation of -+ * soft_rt_next_start: soft_rt_next_start must be higher than the current -+ * time plus the maximum time for which the arrival of a request is waited -+ * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. -+ * This filters out greedy applications, as the latter instead issue their -+ * next request as soon as possible after the last one has been completed -+ * (in contrast, when a batch of requests is completed, a soft real-time -+ * application spends some time processing data). -+ * -+ * Unfortunately, the last filter may easily generate false positives if -+ * only bfqd->bfq_slice_idle is used as a reference time interval and one -+ * or both of the following cases occur: -+ * 1) HZ is so low that the duration of a jiffy is comparable to or higher -+ * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with -+ * HZ=100. -+ * 2) jiffies, instead of increasing at a constant rate, may stop increasing -+ * for a while, then suddenly 'jump' by several units to recover the lost -+ * increments. This seems to happen, e.g., inside virtual machines. -+ * To address this issue, we use as a reference time interval not just -+ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies.
In -+ * particular we add the minimum number of jiffies for which the filter -+ * seems to be quite precise also in embedded systems and KVM/QEMU virtual -+ * machines. -+ */ -+static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ bfq_log_bfqq(bfqd, bfqq, -+"softrt_next_start: service_blkg %lu soft_rate %u sects/sec interval %u", -+ bfqq->service_from_backlogged, -+ bfqd->bfq_wr_max_softrt_rate, -+ jiffies_to_msecs(HZ * bfqq->service_from_backlogged / -+ bfqd->bfq_wr_max_softrt_rate)); -+ -+ return max(bfqq->last_idle_bklogged + -+ HZ * bfqq->service_from_backlogged / -+ bfqd->bfq_wr_max_softrt_rate, -+ jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); -+} -+ -+/* -+ * Return the farthest future time instant according to jiffies -+ * macros. -+ */ -+static unsigned long bfq_greatest_from_now(void) -+{ -+ return jiffies + MAX_JIFFY_OFFSET; -+} -+ -+/* -+ * Return the farthest past time instant according to jiffies -+ * macros. -+ */ -+static unsigned long bfq_smallest_from_now(void) -+{ -+ return jiffies - MAX_JIFFY_OFFSET; -+} -+ -+/** -+ * bfq_bfqq_expire - expire a queue. -+ * @bfqd: device owning the queue. -+ * @bfqq: the queue to expire. -+ * @compensate: if true, compensate for the time spent idling. -+ * @reason: the reason causing the expiration. -+ * -+ * If the process associated with bfqq does slow I/O (e.g., because it -+ * issues random requests), we charge bfqq with the time it has been -+ * in service instead of the service it has received (see -+ * bfq_bfqq_charge_time for details on how this goal is achieved). As -+ * a consequence, bfqq will typically get higher timestamps upon -+ * reactivation, and hence it will be rescheduled as if it had -+ * received more service than what it has actually received. In the -+ * end, bfqq receives less service in proportion to how slowly its -+ * associated process consumes its budgets (and hence how seriously it -+ * tends to lower the throughput). 
In addition, this time-charging -+ * strategy guarantees time fairness among slow processes. In -+ * contrast, if the process associated with bfqq is not slow, we -+ * charge bfqq exactly with the service it has received. -+ * -+ * Charging time to the first type of queues and the exact service to -+ * the other has the effect of using the WF2Q+ policy to schedule the -+ * former on a timeslice basis, without violating service domain -+ * guarantees among the latter. -+ */ -+static void bfq_bfqq_expire(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ bool compensate, -+ enum bfqq_expiration reason) -+{ -+ bool slow; -+ unsigned long delta = 0; -+ struct bfq_entity *entity = &bfqq->entity; -+ int ref; -+ -+ BUG_ON(bfqq != bfqd->in_service_queue); -+ -+ /* -+ * Check whether the process is slow (see bfq_bfqq_is_slow). -+ */ -+ slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); -+ -+ /* -+ * Increase service_from_backlogged before next statement, -+ * because the possible next invocation of -+ * bfq_bfqq_charge_time would likely inflate -+ * entity->service. In contrast, service_from_backlogged must -+ * contain real service, to enable the soft real-time -+ * heuristic to correctly compute the bandwidth consumed by -+ * bfqq. -+ */ -+ bfqq->service_from_backlogged += entity->service; -+ -+ /* -+ * As above explained, charge slow (typically seeky) and -+ * timed-out queues with the time and not the service -+ * received, to favor sequential workloads. -+ * -+ * Processes doing I/O in the slower disk zones will tend to -+ * be slow(er) even if not seeky. Therefore, since the -+ * estimated peak rate is actually an average over the disk -+ * surface, these processes may timeout just for bad luck. To -+ * avoid punishing them, do not charge time to processes that -+ * succeeded in consuming at least 2/3 of their budget. 
This -+ * allows BFQ to preserve enough elasticity to still perform -+ * bandwidth, and not time, distribution with little unlucky -+ * or quasi-sequential processes. -+ */ -+ if (bfqq->wr_coeff == 1 && -+ (slow || -+ (reason == BFQ_BFQQ_BUDGET_TIMEOUT && -+ bfq_bfqq_budget_left(bfqq) >= entity->budget / 3))) -+ bfq_bfqq_charge_time(bfqd, bfqq, delta); -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ -+ if (reason == BFQ_BFQQ_TOO_IDLE && -+ entity->service <= 2 * entity->budget / 10) -+ bfq_clear_bfqq_IO_bound(bfqq); -+ -+ if (bfqd->low_latency && bfqq->wr_coeff == 1) -+ bfqq->last_wr_start_finish = jiffies; -+ -+ if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 && -+ RB_EMPTY_ROOT(&bfqq->sort_list)) { -+ /* -+ * If we get here, and there are no outstanding -+ * requests, then the request pattern is isochronous -+ * (see the comments on the function -+ * bfq_bfqq_softrt_next_start()). Thus we can compute -+ * soft_rt_next_start. If, instead, the queue still -+ * has outstanding requests, then we have to wait for -+ * the completion of all the outstanding requests to -+ * discover whether the request pattern is actually -+ * isochronous. -+ */ -+ BUG_ON(bfqd->busy_queues < 1); -+ if (bfqq->dispatched == 0) { -+ bfqq->soft_rt_next_start = -+ bfq_bfqq_softrt_next_start(bfqd, bfqq); -+ bfq_log_bfqq(bfqd, bfqq, "new soft_rt_next %lu", -+ bfqq->soft_rt_next_start); -+ } else { -+ /* -+ * The application is still waiting for the -+ * completion of one or more requests: -+ * prevent it from possibly being incorrectly -+ * deemed as soft real-time by setting its -+ * soft_rt_next_start to infinity. In fact, -+ * without this assignment, the application -+ * would be incorrectly deemed as soft -+ * real-time if: -+ * 1) it issued a new request before the -+ * completion of all its in-flight -+ * requests, and -+ * 2) at that time, its soft_rt_next_start -+ * happened to be in the past. 
-+ */ -+ bfqq->soft_rt_next_start = -+ bfq_greatest_from_now(); -+ /* -+ * Schedule an update of soft_rt_next_start to when -+ * the task may be discovered to be isochronous. -+ */ -+ bfq_mark_bfqq_softrt_update(bfqq); -+ } -+ } -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "expire (%d, slow %d, num_disp %d, short_ttime %d, weight %d)", -+ reason, slow, bfqq->dispatched, -+ bfq_bfqq_has_short_ttime(bfqq), entity->weight); -+ -+ /* -+ * Increase, decrease or leave budget unchanged according to -+ * reason. -+ */ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); -+ BUG_ON(bfqq->next_rq == NULL && -+ bfqq->entity.budget < bfqq->entity.service); -+ ref = bfqq->ref; -+ __bfq_bfqq_expire(bfqd, bfqq); -+ -+ BUG_ON(ref > 1 && -+ !bfq_bfqq_busy(bfqq) && reason == BFQ_BFQQ_BUDGET_EXHAUSTED && -+ !bfq_class_idle(bfqq)); -+ -+ /* mark bfqq as waiting a request only if a bic still points to it */ -+ if (ref > 1 && !bfq_bfqq_busy(bfqq) && -+ reason != BFQ_BFQQ_BUDGET_TIMEOUT && -+ reason != BFQ_BFQQ_BUDGET_EXHAUSTED) -+ bfq_mark_bfqq_non_blocking_wait_rq(bfqq); -+} -+ -+/* -+ * Budget timeout is not implemented through a dedicated timer, but -+ * just checked on request arrivals and completions, as well as on -+ * idle timer expirations. -+ */ -+static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) -+{ -+ return time_is_before_eq_jiffies(bfqq->budget_timeout); -+} -+ -+/* -+ * If we expire a queue that is actively waiting (i.e., with the -+ * device idled) for the arrival of a new request, then we may incur -+ * the timestamp misalignment problem described in the body of the -+ * function __bfq_activate_entity. Hence we return true only if this -+ * condition does not hold, or if the queue is slow enough to deserve -+ * only to be kicked off for preserving a high throughput. 
-+ */ -+static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) -+{ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "may_budget_timeout: wait_request %d left %d timeout %d", -+ bfq_bfqq_wait_request(bfqq), -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, -+ bfq_bfqq_budget_timeout(bfqq)); -+ -+ return (!bfq_bfqq_wait_request(bfqq) || -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3) -+ && -+ bfq_bfqq_budget_timeout(bfqq); -+} -+ -+/* -+ * For a queue that becomes empty, device idling is allowed only if -+ * this function returns true for that queue. As a consequence, since -+ * device idling plays a critical role for both throughput boosting -+ * and service guarantees, the return value of this function plays a -+ * critical role as well. -+ * -+ * In a nutshell, this function returns true only if idling is -+ * beneficial for throughput or, even if detrimental for throughput, -+ * idling is however necessary to preserve service guarantees (low -+ * latency, desired throughput distribution, ...). In particular, on -+ * NCQ-capable devices, this function tries to return false, so as to -+ * help keep the drives' internal queues full, whenever this helps the -+ * device boost the throughput without causing any service-guarantee -+ * issue. -+ * -+ * In more detail, the return value of this function is obtained by, -+ * first, computing a number of boolean variables that take into -+ * account throughput and service-guarantee issues, and, then, -+ * combining these variables in a logical expression. Most of the -+ * issues taken into account are not trivial. We discuss these issues -+ * while introducing the variables. 
-+ */ -+static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) -+{ -+ struct bfq_data *bfqd = bfqq->bfqd; -+ bool rot_without_queueing = -+ !blk_queue_nonrot(bfqd->queue) && !bfqd->hw_tag, -+ bfqq_sequential_and_IO_bound, -+ idling_boosts_thr, idling_boosts_thr_without_issues, -+ idling_needed_for_service_guarantees, -+ asymmetric_scenario; -+ -+ if (bfqd->strict_guarantees) -+ return true; -+ -+ /* -+ * Idling is performed only if slice_idle > 0. In addition, we -+ * do not idle if -+ * (a) bfqq is async -+ * (b) bfqq is in the idle io prio class: in this case we do -+ * not idle because we want to minimize the bandwidth that -+ * queues in this class can steal to higher-priority queues -+ */ -+ if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) || -+ bfq_class_idle(bfqq)) -+ return false; -+ -+ bfqq_sequential_and_IO_bound = !BFQQ_SEEKY(bfqq) && -+ bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_has_short_ttime(bfqq); -+ /* -+ * The next variable takes into account the cases where idling -+ * boosts the throughput. -+ * -+ * The value of the variable is computed considering, first, that -+ * idling is virtually always beneficial for the throughput if: -+ * (a) the device is not NCQ-capable and rotational, or -+ * (b) regardless of the presence of NCQ, the device is rotational and -+ * the request pattern for bfqq is I/O-bound and sequential, or -+ * (c) regardless of whether it is rotational, the device is -+ * not NCQ-capable and the request pattern for bfqq is -+ * I/O-bound and sequential. -+ * -+ * Secondly, and in contrast to the above item (b), idling an -+ * NCQ-capable flash-based device would not boost the -+ * throughput even with sequential I/O; rather it would lower -+ * the throughput in proportion to how fast the device -+ * is. Accordingly, the next variable is true if any of the -+ * above conditions (a), (b) or (c) is true, and, in -+ * particular, happens to be false if bfqd is an NCQ-capable -+ * flash-based device. 
-+ */ -+ idling_boosts_thr = rot_without_queueing || -+ ((!blk_queue_nonrot(bfqd->queue) || !bfqd->hw_tag) && -+ bfqq_sequential_and_IO_bound); -+ -+ /* -+ * The value of the next variable, -+ * idling_boosts_thr_without_issues, is equal to that of -+ * idling_boosts_thr, unless a special case holds. In this -+ * special case, described below, idling may cause problems to -+ * weight-raised queues. -+ * -+ * When the request pool is saturated (e.g., in the presence -+ * of write hogs), if the processes associated with -+ * non-weight-raised queues ask for requests at a lower rate, -+ * then processes associated with weight-raised queues have a -+ * higher probability to get a request from the pool -+ * immediately (or at least soon) when they need one. Thus -+ * they have a higher probability to actually get a fraction -+ * of the device throughput proportional to their high -+ * weight. This is especially true with NCQ-capable drives, -+ * which enqueue several requests in advance, and further -+ * reorder internally-queued requests. -+ * -+ * For this reason, we force to false the value of -+ * idling_boosts_thr_without_issues if there are weight-raised -+ * busy queues. In this case, and if bfqq is not weight-raised, -+ * this guarantees that the device is not idled for bfqq (if, -+ * instead, bfqq is weight-raised, then idling will be -+ * guaranteed by another variable, see below). Combined with -+ * the timestamping rules of BFQ (see [1] for details), this -+ * behavior causes bfqq, and hence any sync non-weight-raised -+ * queue, to get a lower number of requests served, and thus -+ * to ask for a lower number of requests from the request -+ * pool, before the busy weight-raised queues get served -+ * again. This often mitigates starvation problems in the -+ * presence of heavy write workloads and NCQ, thereby -+ * guaranteeing a higher application and system responsiveness -+ * in these hostile scenarios. 
-+ */ -+ idling_boosts_thr_without_issues = idling_boosts_thr && -+ bfqd->wr_busy_queues == 0; -+ -+ /* -+ * There is then a case where idling must be performed not -+ * for throughput concerns, but to preserve service -+ * guarantees. -+ * -+ * To introduce this case, we can note that allowing the drive -+ * to enqueue more than one request at a time, and hence -+ * delegating de facto final scheduling decisions to the -+ * drive's internal scheduler, entails loss of control on the -+ * actual request service order. In particular, the critical -+ * situation is when requests from different processes happen -+ * to be present, at the same time, in the internal queue(s) -+ * of the drive. In such a situation, the drive, by deciding -+ * the service order of the internally-queued requests, does -+ * determine also the actual throughput distribution among -+ * these processes. But the drive typically has no notion or -+ * concern about per-process throughput distribution, and -+ * makes its decisions only on a per-request basis. Therefore, -+ * the service distribution enforced by the drive's internal -+ * scheduler is likely to coincide with the desired -+ * device-throughput distribution only in a completely -+ * symmetric scenario where: -+ * (i) each of these processes must get the same throughput as -+ * the others; -+ * (ii) all these processes have the same I/O pattern -+ * (either sequential or random). -+ * In fact, in such a scenario, the drive will tend to treat -+ * the requests of each of these processes in about the same -+ * way as the requests of the others, and thus to provide -+ * each of these processes with about the same throughput -+ * (which is exactly the desired throughput distribution). In -+ * contrast, in any asymmetric scenario, device idling is -+ * certainly needed to guarantee that bfqq receives its -+ * assigned fraction of the device throughput (see [1] for -+ * details). 
-+ * -+ * We address this issue by controlling, actually, only the -+ * symmetry sub-condition (i), i.e., provided that -+ * sub-condition (i) holds, idling is not performed, -+ * regardless of whether sub-condition (ii) holds. In other -+ * words, only if sub-condition (i) does not hold, then idling is -+ * allowed, and the device tends to be prevented from queueing -+ * many requests, possibly of several processes. The reason -+ * for not also controlling sub-condition (ii) is that we -+ * exploit preemption to preserve guarantees in case of -+ * symmetric scenarios, even if (ii) does not hold, as -+ * explained in the next two paragraphs. -+ * -+ * Even if a queue, say Q, is expired when it remains idle, Q -+ * can still preempt the new in-service queue if the next -+ * request of Q arrives soon (see the comments on -+ * bfq_bfqq_update_budg_for_activation). If all queues and -+ * groups have the same weight, this form of preemption, -+ * combined with the hole-recovery heuristic described in the -+ * comments on function bfq_bfqq_update_budg_for_activation, -+ * is enough to preserve a correct bandwidth distribution in -+ * the mid term, even without idling. In fact, even if not -+ * idling allows the internal queues of the device to contain -+ * many requests, and thus to reorder requests, we can rather -+ * safely assume that the internal scheduler still preserves a -+ * minimum of mid-term fairness. The motivation for using -+ * preemption instead of idling is that, by not idling, -+ * service guarantees are preserved without sacrificing -+ * throughput at all. In other words, both a high -+ * throughput and its desired distribution are obtained. -+ * -+ * More precisely, this preemption-based, idleless approach -+ * provides fairness in terms of IOPS, and not sectors per -+ * second. This can be seen with a simple example.
Suppose -+ * that there are two queues with the same weight, but that -+ * the first queue receives requests of 8 sectors, while the -+ * second queue receives requests of 1024 sectors. In -+ * addition, suppose that each of the two queues contains at -+ * most one request at a time, which implies that each queue -+ * always remains idle after it is served. Finally, after -+ * remaining idle, each queue very quickly receives a new -+ * request. It follows that the two queues are served -+ * alternately, preempting each other if needed. This -+ * implies that, although both queues have the same weight, -+ * the queue with large requests receives a service that is -+ * 1024/8 times as high as the service received by the other -+ * queue. -+ * -+ * On the other hand, device idling is performed, and thus -+ * pure sector-domain guarantees are provided, for the -+ * following queues, which are likely to need stronger -+ * throughput guarantees: weight-raised queues, and queues -+ * with a higher weight than other queues. When such queues -+ * are active, sub-condition (i) is false, which triggers -+ * device idling. -+ * -+ * According to the above considerations, the next variable is -+ * true (only) if sub-condition (i) does not hold. To compute the -+ * value of this variable, we not only use the return value of -+ * the function bfq_symmetric_scenario(), but also check -+ * whether bfqq is being weight-raised, because -+ * bfq_symmetric_scenario() does not take weight-raised -+ * queues into account (see comments on -+ * bfq_weights_tree_add()).
-+ * -+ * As a side note, it is worth considering that the above -+ * device-idling countermeasures may however fail in the -+ * following unlucky scenario: if idling is (correctly) -+ * disabled in a time period during which all symmetry -+ * sub-conditions hold, and hence the device is allowed to -+ * enqueue many requests, but at some later point in time some -+ * sub-condition stops to hold, then it may become impossible -+ * to let requests be served in the desired order until all -+ * the requests already queued in the device have been served. -+ */ -+ asymmetric_scenario = bfqq->wr_coeff > 1 || -+ !bfq_symmetric_scenario(bfqd); -+ -+ /* -+ * Finally, there is a case where maximizing throughput is the -+ * best choice even if it may cause unfairness toward -+ * bfqq. Such a case is when bfqq became active in a burst of -+ * queue activations. Queues that became active during a large -+ * burst benefit only from throughput, as discussed in the -+ * comments on bfq_handle_burst. Thus, if bfqq became active -+ * in a burst and not idling the device maximizes throughput, -+ * then the device must no be idled, because not idling the -+ * device provides bfqq and all other queues in the burst with -+ * maximum benefit. Combining this and the above case, we can -+ * now establish when idling is actually needed to preserve -+ * service guarantees. -+ */ -+ idling_needed_for_service_guarantees = -+ asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq); -+ -+ /* -+ * We have now all the components we need to compute the -+ * return value of the function, which is true only if idling -+ * either boosts the throughput (without issues), or is -+ * necessary to preserve service guarantees. 
-+ */ -+ bfq_log_bfqq(bfqd, bfqq, "may_idle: sync %d idling_boosts_thr %d", -+ bfq_bfqq_sync(bfqq), idling_boosts_thr); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "may_idle: wr_busy %d boosts %d IO-bound %d guar %d", -+ bfqd->wr_busy_queues, -+ idling_boosts_thr_without_issues, -+ bfq_bfqq_IO_bound(bfqq), -+ idling_needed_for_service_guarantees); -+ -+ return idling_boosts_thr_without_issues || -+ idling_needed_for_service_guarantees; -+} -+ -+/* -+ * If the in-service queue is empty but the function bfq_bfqq_may_idle -+ * returns true, then: -+ * 1) the queue must remain in service and cannot be expired, and -+ * 2) the device must be idled to wait for the possible arrival of a new -+ * request for the queue. -+ * See the comments on the function bfq_bfqq_may_idle for the reasons -+ * why performing device idling is the best choice to boost the throughput -+ * and preserve service guarantees when bfq_bfqq_may_idle itself -+ * returns true. -+ */ -+static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) -+{ -+ return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_may_idle(bfqq); -+} -+ -+/* -+ * Select a queue for service. If we have a current queue in service, -+ * check whether to continue servicing it, or retrieve and set a new one. -+ */ -+static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq; -+ struct request *next_rq; -+ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT; -+ -+ bfqq = bfqd->in_service_queue; -+ if (!bfqq) -+ goto new_queue; -+ -+ bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); -+ -+ if (bfq_may_expire_for_budg_timeout(bfqq) && -+ !hrtimer_active(&bfqd->idle_slice_timer) && -+ !bfq_bfqq_must_idle(bfqq)) -+ goto expire; -+ -+check_queue: -+ /* -+ * This loop is rarely executed more than once. Even when it -+ * happens, it is much more convenient to re-execute this loop -+ * than to return NULL and trigger a new dispatch to get a -+ * request served. 
-+ */ -+ next_rq = bfqq->next_rq; -+ /* -+ * If bfqq has requests queued and it has enough budget left to -+ * serve them, keep the queue, otherwise expire it. -+ */ -+ if (next_rq) { -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ -+ if (bfq_serv_to_charge(next_rq, bfqq) > -+ bfq_bfqq_budget_left(bfqq)) { -+ /* -+ * Expire the queue for budget exhaustion, -+ * which makes sure that the next budget is -+ * enough to serve the next request, even if -+ * it comes from the fifo expired path. -+ */ -+ reason = BFQ_BFQQ_BUDGET_EXHAUSTED; -+ goto expire; -+ } else { -+ /* -+ * The idle timer may be pending because we may -+ * not disable disk idling even when a new request -+ * arrives. -+ */ -+ if (bfq_bfqq_wait_request(bfqq)) { -+ BUG_ON(!hrtimer_active(&bfqd->idle_slice_timer)); -+ /* -+ * If we get here: 1) at least a new request -+ * has arrived but we have not disabled the -+ * timer because the request was too small, -+ * 2) then the block layer has unplugged -+ * the device, causing the dispatch to be -+ * invoked. -+ * -+ * Since the device is unplugged, now the -+ * requests are probably large enough to -+ * provide a reasonable throughput. -+ * So we disable idling. -+ */ -+ bfq_clear_bfqq_wait_request(bfqq); -+ hrtimer_try_to_cancel(&bfqd->idle_slice_timer); -+ bfqg_stats_update_idle_time(bfqq_group(bfqq)); -+ } -+ goto keep_queue; -+ } -+ } -+ -+ /* -+ * No requests pending. However, if the in-service queue is idling -+ * for a new request, or has requests waiting for a completion and -+ * may idle after their completion, then keep it anyway. 
-+ */ -+ if (hrtimer_active(&bfqd->idle_slice_timer) || -+ (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) { -+ bfqq = NULL; -+ goto keep_queue; -+ } -+ -+ reason = BFQ_BFQQ_NO_MORE_REQUESTS; -+expire: -+ bfq_bfqq_expire(bfqd, bfqq, false, reason); -+new_queue: -+ bfqq = bfq_set_in_service_queue(bfqd); -+ if (bfqq) { -+ bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue"); -+ goto check_queue; -+ } -+keep_queue: -+ if (bfqq) -+ bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue"); -+ else -+ bfq_log(bfqd, "select_queue: no queue returned"); -+ -+ return bfqq; -+} -+ -+static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */ -+ BUG_ON(bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && -+ time_is_after_jiffies(bfqq->last_wr_start_finish)); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "raising period dur %u/%u msec, old coeff %u, w %d(%d)", -+ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), -+ jiffies_to_msecs(bfqq->wr_cur_max_time), -+ bfqq->wr_coeff, -+ bfqq->entity.weight, bfqq->entity.orig_weight); -+ -+ BUG_ON(bfqq != bfqd->in_service_queue && entity->weight != -+ entity->orig_weight * bfqq->wr_coeff); -+ if (entity->prio_changed) -+ bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change"); -+ -+ /* -+ * If the queue was activated in a burst, or too much -+ * time has elapsed from the beginning of this -+ * weight-raising period, then end weight raising. 
-+ */ -+ if (bfq_bfqq_in_large_burst(bfqq)) -+ bfq_bfqq_end_wr(bfqq); -+ else if (time_is_before_jiffies(bfqq->last_wr_start_finish + -+ bfqq->wr_cur_max_time)) { -+ if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time || -+ time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt + -+ bfq_wr_duration(bfqd))) -+ bfq_bfqq_end_wr(bfqq); -+ else { -+ /* switch back to interactive wr */ -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ bfqq->last_wr_start_finish = -+ bfqq->wr_start_at_switch_to_srt; -+ BUG_ON(time_is_after_jiffies( -+ bfqq->last_wr_start_finish)); -+ bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqd, bfqq, -+ "back to interactive wr"); -+ } -+ } -+ } -+ /* -+ * To improve latency (for this or other queues), immediately -+ * update weight both if it must be raised and if it must be -+ * lowered. Since, entity may be on some active tree here, and -+ * might have a pending change of its ioprio class, invoke -+ * next function with the last parameter unset (see the -+ * comments on the function). -+ */ -+ if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1)) -+ __bfq_entity_update_weight_prio(bfq_entity_service_tree(entity), -+ entity, false); -+} -+ -+/* -+ * Dispatch one request from bfqq, moving it to the request queue -+ * dispatch list. 
-+ */ -+static int bfq_dispatch_request(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ int dispatched = 0; -+ struct request *rq = bfqq->next_rq; -+ unsigned long service_to_charge; -+ -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ BUG_ON(!rq); -+ service_to_charge = bfq_serv_to_charge(rq, bfqq); -+ -+ BUG_ON(service_to_charge > bfq_bfqq_budget_left(bfqq)); -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ -+ bfq_bfqq_served(bfqq, service_to_charge); -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ -+ bfq_dispatch_insert(bfqd->queue, rq); -+ -+ /* -+ * If weight raising has to terminate for bfqq, then next -+ * function causes an immediate update of bfqq's weight, -+ * without waiting for next activation. As a consequence, on -+ * expiration, bfqq will be timestamped as if has never been -+ * weight-raised during this service slot, even if it has -+ * received part or even most of the service as a -+ * weight-raised queue. This inflates bfqq's timestamps, which -+ * is beneficial, as bfqq is then more willing to leave the -+ * device immediately to possible other weight-raised queues. 
-+ */ -+ bfq_update_wr_data(bfqd, bfqq); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "dispatched %u sec req (%llu), budg left %d", -+ blk_rq_sectors(rq), -+ (unsigned long long) blk_rq_pos(rq), -+ bfq_bfqq_budget_left(bfqq)); -+ -+ dispatched++; -+ -+ if (!bfqd->in_service_bic) { -+ atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); -+ bfqd->in_service_bic = RQ_BIC(rq); -+ } -+ -+ if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq)) -+ goto expire; -+ -+ return dispatched; -+ -+expire: -+ bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_BUDGET_EXHAUSTED); -+ return dispatched; -+} -+ -+static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq) -+{ -+ int dispatched = 0; -+ -+ while (bfqq->next_rq) { -+ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq); -+ dispatched++; -+ } -+ -+ BUG_ON(!list_empty(&bfqq->fifo)); -+ return dispatched; -+} -+ -+/* -+ * Drain our current requests. -+ * Used for barriers and when switching io schedulers on-the-fly. -+ */ -+static int bfq_forced_dispatch(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq, *n; -+ struct bfq_service_tree *st; -+ int dispatched = 0; -+ -+ bfqq = bfqd->in_service_queue; -+ if (bfqq) -+ __bfq_bfqq_expire(bfqd, bfqq); -+ -+ /* -+ * Loop through classes, and be careful to leave the scheduler -+ * in a consistent state, as feedback mechanisms and vtime -+ * updates cannot be disabled during the process. 
-+ */ -+ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) { -+ st = bfq_entity_service_tree(&bfqq->entity); -+ -+ dispatched += __bfq_forced_dispatch_bfqq(bfqq); -+ -+ bfqq->max_budget = bfq_max_budget(bfqd); -+ bfq_forget_idle(st); -+ } -+ -+ BUG_ON(bfqd->busy_queues != 0); -+ -+ return dispatched; -+} -+ -+static int bfq_dispatch_requests(struct request_queue *q, int force) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct bfq_queue *bfqq; -+ -+ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); -+ -+ if (bfqd->busy_queues == 0) -+ return 0; -+ -+ if (unlikely(force)) -+ return bfq_forced_dispatch(bfqd); -+ -+ /* -+ * Force device to serve one request at a time if -+ * strict_guarantees is true. Forcing this service scheme is -+ * currently the ONLY way to guarantee that the request -+ * service order enforced by the scheduler is respected by a -+ * queueing device. Otherwise the device is free even to make -+ * some unlucky request wait for as long as the device -+ * wishes. -+ * -+ * Of course, serving one request at at time may cause loss of -+ * throughput. -+ */ -+ if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0) -+ return 0; -+ -+ bfqq = bfq_select_queue(bfqd); -+ if (!bfqq) -+ return 0; -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ -+ BUG_ON(bfq_bfqq_wait_request(bfqq)); -+ -+ if (!bfq_dispatch_request(bfqd, bfqq)) -+ return 0; -+ -+ bfq_log_bfqq(bfqd, bfqq, "dispatched %s request", -+ bfq_bfqq_sync(bfqq) ? "sync" : "async"); -+ -+ BUG_ON(bfqq->next_rq == NULL && -+ bfqq->entity.budget < bfqq->entity.service); -+ return 1; -+} -+ -+/* -+ * Task holds one reference to the queue, dropped when task exits. Each rq -+ * in-flight on this queue also holds a reference, dropped when rq is freed. -+ * -+ * Queue lock must be held here. Recall not to use bfqq after calling -+ * this function on it. 
-+ */ -+static void bfq_put_queue(struct bfq_queue *bfqq) -+{ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ struct bfq_group *bfqg = bfqq_group(bfqq); -+#endif -+ -+ BUG_ON(bfqq->ref <= 0); -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref); -+ bfqq->ref--; -+ if (bfqq->ref) -+ return; -+ -+ BUG_ON(rb_first(&bfqq->sort_list)); -+ BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); -+ BUG_ON(bfqq->entity.tree); -+ BUG_ON(bfq_bfqq_busy(bfqq)); -+ -+ if (bfq_bfqq_sync(bfqq)) -+ /* -+ * The fact that this queue is being destroyed does not -+ * invalidate the fact that this queue may have been -+ * activated during the current burst. As a consequence, -+ * although the queue does not exist anymore, and hence -+ * needs to be removed from the burst list if there, -+ * the burst size has not to be decremented. -+ */ -+ hlist_del_init(&bfqq->burst_list_node); -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); -+ -+ kmem_cache_free(bfq_pool, bfqq); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ bfqg_put(bfqg); -+#endif -+} -+ -+static void bfq_put_cooperator(struct bfq_queue *bfqq) -+{ -+ struct bfq_queue *__bfqq, *next; -+ -+ /* -+ * If this queue was scheduled to merge with another queue, be -+ * sure to drop the reference taken on that queue (and others in -+ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs. 
-+ */ -+ __bfqq = bfqq->new_bfqq; -+ while (__bfqq) { -+ if (__bfqq == bfqq) -+ break; -+ next = __bfqq->new_bfqq; -+ bfq_put_queue(__bfqq); -+ __bfqq = next; -+ } -+} -+ -+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ if (bfqq == bfqd->in_service_queue) { -+ __bfq_bfqq_expire(bfqd, bfqq); -+ bfq_schedule_dispatch(bfqd); -+ } -+ -+ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref); -+ -+ bfq_put_cooperator(bfqq); -+ -+ bfq_put_queue(bfqq); /* release process reference */ -+} -+ -+static void bfq_init_icq(struct io_cq *icq) -+{ -+ icq_to_bic(icq)->ttime.last_end_request = ktime_get_ns() - (1ULL<<32); -+} -+ -+static void bfq_exit_icq(struct io_cq *icq) -+{ -+ struct bfq_io_cq *bic = icq_to_bic(icq); -+ struct bfq_data *bfqd = bic_to_bfqd(bic); -+ -+ if (bic_to_bfqq(bic, false)) { -+ bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, false)); -+ bic_set_bfqq(bic, NULL, false); -+ } -+ -+ if (bic_to_bfqq(bic, true)) { -+ /* -+ * If the bic is using a shared queue, put the reference -+ * taken on the io_context when the bic started using a -+ * shared bfq_queue. -+ */ -+ if (bfq_bfqq_coop(bic_to_bfqq(bic, true))) -+ put_io_context(icq->ioc); -+ bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, true)); -+ bic_set_bfqq(bic, NULL, true); -+ } -+} -+ -+/* -+ * Update the entity prio values; note that the new values will not -+ * be used until the next (re)activation. -+ */ -+static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, -+ struct bfq_io_cq *bic) -+{ -+ struct task_struct *tsk = current; -+ int ioprio_class; -+ -+ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); -+ switch (ioprio_class) { -+ default: -+ dev_err(bfqq->bfqd->queue->backing_dev_info->dev, -+ "bfq: bad prio class %d\n", ioprio_class); -+ case IOPRIO_CLASS_NONE: -+ /* -+ * No prio set, inherit CPU scheduling settings. 
-+ */ -+ bfqq->new_ioprio = task_nice_ioprio(tsk); -+ bfqq->new_ioprio_class = task_nice_ioclass(tsk); -+ break; -+ case IOPRIO_CLASS_RT: -+ bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); -+ bfqq->new_ioprio_class = IOPRIO_CLASS_RT; -+ break; -+ case IOPRIO_CLASS_BE: -+ bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); -+ bfqq->new_ioprio_class = IOPRIO_CLASS_BE; -+ break; -+ case IOPRIO_CLASS_IDLE: -+ bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE; -+ bfqq->new_ioprio = 7; -+ break; -+ } -+ -+ if (bfqq->new_ioprio >= IOPRIO_BE_NR) { -+ pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n", -+ bfqq->new_ioprio); -+ BUG(); -+ } -+ -+ bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); -+ bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "set_next_ioprio_data: bic_class %d prio %d class %d", -+ ioprio_class, bfqq->new_ioprio, bfqq->new_ioprio_class); -+} -+ -+static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) -+{ -+ struct bfq_data *bfqd = bic_to_bfqd(bic); -+ struct bfq_queue *bfqq; -+ unsigned long uninitialized_var(flags); -+ int ioprio = bic->icq.ioc->ioprio; -+ -+ /* -+ * This condition may trigger on a newly created bic, be sure to -+ * drop the lock before returning. 
-+ */ -+ if (unlikely(!bfqd) || likely(bic->ioprio == ioprio)) -+ return; -+ -+ bic->ioprio = ioprio; -+ -+ bfqq = bic_to_bfqq(bic, false); -+ if (bfqq) { -+ /* release process reference on this queue */ -+ bfq_put_queue(bfqq); -+ bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); -+ bic_set_bfqq(bic, bfqq, false); -+ bfq_log_bfqq(bfqd, bfqq, -+ "check_ioprio_change: bfqq %p %d", -+ bfqq, bfqq->ref); -+ } -+ -+ bfqq = bic_to_bfqq(bic, true); -+ if (bfqq) -+ bfq_set_next_ioprio_data(bfqq, bic); -+} -+ -+static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct bfq_io_cq *bic, pid_t pid, int is_sync) -+{ -+ RB_CLEAR_NODE(&bfqq->entity.rb_node); -+ INIT_LIST_HEAD(&bfqq->fifo); -+ INIT_HLIST_NODE(&bfqq->burst_list_node); -+ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); -+ -+ bfqq->ref = 0; -+ bfqq->bfqd = bfqd; -+ -+ if (bic) -+ bfq_set_next_ioprio_data(bfqq, bic); -+ -+ if (is_sync) { -+ /* -+ * No need to mark as has_short_ttime if in -+ * idle_class, because no device idling is performed -+ * for queues in idle class -+ */ -+ if (!bfq_class_idle(bfqq)) -+ /* tentatively mark as has_short_ttime */ -+ bfq_mark_bfqq_has_short_ttime(bfqq); -+ bfq_mark_bfqq_sync(bfqq); -+ bfq_mark_bfqq_just_created(bfqq); -+ } else -+ bfq_clear_bfqq_sync(bfqq); -+ bfq_mark_bfqq_IO_bound(bfqq); -+ -+ /* Tentative initial value to trade off between thr and lat */ -+ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3; -+ bfqq->pid = pid; -+ -+ bfqq->wr_coeff = 1; -+ bfqq->last_wr_start_finish = jiffies; -+ bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now(); -+ bfqq->budget_timeout = bfq_smallest_from_now(); -+ bfqq->split_time = bfq_smallest_from_now(); -+ -+ /* -+ * Set to the value for which bfqq will not be deemed as -+ * soft rt when it becomes backlogged. 
-+ */ -+ bfqq->soft_rt_next_start = bfq_greatest_from_now(); -+ -+ /* first request is almost certainly seeky */ -+ bfqq->seek_history = 1; -+} -+ -+static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, -+ struct bfq_group *bfqg, -+ int ioprio_class, int ioprio) -+{ -+ switch (ioprio_class) { -+ case IOPRIO_CLASS_RT: -+ return &bfqg->async_bfqq[0][ioprio]; -+ case IOPRIO_CLASS_NONE: -+ ioprio = IOPRIO_NORM; -+ /* fall through */ -+ case IOPRIO_CLASS_BE: -+ return &bfqg->async_bfqq[1][ioprio]; -+ case IOPRIO_CLASS_IDLE: -+ return &bfqg->async_idle_bfqq; -+ default: -+ BUG(); -+ } -+} -+ -+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, -+ struct bio *bio, bool is_sync, -+ struct bfq_io_cq *bic) -+{ -+ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); -+ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); -+ struct bfq_queue **async_bfqq = NULL; -+ struct bfq_queue *bfqq; -+ struct bfq_group *bfqg; -+ -+ rcu_read_lock(); -+ -+ bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio)); -+ if (!bfqg) { -+ bfqq = &bfqd->oom_bfqq; -+ goto out; -+ } -+ -+ if (!is_sync) { -+ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, -+ ioprio); -+ bfqq = *async_bfqq; -+ if (bfqq) -+ goto out; -+ } -+ -+ bfqq = kmem_cache_alloc_node(bfq_pool, -+ GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN, -+ bfqd->queue->node); -+ -+ if (bfqq) { -+ bfq_init_bfqq(bfqd, bfqq, bic, current->pid, -+ is_sync); -+ bfq_init_entity(&bfqq->entity, bfqg); -+ bfq_log_bfqq(bfqd, bfqq, "allocated"); -+ } else { -+ bfqq = &bfqd->oom_bfqq; -+ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); -+ goto out; -+ } -+ -+ /* -+ * Pin the queue now that it's allocated, scheduler exit will -+ * prune it. -+ */ -+ if (async_bfqq) { -+ bfqq->ref++; /* -+ * Extra group reference, w.r.t. sync -+ * queue. This extra reference is removed -+ * only if bfqq->bfqg disappears, to -+ * guarantee that this queue is not freed -+ * until its group goes away. 
-+ */ -+ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", -+ bfqq, bfqq->ref); -+ *async_bfqq = bfqq; -+ } -+ -+out: -+ bfqq->ref++; /* get a process reference to this queue */ -+ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref); -+ rcu_read_unlock(); -+ return bfqq; -+} -+ -+static void bfq_update_io_thinktime(struct bfq_data *bfqd, -+ struct bfq_io_cq *bic) -+{ -+ struct bfq_ttime *ttime = &bic->ttime; -+ u64 elapsed = ktime_get_ns() - bic->ttime.last_end_request; -+ -+ elapsed = min_t(u64, elapsed, 2 * bfqd->bfq_slice_idle); -+ -+ ttime->ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; -+ ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8); -+ ttime->ttime_mean = div64_ul(ttime->ttime_total + 128, -+ ttime->ttime_samples); -+} -+ -+static void -+bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct request *rq) -+{ -+ bfqq->seek_history <<= 1; -+ bfqq->seek_history |= -+ get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR && -+ (!blk_queue_nonrot(bfqd->queue) || -+ blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT); -+} -+ -+static void bfq_update_has_short_ttime(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ struct bfq_io_cq *bic) -+{ -+ bool has_short_ttime = true; -+ -+ /* -+ * No need to update has_short_ttime if bfqq is async or in -+ * idle io prio class, or if bfq_slice_idle is zero, because -+ * no device idling is performed for bfqq in this case. -+ */ -+ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq) || -+ bfqd->bfq_slice_idle == 0) -+ return; -+ -+ /* Idle window just restored, statistics are meaningless. */ -+ if (time_is_after_eq_jiffies(bfqq->split_time + -+ bfqd->bfq_wr_min_idle_time)) -+ return; -+ -+ /* Think time is infinite if no process is linked to -+ * bfqq. 
Otherwise check average think time to -+ * decide whether to mark as has_short_ttime -+ */ -+ if (atomic_read(&bic->icq.ioc->active_ref) == 0 || -+ (bfq_sample_valid(bic->ttime.ttime_samples) && -+ bic->ttime.ttime_mean > bfqd->bfq_slice_idle)) -+ has_short_ttime = false; -+ -+ bfq_log_bfqq(bfqd, bfqq, "update_has_short_ttime: has_short_ttime %d", -+ has_short_ttime); -+ -+ if (has_short_ttime) -+ bfq_mark_bfqq_has_short_ttime(bfqq); -+ else -+ bfq_clear_bfqq_has_short_ttime(bfqq); -+} -+ -+/* -+ * Called when a new fs request (rq) is added to bfqq. Check if there's -+ * something we should do about it. -+ */ -+static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct request *rq) -+{ -+ struct bfq_io_cq *bic = RQ_BIC(rq); -+ -+ if (rq->cmd_flags & REQ_META) -+ bfqq->meta_pending++; -+ -+ bfq_update_io_thinktime(bfqd, bic); -+ bfq_update_has_short_ttime(bfqd, bfqq, bic); -+ bfq_update_io_seektime(bfqd, bfqq, rq); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "rq_enqueued: has_short_ttime=%d (seeky %d)", -+ bfq_bfqq_has_short_ttime(bfqq), BFQQ_SEEKY(bfqq)); -+ -+ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); -+ -+ if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) { -+ bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 && -+ blk_rq_sectors(rq) < 32; -+ bool budget_timeout = bfq_bfqq_budget_timeout(bfqq); -+ -+ /* -+ * There is just this request queued: if the request -+ * is small and the queue is not to be expired, then -+ * just exit. -+ * -+ * In this way, if the device is being idled to wait -+ * for a new request from the in-service queue, we -+ * avoid unplugging the device and committing the -+ * device to serve just a small request. On the -+ * contrary, we wait for the block layer to decide -+ * when to unplug the device: hopefully, new requests -+ * will be merged to this one quickly, then the device -+ * will be unplugged and larger requests will be -+ * dispatched. 
-+ */ -+ if (small_req && !budget_timeout) -+ return; -+ -+ /* -+ * A large enough request arrived, or the queue is to -+ * be expired: in both cases disk idling is to be -+ * stopped, so clear wait_request flag and reset -+ * timer. -+ */ -+ bfq_clear_bfqq_wait_request(bfqq); -+ hrtimer_try_to_cancel(&bfqd->idle_slice_timer); -+ bfqg_stats_update_idle_time(bfqq_group(bfqq)); -+ -+ /* -+ * The queue is not empty, because a new request just -+ * arrived. Hence we can safely expire the queue, in -+ * case of budget timeout, without risking that the -+ * timestamps of the queue are not updated correctly. -+ * See [1] for more details. -+ */ -+ if (budget_timeout) -+ bfq_bfqq_expire(bfqd, bfqq, false, -+ BFQ_BFQQ_BUDGET_TIMEOUT); -+ -+ /* -+ * Let the request rip immediately, or let a new queue be -+ * selected if bfqq has just been expired. -+ */ -+ __blk_run_queue(bfqd->queue); -+ } -+} -+ -+static void bfq_insert_request(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; -+ -+ assert_spin_locked(bfqd->queue->queue_lock); -+ -+ /* -+ * An unplug may trigger a requeue of a request from the device -+ * driver: make sure we are in process context while trying to -+ * merge two bfq_queues. -+ */ -+ if (!in_interrupt()) { -+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true); -+ if (new_bfqq) { -+ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq) -+ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1); -+ /* -+ * Release the request's reference to the old bfqq -+ * and make sure one is taken to the shared queue. 
-+ */ -+ new_bfqq->allocated[rq_data_dir(rq)]++; -+ bfqq->allocated[rq_data_dir(rq)]--; -+ new_bfqq->ref++; -+ bfq_clear_bfqq_just_created(bfqq); -+ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) -+ bfq_merge_bfqqs(bfqd, RQ_BIC(rq), -+ bfqq, new_bfqq); -+ /* -+ * rq is about to be enqueued into new_bfqq, -+ * release rq reference on bfqq -+ */ -+ bfq_put_queue(bfqq); -+ rq->elv.priv[1] = new_bfqq; -+ bfqq = new_bfqq; -+ } -+ } -+ -+ bfq_add_request(rq); -+ -+ rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; -+ list_add_tail(&rq->queuelist, &bfqq->fifo); -+ -+ bfq_rq_enqueued(bfqd, bfqq, rq); -+} -+ -+static void bfq_update_hw_tag(struct bfq_data *bfqd) -+{ -+ bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver, -+ bfqd->rq_in_driver); -+ -+ if (bfqd->hw_tag == 1) -+ return; -+ -+ /* -+ * This sample is valid if the number of outstanding requests -+ * is large enough to allow a queueing behavior. Note that the -+ * sum is not exact, as it's not taking into account deactivated -+ * requests. 
-+ */ -+ if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD) -+ return; -+ -+ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) -+ return; -+ -+ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD; -+ bfqd->max_rq_in_driver = 0; -+ bfqd->hw_tag_samples = 0; -+} -+ -+static void bfq_completed_request(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ struct bfq_data *bfqd = bfqq->bfqd; -+ u64 now_ns; -+ u32 delta_us; -+ -+ bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left", -+ blk_rq_sectors(rq)); -+ -+ assert_spin_locked(bfqd->queue->queue_lock); -+ bfq_update_hw_tag(bfqd); -+ -+ BUG_ON(!bfqd->rq_in_driver); -+ BUG_ON(!bfqq->dispatched); -+ bfqd->rq_in_driver--; -+ bfqq->dispatched--; -+ bfqg_stats_update_completion(bfqq_group(bfqq), -+ rq_start_time_ns(rq), -+ rq_io_start_time_ns(rq), -+ rq->cmd_flags); -+ -+ if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); -+ /* -+ * Set budget_timeout (which we overload to store the -+ * time at which the queue remains with no backlog and -+ * no outstanding request; used by the weight-raising -+ * mechanism). -+ */ -+ bfqq->budget_timeout = jiffies; -+ -+ bfq_weights_tree_remove(bfqd, &bfqq->entity, -+ &bfqd->queue_weights_tree); -+ } -+ -+ now_ns = ktime_get_ns(); -+ -+ RQ_BIC(rq)->ttime.last_end_request = now_ns; -+ -+ /* -+ * Using us instead of ns, to get a reasonable precision in -+ * computing rate in next check. 
-+ */ -+ delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC); -+ -+ bfq_log(bfqd, "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu", -+ delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size, -+ (USEC_PER_SEC* -+ (u64)((bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us)) -+ >>BFQ_RATE_SHIFT, -+ (USEC_PER_SEC*(u64)(1UL<<(BFQ_RATE_SHIFT-10)))>>BFQ_RATE_SHIFT); -+ -+ /* -+ * If the request took rather long to complete, and, according -+ * to the maximum request size recorded, this completion latency -+ * implies that the request was certainly served at a very low -+ * rate (less than 1M sectors/sec), then the whole observation -+ * interval that lasts up to this time instant cannot be a -+ * valid time interval for computing a new peak rate. Invoke -+ * bfq_update_rate_reset to have the following three steps -+ * taken: -+ * - close the observation interval at the last (previous) -+ * request dispatch or completion -+ * - compute rate, if possible, for that observation interval -+ * - reset to zero samples, which will trigger a proper -+ * re-initialization of the observation interval on next -+ * dispatch -+ */ -+ if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC && -+ (bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us < -+ 1UL<<(BFQ_RATE_SHIFT - 10)) -+ bfq_update_rate_reset(bfqd, NULL); -+ bfqd->last_completion = now_ns; -+ -+ /* -+ * If we are waiting to discover whether the request pattern -+ * of the task associated with the queue is actually -+ * isochronous, and both requisites for this condition to hold -+ * are now satisfied, then compute soft_rt_next_start (see the -+ * comments on the function bfq_bfqq_softrt_next_start()). We -+ * schedule this delayed check when bfqq expires, if it still -+ * has in-flight requests. 
-+ */ -+ if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 && -+ RB_EMPTY_ROOT(&bfqq->sort_list)) -+ bfqq->soft_rt_next_start = -+ bfq_bfqq_softrt_next_start(bfqd, bfqq); -+ -+ /* -+ * If this is the in-service queue, check if it needs to be expired, -+ * or if we want to idle in case it has no pending requests. -+ */ -+ if (bfqd->in_service_queue == bfqq) { -+ if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) { -+ bfq_arm_slice_timer(bfqd); -+ goto out; -+ } else if (bfq_may_expire_for_budg_timeout(bfqq)) -+ bfq_bfqq_expire(bfqd, bfqq, false, -+ BFQ_BFQQ_BUDGET_TIMEOUT); -+ else if (RB_EMPTY_ROOT(&bfqq->sort_list) && -+ (bfqq->dispatched == 0 || -+ !bfq_bfqq_may_idle(bfqq))) -+ bfq_bfqq_expire(bfqd, bfqq, false, -+ BFQ_BFQQ_NO_MORE_REQUESTS); -+ } -+ -+ if (!bfqd->rq_in_driver) -+ bfq_schedule_dispatch(bfqd); -+ -+out: -+ return; -+} -+ -+static int __bfq_may_queue(struct bfq_queue *bfqq) -+{ -+ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) { -+ bfq_clear_bfqq_must_alloc(bfqq); -+ return ELV_MQUEUE_MUST; -+ } -+ -+ return ELV_MQUEUE_MAY; -+} -+ -+static int bfq_may_queue(struct request_queue *q, unsigned int op) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct task_struct *tsk = current; -+ struct bfq_io_cq *bic; -+ struct bfq_queue *bfqq; -+ -+ /* -+ * Don't force setup of a queue from here, as a call to may_queue -+ * does not necessarily imply that a request actually will be -+ * queued. So just lookup a possibly existing queue, or return -+ * 'may queue' if that fails. -+ */ -+ bic = bfq_bic_lookup(bfqd, tsk->io_context); -+ if (!bic) -+ return ELV_MQUEUE_MAY; -+ -+ bfqq = bic_to_bfqq(bic, op_is_sync(op)); -+ if (bfqq) -+ return __bfq_may_queue(bfqq); -+ -+ return ELV_MQUEUE_MAY; -+} -+ -+/* -+ * Queue lock held here. 
-+ */ -+static void bfq_put_request(struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ -+ if (bfqq) { -+ const int rw = rq_data_dir(rq); -+ -+ BUG_ON(!bfqq->allocated[rw]); -+ bfqq->allocated[rw]--; -+ -+ rq->elv.priv[0] = NULL; -+ rq->elv.priv[1] = NULL; -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", -+ bfqq, bfqq->ref); -+ bfq_put_queue(bfqq); -+ } -+} -+ -+/* -+ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this -+ * was the last process referring to that bfqq. -+ */ -+static struct bfq_queue * -+bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) -+{ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); -+ -+ put_io_context(bic->icq.ioc); -+ -+ if (bfqq_process_refs(bfqq) == 1) { -+ bfqq->pid = current->pid; -+ bfq_clear_bfqq_coop(bfqq); -+ bfq_clear_bfqq_split_coop(bfqq); -+ return bfqq; -+ } -+ -+ bic_set_bfqq(bic, NULL, 1); -+ -+ bfq_put_cooperator(bfqq); -+ -+ bfq_put_queue(bfqq); -+ return NULL; -+} -+ -+/* -+ * Allocate bfq data structures associated with this request. 
-+ */ -+static int bfq_set_request(struct request_queue *q, struct request *rq, -+ struct bio *bio, gfp_t gfp_mask) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq); -+ const int rw = rq_data_dir(rq); -+ const int is_sync = rq_is_sync(rq); -+ struct bfq_queue *bfqq; -+ unsigned long flags; -+ bool bfqq_already_existing = false, split = false; -+ -+ spin_lock_irqsave(q->queue_lock, flags); -+ -+ if (!bic) -+ goto queue_fail; -+ -+ bfq_check_ioprio_change(bic, bio); -+ -+ bfq_bic_update_cgroup(bic, bio); -+ -+new_queue: -+ bfqq = bic_to_bfqq(bic, is_sync); -+ if (!bfqq || bfqq == &bfqd->oom_bfqq) { -+ if (bfqq) -+ bfq_put_queue(bfqq); -+ bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); -+ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); -+ -+ bic_set_bfqq(bic, bfqq, is_sync); -+ if (split && is_sync) { -+ bfq_log_bfqq(bfqd, bfqq, -+ "set_request: was_in_list %d " -+ "was_in_large_burst %d " -+ "large burst in progress %d", -+ bic->was_in_burst_list, -+ bic->saved_in_large_burst, -+ bfqd->large_burst); -+ -+ if ((bic->was_in_burst_list && bfqd->large_burst) || -+ bic->saved_in_large_burst) { -+ bfq_log_bfqq(bfqd, bfqq, -+ "set_request: marking in " -+ "large burst"); -+ bfq_mark_bfqq_in_large_burst(bfqq); -+ } else { -+ bfq_log_bfqq(bfqd, bfqq, -+ "set_request: clearing in " -+ "large burst"); -+ bfq_clear_bfqq_in_large_burst(bfqq); -+ if (bic->was_in_burst_list) -+ hlist_add_head(&bfqq->burst_list_node, -+ &bfqd->burst_list); -+ } -+ bfqq->split_time = jiffies; -+ } -+ } else { -+ /* If the queue was seeky for too long, break it apart. 
*/ -+ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { -+ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); -+ -+ /* Update bic before losing reference to bfqq */ -+ if (bfq_bfqq_in_large_burst(bfqq)) -+ bic->saved_in_large_burst = true; -+ -+ bfqq = bfq_split_bfqq(bic, bfqq); -+ split = true; -+ if (!bfqq) -+ goto new_queue; -+ else -+ bfqq_already_existing = true; -+ } -+ } -+ -+ bfqq->allocated[rw]++; -+ bfqq->ref++; -+ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, bfqq->ref); -+ -+ rq->elv.priv[0] = bic; -+ rq->elv.priv[1] = bfqq; -+ -+ /* -+ * If a bfq_queue has only one process reference, it is owned -+ * by only one bfq_io_cq: we can set the bic field of the -+ * bfq_queue to the address of that structure. Also, if the -+ * queue has just been split, mark a flag so that the -+ * information is available to the other scheduler hooks. -+ */ -+ if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { -+ bfqq->bic = bic; -+ if (split) { -+ /* -+ * If the queue has just been split from a shared -+ * queue, restore the idle window and the possible -+ * weight raising period. -+ */ -+ bfq_bfqq_resume_state(bfqq, bfqd, bic, -+ bfqq_already_existing); -+ } -+ } -+ -+ if (unlikely(bfq_bfqq_just_created(bfqq))) -+ bfq_handle_burst(bfqd, bfqq); -+ -+ spin_unlock_irqrestore(q->queue_lock, flags); -+ -+ return 0; -+ -+queue_fail: -+ bfq_schedule_dispatch(bfqd); -+ spin_unlock_irqrestore(q->queue_lock, flags); -+ -+ return 1; -+} -+ -+static void bfq_kick_queue(struct work_struct *work) -+{ -+ struct bfq_data *bfqd = -+ container_of(work, struct bfq_data, unplug_work); -+ struct request_queue *q = bfqd->queue; -+ -+ spin_lock_irq(q->queue_lock); -+ __blk_run_queue(q); -+ spin_unlock_irq(q->queue_lock); -+} -+ -+/* -+ * Handler of the expiration of the timer running if the in-service queue -+ * is idling inside its time slice. 
-+ */ -+static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) -+{ -+ struct bfq_data *bfqd = container_of(timer, struct bfq_data, -+ idle_slice_timer); -+ struct bfq_queue *bfqq; -+ unsigned long flags; -+ enum bfqq_expiration reason; -+ -+ spin_lock_irqsave(bfqd->queue->queue_lock, flags); -+ -+ bfqq = bfqd->in_service_queue; -+ /* -+ * Theoretical race here: the in-service queue can be NULL or -+ * different from the queue that was idling if the timer handler -+ * spins on the queue_lock and a new request arrives for the -+ * current queue and there is a full dispatch cycle that changes -+ * the in-service queue. This can hardly happen, but in the worst -+ * case we just expire a queue too early. -+ */ -+ if (bfqq) { -+ bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); -+ bfq_clear_bfqq_wait_request(bfqq); -+ -+ if (bfq_bfqq_budget_timeout(bfqq)) -+ /* -+ * Also here the queue can be safely expired -+ * for budget timeout without wasting -+ * guarantees -+ */ -+ reason = BFQ_BFQQ_BUDGET_TIMEOUT; -+ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) -+ /* -+ * The queue may not be empty upon timer expiration, -+ * because we may not disable the timer when the -+ * first request of the in-service queue arrives -+ * during disk idling. 
-+ */ -+ reason = BFQ_BFQQ_TOO_IDLE; -+ else -+ goto schedule_dispatch; -+ -+ bfq_bfqq_expire(bfqd, bfqq, true, reason); -+ } -+ -+schedule_dispatch: -+ bfq_schedule_dispatch(bfqd); -+ -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); -+ return HRTIMER_NORESTART; -+} -+ -+static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) -+{ -+ hrtimer_cancel(&bfqd->idle_slice_timer); -+ cancel_work_sync(&bfqd->unplug_work); -+} -+ -+static void __bfq_put_async_bfqq(struct bfq_data *bfqd, -+ struct bfq_queue **bfqq_ptr) -+{ -+ struct bfq_group *root_group = bfqd->root_group; -+ struct bfq_queue *bfqq = *bfqq_ptr; -+ -+ bfq_log(bfqd, "put_async_bfqq: %p", bfqq); -+ if (bfqq) { -+ bfq_bfqq_move(bfqd, bfqq, root_group); -+ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", -+ bfqq, bfqq->ref); -+ bfq_put_queue(bfqq); -+ *bfqq_ptr = NULL; -+ } -+} -+ -+/* -+ * Release all the bfqg references to its async queues. If we are -+ * deallocating the group these queues may still contain requests, so -+ * we reparent them to the root cgroup (i.e., the only one that will -+ * exist for sure until all the requests on a device are gone). 
-+ */ -+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) -+{ -+ int i, j; -+ -+ for (i = 0; i < 2; i++) -+ for (j = 0; j < IOPRIO_BE_NR; j++) -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); -+ -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); -+} -+ -+static void bfq_exit_queue(struct elevator_queue *e) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ struct request_queue *q = bfqd->queue; -+ struct bfq_queue *bfqq, *n; -+ -+ bfq_shutdown_timer_wq(bfqd); -+ -+ spin_lock_irq(q->queue_lock); -+ -+ BUG_ON(bfqd->in_service_queue); -+ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) -+ bfq_deactivate_bfqq(bfqd, bfqq, false, false); -+ -+ spin_unlock_irq(q->queue_lock); -+ -+ bfq_shutdown_timer_wq(bfqd); -+ -+ BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ blkcg_deactivate_policy(q, &blkcg_policy_bfq); -+#else -+ bfq_put_async_queues(bfqd, bfqd->root_group); -+ kfree(bfqd->root_group); -+#endif -+ -+ kfree(bfqd); -+} -+ -+static void bfq_init_root_group(struct bfq_group *root_group, -+ struct bfq_data *bfqd) -+{ -+ int i; -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ root_group->entity.parent = NULL; -+ root_group->my_entity = NULL; -+ root_group->bfqd = bfqd; -+#endif -+ root_group->rq_pos_tree = RB_ROOT; -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) -+ root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; -+ root_group->sched_data.bfq_class_idle_last_service = jiffies; -+} -+ -+static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) -+{ -+ struct bfq_data *bfqd; -+ struct elevator_queue *eq; -+ -+ eq = elevator_alloc(q, e); -+ if (!eq) -+ return -ENOMEM; -+ -+ bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node); -+ if (!bfqd) { -+ kobject_put(&eq->kobj); -+ return -ENOMEM; -+ } -+ eq->elevator_data = bfqd; -+ -+ /* -+ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. 
-+ * Grab a permanent reference to it, so that the normal code flow -+ * will not attempt to free it. -+ */ -+ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); -+ bfqd->oom_bfqq.ref++; -+ bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; -+ bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE; -+ bfqd->oom_bfqq.entity.new_weight = -+ bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio); -+ -+ /* oom_bfqq does not participate to bursts */ -+ bfq_clear_bfqq_just_created(&bfqd->oom_bfqq); -+ /* -+ * Trigger weight initialization, according to ioprio, at the -+ * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio -+ * class won't be changed any more. -+ */ -+ bfqd->oom_bfqq.entity.prio_changed = 1; -+ -+ bfqd->queue = q; -+ -+ spin_lock_irq(q->queue_lock); -+ q->elevator = eq; -+ spin_unlock_irq(q->queue_lock); -+ -+ bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node); -+ if (!bfqd->root_group) -+ goto out_free; -+ bfq_init_root_group(bfqd->root_group, bfqd); -+ bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); -+ -+ hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC, -+ HRTIMER_MODE_REL); -+ bfqd->idle_slice_timer.function = bfq_idle_slice_timer; -+ -+ bfqd->queue_weights_tree = RB_ROOT; -+ bfqd->group_weights_tree = RB_ROOT; -+ -+ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue); -+ -+ INIT_LIST_HEAD(&bfqd->active_list); -+ INIT_LIST_HEAD(&bfqd->idle_list); -+ INIT_HLIST_HEAD(&bfqd->burst_list); -+ -+ bfqd->hw_tag = -1; -+ -+ bfqd->bfq_max_budget = bfq_default_max_budget; -+ -+ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0]; -+ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1]; -+ bfqd->bfq_back_max = bfq_back_max; -+ bfqd->bfq_back_penalty = bfq_back_penalty; -+ bfqd->bfq_slice_idle = bfq_slice_idle; -+ bfqd->bfq_timeout = bfq_timeout; -+ -+ bfqd->bfq_requests_within_timer = 120; -+ -+ bfqd->bfq_large_burst_thresh = 8; -+ bfqd->bfq_burst_interval = msecs_to_jiffies(180); -+ -+ bfqd->low_latency = true; -+ -+ /* -+ * Trade-off between 
responsiveness and fairness. -+ */ -+ bfqd->bfq_wr_coeff = 30; -+ bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300); -+ bfqd->bfq_wr_max_time = 0; -+ bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000); -+ bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500); -+ bfqd->bfq_wr_max_softrt_rate = 7000; /* -+ * Approximate rate required -+ * to playback or record a -+ * high-definition compressed -+ * video. -+ */ -+ bfqd->wr_busy_queues = 0; -+ -+ /* -+ * Begin by assuming, optimistically, that the device is a -+ * high-speed one, and that its peak rate is equal to 2/3 of -+ * the highest reference rate. -+ */ -+ bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] * -+ T_fast[blk_queue_nonrot(bfqd->queue)]; -+ bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3; -+ bfqd->device_speed = BFQ_BFQD_FAST; -+ -+ return 0; -+ -+out_free: -+ kfree(bfqd); -+ kobject_put(&eq->kobj); -+ return -ENOMEM; -+} -+ -+static void bfq_slab_kill(void) -+{ -+ kmem_cache_destroy(bfq_pool); -+} -+ -+static int __init bfq_slab_setup(void) -+{ -+ bfq_pool = KMEM_CACHE(bfq_queue, 0); -+ if (!bfq_pool) -+ return -ENOMEM; -+ return 0; -+} -+ -+static ssize_t bfq_var_show(unsigned int var, char *page) -+{ -+ return sprintf(page, "%u\n", var); -+} -+ -+static ssize_t bfq_var_store(unsigned long *var, const char *page, -+ size_t count) -+{ -+ unsigned long new_val; -+ int ret = kstrtoul(page, 10, &new_val); -+ -+ if (ret == 0) -+ *var = new_val; -+ -+ return count; -+} -+ -+static ssize_t bfq_wr_max_time_show(struct elevator_queue *e, char *page) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ -+ return sprintf(page, "%d\n", bfqd->bfq_wr_max_time > 0 ? 
-+ jiffies_to_msecs(bfqd->bfq_wr_max_time) : -+ jiffies_to_msecs(bfq_wr_duration(bfqd))); -+} -+ -+static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) -+{ -+ struct bfq_queue *bfqq; -+ struct bfq_data *bfqd = e->elevator_data; -+ ssize_t num_char = 0; -+ -+ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n", -+ bfqd->queued); -+ -+ spin_lock_irq(bfqd->queue->queue_lock); -+ -+ num_char += sprintf(page + num_char, "Active:\n"); -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) { -+ num_char += sprintf(page + num_char, -+ "pid%d: weight %hu, nr_queued %d %d, ", -+ bfqq->pid, -+ bfqq->entity.weight, -+ bfqq->queued[0], -+ bfqq->queued[1]); -+ num_char += sprintf(page + num_char, -+ "dur %d/%u\n", -+ jiffies_to_msecs( -+ jiffies - -+ bfqq->last_wr_start_finish), -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } -+ -+ num_char += sprintf(page + num_char, "Idle:\n"); -+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) { -+ num_char += sprintf(page + num_char, -+ "pid%d: weight %hu, dur %d/%u\n", -+ bfqq->pid, -+ bfqq->entity.weight, -+ jiffies_to_msecs(jiffies - -+ bfqq->last_wr_start_finish), -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } -+ -+ spin_unlock_irq(bfqd->queue->queue_lock); -+ -+ return num_char; -+} -+ -+#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ -+static ssize_t __FUNC(struct elevator_queue *e, char *page) \ -+{ \ -+ struct bfq_data *bfqd = e->elevator_data; \ -+ u64 __data = __VAR; \ -+ if (__CONV == 1) \ -+ __data = jiffies_to_msecs(__data); \ -+ else if (__CONV == 2) \ -+ __data = div_u64(__data, NSEC_PER_MSEC); \ -+ return bfq_var_show(__data, (page)); \ -+} -+SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2); -+SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2); -+SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); -+SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); -+SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2); 
-+SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); -+SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1); -+SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0); -+SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); -+SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0); -+SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1); -+SHOW_FUNCTION(bfq_wr_min_idle_time_show, bfqd->bfq_wr_min_idle_time, 1); -+SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async, -+ 1); -+SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0); -+#undef SHOW_FUNCTION -+ -+#define USEC_SHOW_FUNCTION(__FUNC, __VAR) \ -+static ssize_t __FUNC(struct elevator_queue *e, char *page) \ -+{ \ -+ struct bfq_data *bfqd = e->elevator_data; \ -+ u64 __data = __VAR; \ -+ __data = div_u64(__data, NSEC_PER_USEC); \ -+ return bfq_var_show(__data, (page)); \ -+} -+USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle); -+#undef USEC_SHOW_FUNCTION -+ -+#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ -+static ssize_t \ -+__FUNC(struct elevator_queue *e, const char *page, size_t count) \ -+{ \ -+ struct bfq_data *bfqd = e->elevator_data; \ -+ unsigned long uninitialized_var(__data); \ -+ int ret = bfq_var_store(&__data, (page), count); \ -+ if (__data < (MIN)) \ -+ __data = (MIN); \ -+ else if (__data > (MAX)) \ -+ __data = (MAX); \ -+ if (__CONV == 1) \ -+ *(__PTR) = msecs_to_jiffies(__data); \ -+ else if (__CONV == 2) \ -+ *(__PTR) = (u64)__data * NSEC_PER_MSEC; \ -+ else \ -+ *(__PTR) = __data; \ -+ return ret; \ -+} -+STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, -+ INT_MAX, 2); -+STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, -+ INT_MAX, 2); -+STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); -+STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, -+ INT_MAX, 0); 
-+STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2); -+STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0); -+STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1); -+STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX, -+ 1); -+STORE_FUNCTION(bfq_wr_min_idle_time_store, &bfqd->bfq_wr_min_idle_time, 0, -+ INT_MAX, 1); -+STORE_FUNCTION(bfq_wr_min_inter_arr_async_store, -+ &bfqd->bfq_wr_min_inter_arr_async, 0, INT_MAX, 1); -+STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0, -+ INT_MAX, 0); -+#undef STORE_FUNCTION -+ -+#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ -+static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\ -+{ \ -+ struct bfq_data *bfqd = e->elevator_data; \ -+ unsigned long uninitialized_var(__data); \ -+ int ret = bfq_var_store(&__data, (page), count); \ -+ if (__data < (MIN)) \ -+ __data = (MIN); \ -+ else if (__data > (MAX)) \ -+ __data = (MAX); \ -+ *(__PTR) = (u64)__data * NSEC_PER_USEC; \ -+ return ret; \ -+} -+USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0, -+ UINT_MAX); -+#undef USEC_STORE_FUNCTION -+ -+/* do nothing for the moment */ -+static ssize_t bfq_weights_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ return count; -+} -+ -+static ssize_t bfq_max_budget_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ unsigned long uninitialized_var(__data); -+ int ret = bfq_var_store(&__data, (page), count); -+ -+ if (__data == 0) -+ bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); -+ else { -+ if (__data > INT_MAX) -+ __data = INT_MAX; -+ bfqd->bfq_max_budget = __data; -+ } -+ -+ bfqd->bfq_user_max_budget = __data; -+ -+ return ret; -+} -+ -+/* -+ * Leaving this name to preserve name compatibility with cfq -+ * parameters, but this timeout is used for both sync and async. 
-+ */ -+static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ unsigned long uninitialized_var(__data); -+ int ret = bfq_var_store(&__data, (page), count); -+ -+ if (__data < 1) -+ __data = 1; -+ else if (__data > INT_MAX) -+ __data = INT_MAX; -+ -+ bfqd->bfq_timeout = msecs_to_jiffies(__data); -+ if (bfqd->bfq_user_max_budget == 0) -+ bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); -+ -+ return ret; -+} -+ -+static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ unsigned long uninitialized_var(__data); -+ int ret = bfq_var_store(&__data, (page), count); -+ -+ if (__data > 1) -+ __data = 1; -+ if (!bfqd->strict_guarantees && __data == 1 -+ && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC) -+ bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC; -+ -+ bfqd->strict_guarantees = __data; -+ -+ return ret; -+} -+ -+static ssize_t bfq_low_latency_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ unsigned long uninitialized_var(__data); -+ int ret = bfq_var_store(&__data, (page), count); -+ -+ if (__data > 1) -+ __data = 1; -+ if (__data == 0 && bfqd->low_latency != 0) -+ bfq_end_wr(bfqd); -+ bfqd->low_latency = __data; -+ -+ return ret; -+} -+ -+#define BFQ_ATTR(name) \ -+ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store) -+ -+static struct elv_fs_entry bfq_attrs[] = { -+ BFQ_ATTR(fifo_expire_sync), -+ BFQ_ATTR(fifo_expire_async), -+ BFQ_ATTR(back_seek_max), -+ BFQ_ATTR(back_seek_penalty), -+ BFQ_ATTR(slice_idle), -+ BFQ_ATTR(slice_idle_us), -+ BFQ_ATTR(max_budget), -+ BFQ_ATTR(timeout_sync), -+ BFQ_ATTR(strict_guarantees), -+ BFQ_ATTR(low_latency), -+ BFQ_ATTR(wr_coeff), -+ BFQ_ATTR(wr_max_time), -+ BFQ_ATTR(wr_rt_max_time), -+ BFQ_ATTR(wr_min_idle_time), -+ BFQ_ATTR(wr_min_inter_arr_async), -+ 
BFQ_ATTR(wr_max_softrt_rate), -+ BFQ_ATTR(weights), -+ __ATTR_NULL -+}; -+ -+static struct elevator_type iosched_bfq = { -+ .ops.sq = { -+ .elevator_merge_fn = bfq_merge, -+ .elevator_merged_fn = bfq_merged_request, -+ .elevator_merge_req_fn = bfq_merged_requests, -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ .elevator_bio_merged_fn = bfq_bio_merged, -+#endif -+ .elevator_allow_bio_merge_fn = bfq_allow_bio_merge, -+ .elevator_allow_rq_merge_fn = bfq_allow_rq_merge, -+ .elevator_dispatch_fn = bfq_dispatch_requests, -+ .elevator_add_req_fn = bfq_insert_request, -+ .elevator_activate_req_fn = bfq_activate_request, -+ .elevator_deactivate_req_fn = bfq_deactivate_request, -+ .elevator_completed_req_fn = bfq_completed_request, -+ .elevator_former_req_fn = elv_rb_former_request, -+ .elevator_latter_req_fn = elv_rb_latter_request, -+ .elevator_init_icq_fn = bfq_init_icq, -+ .elevator_exit_icq_fn = bfq_exit_icq, -+ .elevator_set_req_fn = bfq_set_request, -+ .elevator_put_req_fn = bfq_put_request, -+ .elevator_may_queue_fn = bfq_may_queue, -+ .elevator_init_fn = bfq_init_queue, -+ .elevator_exit_fn = bfq_exit_queue, -+ }, -+ .icq_size = sizeof(struct bfq_io_cq), -+ .icq_align = __alignof__(struct bfq_io_cq), -+ .elevator_attrs = bfq_attrs, -+ .elevator_name = "bfq-sq", -+ .elevator_owner = THIS_MODULE, -+}; -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+static struct blkcg_policy blkcg_policy_bfq = { -+ .dfl_cftypes = bfq_blkg_files, -+ .legacy_cftypes = bfq_blkcg_legacy_files, -+ -+ .cpd_alloc_fn = bfq_cpd_alloc, -+ .cpd_init_fn = bfq_cpd_init, -+ .cpd_bind_fn = bfq_cpd_init, -+ .cpd_free_fn = bfq_cpd_free, -+ -+ .pd_alloc_fn = bfq_pd_alloc, -+ .pd_init_fn = bfq_pd_init, -+ .pd_offline_fn = bfq_pd_offline, -+ .pd_free_fn = bfq_pd_free, -+ .pd_reset_stats_fn = bfq_pd_reset_stats, -+}; -+#endif -+ -+static int __init bfq_init(void) -+{ -+ int ret; -+ char msg[60] = "BFQ I/O-scheduler: v8r12"; -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ ret = blkcg_policy_register(&blkcg_policy_bfq); -+ 
if (ret) -+ return ret; -+#endif -+ -+ ret = -ENOMEM; -+ if (bfq_slab_setup()) -+ goto err_pol_unreg; -+ -+ /* -+ * Times to load large popular applications for the typical -+ * systems installed on the reference devices (see the -+ * comments before the definitions of the next two -+ * arrays). Actually, we use slightly slower values, as the -+ * estimated peak rate tends to be smaller than the actual -+ * peak rate. The reason for this last fact is that estimates -+ * are computed over much shorter time intervals than the long -+ * intervals typically used for benchmarking. Why? First, to -+ * adapt more quickly to variations. Second, because an I/O -+ * scheduler cannot rely on a peak-rate-evaluation workload to -+ * be run for a long time. -+ */ -+ T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */ -+ T_slow[1] = msecs_to_jiffies(6000); /* actually 6.5 sec */ -+ T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */ -+ T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */ -+ -+ /* -+ * Thresholds that determine the switch between speed classes -+ * (see the comments before the definition of the array -+ * device_speed_thresh). These thresholds are biased towards -+ * transitions to the fast class. This is safer than the -+ * opposite bias. In fact, a wrong transition to the slow -+ * class results in short weight-raising periods, because the -+ * speed of the device then tends to be higher that the -+ * reference peak rate. On the opposite end, a wrong -+ * transition to the fast class tends to increase -+ * weight-raising periods, because of the opposite reason. 
-+ */ -+ device_speed_thresh[0] = (4 * R_slow[0]) / 3; -+ device_speed_thresh[1] = (4 * R_slow[1]) / 3; -+ -+ ret = elv_register(&iosched_bfq); -+ if (ret) -+ goto err_pol_unreg; -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ strcat(msg, " (with cgroups support)"); -+#endif -+ pr_info("%s", msg); -+ -+ return 0; -+ -+err_pol_unreg: -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ blkcg_policy_unregister(&blkcg_policy_bfq); -+#endif -+ return ret; -+} -+ -+static void __exit bfq_exit(void) -+{ -+ elv_unregister(&iosched_bfq); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ blkcg_policy_unregister(&blkcg_policy_bfq); -+#endif -+ bfq_slab_kill(); -+} -+ -+module_init(bfq_init); -+module_exit(bfq_exit); -+ -+MODULE_AUTHOR("Arianna Avanzini, Fabio Checconi, Paolo Valente"); -+MODULE_LICENSE("GPL"); - -From e24d2e6461479dbd13d58be2dc44b23b5e24487c Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Mon, 19 Dec 2016 17:13:39 +0100 -Subject: [PATCH 07/51] Add config and build bits for bfq-mq-iosched - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/Kconfig.iosched | 10 +++++++++ - block/Makefile | 1 + - block/bfq-cgroup-included.c | 4 ++-- - block/bfq-mq-iosched.c | 25 ++++++++++++----------- - block/bfq-sched.c | 50 ++++++++++++++++++++++----------------------- - block/bfq-sq-iosched.c | 24 +++++++++++----------- - block/bfq.h | 36 +++++++++++++++++++++----------- - 8 files changed, 88 insertions(+), 64 deletions(-) - -diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched -index 9e3f4c2f7390..2d94af3d8b0a 100644 ---- a/block/Kconfig.iosched -+++ b/block/Kconfig.iosched -@@ -96,6 +96,16 @@ config DEFAULT_IOSCHED - default "bfq-sq" if DEFAULT_BFQ_SQ - default "noop" if DEFAULT_NOOP - -+config MQ_IOSCHED_BFQ -+ tristate "BFQ-MQ I/O Scheduler" -+ default y -+ ---help--- -+ BFQ I/O scheduler for BLK-MQ. BFQ-MQ distributes bandwidth -+ among all processes according to their weights, regardless of -+ the device parameters and with any workload. 
It also -+ guarantees a low latency to interactive and soft real-time -+ applications. Details in Documentation/block/bfq-iosched.txt -+ - config MQ_IOSCHED_DEADLINE - tristate "MQ deadline I/O scheduler" - default y -diff --git a/block/Makefile b/block/Makefile -index 59026b425791..a571329c23f0 100644 ---- a/block/Makefile -+++ b/block/Makefile -@@ -25,6 +25,7 @@ obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o - bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o - obj-$(CONFIG_IOSCHED_BFQ) += bfq.o - obj-$(CONFIG_IOSCHED_BFQ_SQ) += bfq-sq-iosched.o -+obj-$(CONFIG_MQ_IOSCHED_BFQ) += bfq-mq-iosched.o - - obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o - obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o -diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c -index af7c216a3540..9c483b658179 100644 ---- a/block/bfq-cgroup-included.c -+++ b/block/bfq-cgroup-included.c -@@ -15,7 +15,7 @@ - * file. - */ - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - - /* bfqg stats flags */ - enum bfqg_stats_flags { -@@ -1116,7 +1116,7 @@ static struct cftype bfq_blkg_files[] = { - {} /* terminate */ - }; - --#else /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ -+#else /* BFQ_GROUP_IOSCHED_ENABLED */ - - static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg, - struct bfq_queue *bfqq, unsigned int op) { } -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 30d019fc67e0..e88e00f1e0a7 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -82,6 +82,7 @@ - #include <linux/rbtree.h> - #include <linux/ioprio.h> - #include "blk.h" -+#undef CONFIG_BFQ_GROUP_IOSCHED /* cgroups support not yet functional */ - #include "bfq.h" - - /* Expiration time of sync (0) and async (1) requests, in ns. 
*/ -@@ -387,7 +388,7 @@ static bool bfq_differentiated_weights(struct bfq_data *bfqd) - return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && - (bfqd->queue_weights_tree.rb_node->rb_left || - bfqd->queue_weights_tree.rb_node->rb_right) --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - ) || - (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) && - (bfqd->group_weights_tree.rb_node->rb_left || -@@ -1672,7 +1673,7 @@ static void bfq_merged_request(struct request_queue *q, struct request *req, - } - } - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - static void bfq_bio_merged(struct request_queue *q, struct request *req, - struct bio *bio) - { -@@ -3879,7 +3880,7 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) - */ - static void bfq_put_queue(struct bfq_queue *bfqq) - { --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - struct bfq_group *bfqg = bfqq_group(bfqq); - #endif - -@@ -3909,7 +3910,7 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); - - kmem_cache_free(bfq_pool, bfqq); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - bfqg_put(bfqg); - #endif - } -@@ -4835,7 +4836,7 @@ static void bfq_exit_queue(struct elevator_queue *e) - - BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - blkcg_deactivate_policy(q, &blkcg_policy_bfq); - #else - bfq_put_async_queues(bfqd, bfqd->root_group); -@@ -4850,7 +4851,7 @@ static void bfq_init_root_group(struct bfq_group *root_group, - { - int i; - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - root_group->entity.parent = NULL; - root_group->my_entity = NULL; - root_group->bfqd = bfqd; -@@ -5265,7 +5266,7 @@ static struct elevator_type iosched_bfq = { - .elevator_merge_fn = bfq_merge, - .elevator_merged_fn = bfq_merged_request, - .elevator_merge_req_fn = 
bfq_merged_requests, --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - .elevator_bio_merged_fn = bfq_bio_merged, - #endif - .elevator_allow_bio_merge_fn = bfq_allow_bio_merge, -@@ -5292,7 +5293,7 @@ static struct elevator_type iosched_bfq = { - .elevator_owner = THIS_MODULE, - }; - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - static struct blkcg_policy blkcg_policy_bfq = { - .dfl_cftypes = bfq_blkg_files, - .legacy_cftypes = bfq_blkcg_legacy_files, -@@ -5315,7 +5316,7 @@ static int __init bfq_init(void) - int ret; - char msg[60] = "BFQ I/O-scheduler: v8r12"; - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - ret = blkcg_policy_register(&blkcg_policy_bfq); - if (ret) - return ret; -@@ -5362,7 +5363,7 @@ static int __init bfq_init(void) - if (ret) - goto err_pol_unreg; - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - strcat(msg, " (with cgroups support)"); - #endif - pr_info("%s", msg); -@@ -5370,7 +5371,7 @@ static int __init bfq_init(void) - return 0; - - err_pol_unreg: --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - blkcg_policy_unregister(&blkcg_policy_bfq); - #endif - return ret; -@@ -5379,7 +5380,7 @@ static int __init bfq_init(void) - static void __exit bfq_exit(void) - { - elv_unregister(&iosched_bfq); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - blkcg_policy_unregister(&blkcg_policy_bfq); - #endif - bfq_slab_kill(); -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index 5c0f9290a79c..b54a638186e3 100644 ---- a/block/bfq-sched.c -+++ b/block/bfq-sched.c -@@ -136,7 +136,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, - if (bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, - "update_next_in_service: chosen this queue"); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - else { - struct bfq_group *bfqg = - container_of(next_in_service, -@@ -149,7 +149,7 @@ static 
bool bfq_update_next_in_service(struct bfq_sched_data *sd, - return parent_sched_may_change; - } - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - /* both next loops stop at one of the child entities of the root group */ - #define for_each_entity(entity) \ - for (; entity ; entity = entity->parent) -@@ -243,7 +243,7 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) - return false; - } - --#else /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ -+#else /* BFQ_GROUP_IOSCHED_ENABLED */ - #define for_each_entity(entity) \ - for (; entity ; entity = NULL) - -@@ -260,7 +260,7 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) - return true; - } - --#endif /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ -+#endif /* BFQ_GROUP_IOSCHED_ENABLED */ - - /* - * Shift for timestamp calculations. This actually limits the maximum -@@ -323,7 +323,7 @@ static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service) - bfq_log_bfqq(bfqq->bfqd, bfqq, - "calc_finish: start %llu, finish %llu, delta %llu", - start, finish, delta); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - } else { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); -@@ -473,7 +473,7 @@ static void bfq_update_active_node(struct rb_node *node) - bfq_log_bfqq(bfqq->bfqd, bfqq, - "update_active_node: new min_start %llu", - ((entity->min_start>>10)*1000)>>12); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - } else { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); -@@ -540,7 +540,7 @@ static void bfq_active_insert(struct bfq_service_tree *st, - { - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - struct rb_node *node = &entity->rb_node; --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - struct bfq_sched_data *sd = NULL; - struct bfq_group *bfqg = NULL; - struct bfq_data *bfqd = NULL; -@@ -555,7 +555,7 @@ static void 
bfq_active_insert(struct bfq_service_tree *st, - - bfq_update_active_tree(node); - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - sd = entity->sched_data; - bfqg = container_of(sd, struct bfq_group, sched_data); - BUG_ON(!bfqg); -@@ -563,7 +563,7 @@ static void bfq_active_insert(struct bfq_service_tree *st, - #endif - if (bfqq) - list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - else { /* bfq_group */ - BUG_ON(!bfqd); - bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree); -@@ -652,7 +652,7 @@ static void bfq_active_extract(struct bfq_service_tree *st, - { - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - struct rb_node *node; --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - struct bfq_sched_data *sd = NULL; - struct bfq_group *bfqg = NULL; - struct bfq_data *bfqd = NULL; -@@ -664,7 +664,7 @@ static void bfq_active_extract(struct bfq_service_tree *st, - if (node) - bfq_update_active_tree(node); - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - sd = entity->sched_data; - bfqg = container_of(sd, struct bfq_group, sched_data); - BUG_ON(!bfqg); -@@ -672,7 +672,7 @@ static void bfq_active_extract(struct bfq_service_tree *st, - #endif - if (bfqq) - list_del(&bfqq->bfqq_list); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - else { /* bfq_group */ - BUG_ON(!bfqd); - bfq_weights_tree_remove(bfqd, entity, -@@ -809,14 +809,14 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, - unsigned int prev_weight, new_weight; - struct bfq_data *bfqd = NULL; - struct rb_root *root; --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - struct bfq_sched_data *sd; - struct bfq_group *bfqg; - #endif - - if (bfqq) - bfqd = bfqq->bfqd; --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - else { - sd = entity->my_sched_data; - bfqg = 
container_of(sd, struct bfq_group, sched_data); -@@ -907,7 +907,7 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, - return new_st; - } - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg); - #endif - -@@ -936,7 +936,7 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) - st->vtime += bfq_delta(served, st->wsum); - bfq_forget_idle(st); - } --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - bfqg_stats_set_start_empty_time(bfqq_group(bfqq)); - #endif - st = bfq_entity_service_tree(&bfqq->entity); -@@ -1060,7 +1060,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, - bfq_log_bfqq(bfqq->bfqd, bfqq, - "__activate_entity: new queue finish %llu", - ((entity->finish>>10)*1000)>>12); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - } else { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); -@@ -1078,7 +1078,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, - bfq_log_bfqq(bfqq->bfqd, bfqq, - "__activate_entity: queue %seligible in st %p", - entity->start <= st->vtime ? 
"" : "non ", st); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - } else { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); -@@ -1153,7 +1153,7 @@ static void __bfq_activate_entity(struct bfq_entity *entity, - - BUG_ON(entity->on_st && bfqq); - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - if (entity->on_st && !bfqq) { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, -@@ -1485,7 +1485,7 @@ static void bfq_deactivate_entity(struct bfq_entity *entity, - if (bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, - "invoking udpdate_next for this queue"); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - else { - struct bfq_group *bfqg = - container_of(entity, -@@ -1525,7 +1525,7 @@ static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st) - bfq_log_bfqq(bfqq->bfqd, bfqq, - "calc_vtime_jump: new value %llu", - root_entity->min_start); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - else { - struct bfq_group *bfqg = - container_of(root_entity, struct bfq_group, -@@ -1661,7 +1661,7 @@ __bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service - "__lookup_next: start %llu vtime %llu st %p", - ((entity->start>>10)*1000)>>12, - ((new_vtime>>10)*1000)>>12, st); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - else { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); -@@ -1735,7 +1735,7 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd) - if (bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, "chosen from st %p %d", - st + class_idx, class_idx); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - else { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); -@@ -1777,7 +1777,7 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) - */ - sd = &bfqd->root_group->sched_data; - for (; sd ; sd = 
entity->my_sched_data) { --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - if (entity) { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); -@@ -1867,7 +1867,7 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) - bfq_log_bfqq(bfqd, bfqq, - "get_next_queue: this queue, finish %llu", - (((entity->finish>>10)*1000)>>10)>>2); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - else { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index 30d019fc67e0..25da0d1c0622 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -387,7 +387,7 @@ static bool bfq_differentiated_weights(struct bfq_data *bfqd) - return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && - (bfqd->queue_weights_tree.rb_node->rb_left || - bfqd->queue_weights_tree.rb_node->rb_right) --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - ) || - (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) && - (bfqd->group_weights_tree.rb_node->rb_left || -@@ -1672,7 +1672,7 @@ static void bfq_merged_request(struct request_queue *q, struct request *req, - } - } - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - static void bfq_bio_merged(struct request_queue *q, struct request *req, - struct bio *bio) - { -@@ -3879,7 +3879,7 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) - */ - static void bfq_put_queue(struct bfq_queue *bfqq) - { --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - struct bfq_group *bfqg = bfqq_group(bfqq); - #endif - -@@ -3909,7 +3909,7 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); - - kmem_cache_free(bfq_pool, bfqq); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - bfqg_put(bfqg); - #endif - } -@@ -4835,7 +4835,7 @@ static void 
bfq_exit_queue(struct elevator_queue *e) - - BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - blkcg_deactivate_policy(q, &blkcg_policy_bfq); - #else - bfq_put_async_queues(bfqd, bfqd->root_group); -@@ -4850,7 +4850,7 @@ static void bfq_init_root_group(struct bfq_group *root_group, - { - int i; - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - root_group->entity.parent = NULL; - root_group->my_entity = NULL; - root_group->bfqd = bfqd; -@@ -5265,7 +5265,7 @@ static struct elevator_type iosched_bfq = { - .elevator_merge_fn = bfq_merge, - .elevator_merged_fn = bfq_merged_request, - .elevator_merge_req_fn = bfq_merged_requests, --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - .elevator_bio_merged_fn = bfq_bio_merged, - #endif - .elevator_allow_bio_merge_fn = bfq_allow_bio_merge, -@@ -5292,7 +5292,7 @@ static struct elevator_type iosched_bfq = { - .elevator_owner = THIS_MODULE, - }; - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - static struct blkcg_policy blkcg_policy_bfq = { - .dfl_cftypes = bfq_blkg_files, - .legacy_cftypes = bfq_blkcg_legacy_files, -@@ -5315,7 +5315,7 @@ static int __init bfq_init(void) - int ret; - char msg[60] = "BFQ I/O-scheduler: v8r12"; - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - ret = blkcg_policy_register(&blkcg_policy_bfq); - if (ret) - return ret; -@@ -5362,7 +5362,7 @@ static int __init bfq_init(void) - if (ret) - goto err_pol_unreg; - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - strcat(msg, " (with cgroups support)"); - #endif - pr_info("%s", msg); -@@ -5370,7 +5370,7 @@ static int __init bfq_init(void) - return 0; - - err_pol_unreg: --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - blkcg_policy_unregister(&blkcg_policy_bfq); - #endif - return ret; -@@ -5379,7 +5379,7 @@ static int __init bfq_init(void) 
- static void __exit bfq_exit(void) - { - elv_unregister(&iosched_bfq); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - blkcg_policy_unregister(&blkcg_policy_bfq); - #endif - bfq_slab_kill(); -diff --git a/block/bfq.h b/block/bfq.h -index 34fc4697fd89..53954d1b87f8 100644 ---- a/block/bfq.h -+++ b/block/bfq.h -@@ -19,6 +19,18 @@ - #include <linux/hrtimer.h> - #include <linux/blk-cgroup.h> - -+/* -+ * Define an alternative macro to compile cgroups support. This is one -+ * of the steps needed to let bfq-mq share the files bfq-sched.c and -+ * bfq-cgroup.c with bfq-sq. For bfq-mq, the macro -+ * BFQ_GROUP_IOSCHED_ENABLED will be defined as a function of whether -+ * the configuration option CONFIG_BFQ_MQ_GROUP_IOSCHED, and not -+ * CONFIG_BFQ_GROUP_IOSCHED, is defined. -+ */ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#define BFQ_GROUP_IOSCHED_ENABLED -+#endif -+ - #define BFQ_IOPRIO_CLASSES 3 - #define BFQ_CL_IDLE_TIMEOUT (HZ/5) - -@@ -344,7 +356,7 @@ struct bfq_io_cq { - struct bfq_ttime ttime; - /* per (request_queue, blkcg) ioprio */ - int ioprio; --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - uint64_t blkcg_serial_nr; /* the current blkcg serial */ - #endif - -@@ -671,7 +683,7 @@ static const char *checked_dev_name(const struct device *dev) - return nodev; - } - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); - static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - -@@ -696,7 +708,7 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - __pbuf, ##args); \ - } while (0) - --#else /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ -+#else /* BFQ_GROUP_IOSCHED_ENABLED */ - - #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ - pr_crit("%s bfq%d%c " fmt "\n", \ -@@ -705,7 +717,7 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - ##args) - #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) 
do {} while (0) - --#endif /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ -+#endif /* BFQ_GROUP_IOSCHED_ENABLED */ - - #define bfq_log(bfqd, fmt, args...) \ - pr_crit("%s bfq " fmt "\n", \ -@@ -713,7 +725,7 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - ##args) - - #else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); - static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - -@@ -735,7 +747,7 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \ - } while (0) - --#else /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ -+#else /* BFQ_GROUP_IOSCHED_ENABLED */ - - #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ - blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \ -@@ -743,7 +755,7 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - ##args) - #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) - --#endif /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ -+#endif /* BFQ_GROUP_IOSCHED_ENABLED */ - - #define bfq_log(bfqd, fmt, args...) \ - blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) -@@ -763,7 +775,7 @@ enum bfqq_expiration { - - - struct bfqg_stats { --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - /* number of ios merged */ - struct blkg_rwstat merged; - /* total time spent on device in ns, may not be accurate w/ queueing */ -@@ -794,7 +806,7 @@ struct bfqg_stats { - #endif - }; - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - /* - * struct bfq_group_data - per-blkcg storage for the blkio subsystem. 
- * -@@ -895,7 +907,7 @@ bfq_entity_service_tree(struct bfq_entity *entity) - bfq_log_bfqq(bfqq->bfqd, bfqq, - "entity_service_tree %p %d", - sched_data->service_tree + idx, idx); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - else { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); -@@ -924,7 +936,7 @@ static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) - return bic->icq.q->elevator->elevator_data; - } - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - - static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) - { -@@ -953,7 +965,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, - struct bfq_io_cq *bic); - static void bfq_end_wr_async_queues(struct bfq_data *bfqd, - struct bfq_group *bfqg); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); - #endif - static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); - -From add91dbd756cf8ca3aa3add9a19eef742d5fca6b Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Fri, 20 Jan 2017 09:18:25 +0100 -Subject: [PATCH 08/51] Increase max policies for io controller - -To let bfq-mq policy be plugged too (however cgroups -suppport is not yet functional in bfq-mq). - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - include/linux/blkdev.h | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h -index bf000c58644b..10f892ca585d 100644 ---- a/include/linux/blkdev.h -+++ b/include/linux/blkdev.h -@@ -54,7 +54,7 @@ struct blk_stat_callback; - * Maximum number of blkcg policies allowed to be registered concurrently. - * Defined here to simplify include dependency. 
- */ --#define BLKCG_MAX_POLS 4 -+#define BLKCG_MAX_POLS 5 - - typedef void (rq_end_io_fn)(struct request *, blk_status_t); - - -From 2c39a1d9ab4516d44e01e96f19f578b927e7f2e9 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Mon, 19 Dec 2016 18:11:33 +0100 -Subject: [PATCH 09/51] Copy header file bfq.h as bfq-mq.h - -This commit introduces the header file bfq-mq.h, that will play -for bfq-mq-iosched.c the same role that bfq.h plays for bfq-iosched.c. - -For the moment, the file bfq-mq.h is just a copy of bfq.h. - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 2 +- - block/bfq-mq.h | 973 +++++++++++++++++++++++++++++++++++++++++++++++++ - 2 files changed, 974 insertions(+), 1 deletion(-) - create mode 100644 block/bfq-mq.h - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index e88e00f1e0a7..d1125aee658c 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -83,7 +83,7 @@ - #include <linux/ioprio.h> - #include "blk.h" - #undef CONFIG_BFQ_GROUP_IOSCHED /* cgroups support not yet functional */ --#include "bfq.h" -+#include "bfq-mq.h" - - /* Expiration time of sync (0) and async (1) requests, in ns. */ - static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; -diff --git a/block/bfq-mq.h b/block/bfq-mq.h -new file mode 100644 -index 000000000000..53954d1b87f8 ---- /dev/null -+++ b/block/bfq-mq.h -@@ -0,0 +1,973 @@ -+/* -+ * BFQ v8r12 for 4.11.0: data structures and common functions prototypes. 
-+ * -+ * Based on ideas and code from CFQ: -+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> -+ * -+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> -+ * Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org> -+ */ -+ -+#ifndef _BFQ_H -+#define _BFQ_H -+ -+#include <linux/blktrace_api.h> -+#include <linux/hrtimer.h> -+#include <linux/blk-cgroup.h> -+ -+/* -+ * Define an alternative macro to compile cgroups support. This is one -+ * of the steps needed to let bfq-mq share the files bfq-sched.c and -+ * bfq-cgroup.c with bfq-sq. For bfq-mq, the macro -+ * BFQ_GROUP_IOSCHED_ENABLED will be defined as a function of whether -+ * the configuration option CONFIG_BFQ_MQ_GROUP_IOSCHED, and not -+ * CONFIG_BFQ_GROUP_IOSCHED, is defined. -+ */ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#define BFQ_GROUP_IOSCHED_ENABLED -+#endif -+ -+#define BFQ_IOPRIO_CLASSES 3 -+#define BFQ_CL_IDLE_TIMEOUT (HZ/5) -+ -+#define BFQ_MIN_WEIGHT 1 -+#define BFQ_MAX_WEIGHT 1000 -+#define BFQ_WEIGHT_CONVERSION_COEFF 10 -+ -+#define BFQ_DEFAULT_QUEUE_IOPRIO 4 -+ -+#define BFQ_WEIGHT_LEGACY_DFL 100 -+#define BFQ_DEFAULT_GRP_IOPRIO 0 -+#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE -+ -+/* -+ * Soft real-time applications are extremely more latency sensitive -+ * than interactive ones. Over-raise the weight of the former to -+ * privilege them against the latter. -+ */ -+#define BFQ_SOFTRT_WEIGHT_FACTOR 100 -+ -+struct bfq_entity; -+ -+/** -+ * struct bfq_service_tree - per ioprio_class service tree. -+ * -+ * Each service tree represents a B-WF2Q+ scheduler on its own. Each -+ * ioprio_class has its own independent scheduler, and so its own -+ * bfq_service_tree. All the fields are protected by the queue lock -+ * of the containing bfqd. 
-+ */ -+struct bfq_service_tree { -+ /* tree for active entities (i.e., those backlogged) */ -+ struct rb_root active; -+ /* tree for idle entities (i.e., not backlogged, with V <= F_i)*/ -+ struct rb_root idle; -+ -+ struct bfq_entity *first_idle; /* idle entity with minimum F_i */ -+ struct bfq_entity *last_idle; /* idle entity with maximum F_i */ -+ -+ u64 vtime; /* scheduler virtual time */ -+ /* scheduler weight sum; active and idle entities contribute to it */ -+ unsigned long wsum; -+}; -+ -+/** -+ * struct bfq_sched_data - multi-class scheduler. -+ * -+ * bfq_sched_data is the basic scheduler queue. It supports three -+ * ioprio_classes, and can be used either as a toplevel queue or as an -+ * intermediate queue in a hierarchical setup. -+ * -+ * The supported ioprio_classes are the same as in CFQ, in descending -+ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. -+ * Requests from higher priority queues are served before all the -+ * requests from lower priority queues; among requests of the same -+ * queue requests are served according to B-WF2Q+. -+ * -+ * The schedule is implemented by the service trees, plus the field -+ * @next_in_service, which points to the entity on the active trees -+ * that will be served next, if 1) no changes in the schedule occurs -+ * before the current in-service entity is expired, 2) the in-service -+ * queue becomes idle when it expires, and 3) if the entity pointed by -+ * in_service_entity is not a queue, then the in-service child entity -+ * of the entity pointed by in_service_entity becomes idle on -+ * expiration. This peculiar definition allows for the following -+ * optimization, not yet exploited: while a given entity is still in -+ * service, we already know which is the best candidate for next -+ * service among the other active entitities in the same parent -+ * entity. We can then quickly compare the timestamps of the -+ * in-service entity with those of such best candidate. 
-+ * -+ * All the fields are protected by the queue lock of the containing -+ * bfqd. -+ */ -+struct bfq_sched_data { -+ struct bfq_entity *in_service_entity; /* entity in service */ -+ /* head-of-the-line entity in the scheduler (see comments above) */ -+ struct bfq_entity *next_in_service; -+ /* array of service trees, one per ioprio_class */ -+ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; -+ /* last time CLASS_IDLE was served */ -+ unsigned long bfq_class_idle_last_service; -+ -+}; -+ -+/** -+ * struct bfq_weight_counter - counter of the number of all active entities -+ * with a given weight. -+ */ -+struct bfq_weight_counter { -+ unsigned int weight; /* weight of the entities this counter refers to */ -+ unsigned int num_active; /* nr of active entities with this weight */ -+ /* -+ * Weights tree member (see bfq_data's @queue_weights_tree and -+ * @group_weights_tree) -+ */ -+ struct rb_node weights_node; -+}; -+ -+/** -+ * struct bfq_entity - schedulable entity. -+ * -+ * A bfq_entity is used to represent either a bfq_queue (leaf node in the -+ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each -+ * entity belongs to the sched_data of the parent group in the cgroup -+ * hierarchy. Non-leaf entities have also their own sched_data, stored -+ * in @my_sched_data. -+ * -+ * Each entity stores independently its priority values; this would -+ * allow different weights on different devices, but this -+ * functionality is not exported to userspace by now. Priorities and -+ * weights are updated lazily, first storing the new values into the -+ * new_* fields, then setting the @prio_changed flag. As soon as -+ * there is a transition in the entity state that allows the priority -+ * update to take place the effective and the requested priority -+ * values are synchronized. -+ * -+ * Unless cgroups are used, the weight value is calculated from the -+ * ioprio to export the same interface as CFQ. 
When dealing with -+ * ``well-behaved'' queues (i.e., queues that do not spend too much -+ * time to consume their budget and have true sequential behavior, and -+ * when there are no external factors breaking anticipation) the -+ * relative weights at each level of the cgroups hierarchy should be -+ * guaranteed. All the fields are protected by the queue lock of the -+ * containing bfqd. -+ */ -+struct bfq_entity { -+ struct rb_node rb_node; /* service_tree member */ -+ /* pointer to the weight counter associated with this entity */ -+ struct bfq_weight_counter *weight_counter; -+ -+ /* -+ * Flag, true if the entity is on a tree (either the active or -+ * the idle one of its service_tree) or is in service. -+ */ -+ bool on_st; -+ -+ u64 finish; /* B-WF2Q+ finish timestamp (aka F_i) */ -+ u64 start; /* B-WF2Q+ start timestamp (aka S_i) */ -+ -+ /* tree the entity is enqueued into; %NULL if not on a tree */ -+ struct rb_root *tree; -+ -+ /* -+ * minimum start time of the (active) subtree rooted at this -+ * entity; used for O(log N) lookups into active trees -+ */ -+ u64 min_start; -+ -+ /* amount of service received during the last service slot */ -+ int service; -+ -+ /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */ -+ int budget; -+ -+ unsigned int weight; /* weight of the queue */ -+ unsigned int new_weight; /* next weight if a change is in progress */ -+ -+ /* original weight, used to implement weight boosting */ -+ unsigned int orig_weight; -+ -+ /* parent entity, for hierarchical scheduling */ -+ struct bfq_entity *parent; -+ -+ /* -+ * For non-leaf nodes in the hierarchy, the associated -+ * scheduler queue, %NULL on leaf nodes. 
-+ */ -+ struct bfq_sched_data *my_sched_data; -+ /* the scheduler queue this entity belongs to */ -+ struct bfq_sched_data *sched_data; -+ -+ /* flag, set to request a weight, ioprio or ioprio_class change */ -+ int prio_changed; -+}; -+ -+struct bfq_group; -+ -+/** -+ * struct bfq_queue - leaf schedulable entity. -+ * -+ * A bfq_queue is a leaf request queue; it can be associated with an -+ * io_context or more, if it is async or shared between cooperating -+ * processes. @cgroup holds a reference to the cgroup, to be sure that it -+ * does not disappear while a bfqq still references it (mostly to avoid -+ * races between request issuing and task migration followed by cgroup -+ * destruction). -+ * All the fields are protected by the queue lock of the containing bfqd. -+ */ -+struct bfq_queue { -+ /* reference counter */ -+ int ref; -+ /* parent bfq_data */ -+ struct bfq_data *bfqd; -+ -+ /* current ioprio and ioprio class */ -+ unsigned short ioprio, ioprio_class; -+ /* next ioprio and ioprio class if a change is in progress */ -+ unsigned short new_ioprio, new_ioprio_class; -+ -+ /* -+ * Shared bfq_queue if queue is cooperating with one or more -+ * other queues. 
-+ */ -+ struct bfq_queue *new_bfqq; -+ /* request-position tree member (see bfq_group's @rq_pos_tree) */ -+ struct rb_node pos_node; -+ /* request-position tree root (see bfq_group's @rq_pos_tree) */ -+ struct rb_root *pos_root; -+ -+ /* sorted list of pending requests */ -+ struct rb_root sort_list; -+ /* if fifo isn't expired, next request to serve */ -+ struct request *next_rq; -+ /* number of sync and async requests queued */ -+ int queued[2]; -+ /* number of sync and async requests currently allocated */ -+ int allocated[2]; -+ /* number of pending metadata requests */ -+ int meta_pending; -+ /* fifo list of requests in sort_list */ -+ struct list_head fifo; -+ -+ /* entity representing this queue in the scheduler */ -+ struct bfq_entity entity; -+ -+ /* maximum budget allowed from the feedback mechanism */ -+ int max_budget; -+ /* budget expiration (in jiffies) */ -+ unsigned long budget_timeout; -+ -+ /* number of requests on the dispatch list or inside driver */ -+ int dispatched; -+ -+ unsigned int flags; /* status flags.*/ -+ -+ /* node for active/idle bfqq list inside parent bfqd */ -+ struct list_head bfqq_list; -+ -+ /* bit vector: a 1 for each seeky requests in history */ -+ u32 seek_history; -+ -+ /* node for the device's burst list */ -+ struct hlist_node burst_list_node; -+ -+ /* position of the last request enqueued */ -+ sector_t last_request_pos; -+ -+ /* Number of consecutive pairs of request completion and -+ * arrival, such that the queue becomes idle after the -+ * completion, but the next request arrives within an idle -+ * time slice; used only if the queue's IO_bound flag has been -+ * cleared. -+ */ -+ unsigned int requests_within_timer; -+ -+ /* pid of the process owning the queue, used for logging purposes */ -+ pid_t pid; -+ -+ /* -+ * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL -+ * if the queue is shared. 
-+ */ -+ struct bfq_io_cq *bic; -+ -+ /* current maximum weight-raising time for this queue */ -+ unsigned long wr_cur_max_time; -+ /* -+ * Minimum time instant such that, only if a new request is -+ * enqueued after this time instant in an idle @bfq_queue with -+ * no outstanding requests, then the task associated with the -+ * queue it is deemed as soft real-time (see the comments on -+ * the function bfq_bfqq_softrt_next_start()) -+ */ -+ unsigned long soft_rt_next_start; -+ /* -+ * Start time of the current weight-raising period if -+ * the @bfq-queue is being weight-raised, otherwise -+ * finish time of the last weight-raising period. -+ */ -+ unsigned long last_wr_start_finish; -+ /* factor by which the weight of this queue is multiplied */ -+ unsigned int wr_coeff; -+ /* -+ * Time of the last transition of the @bfq_queue from idle to -+ * backlogged. -+ */ -+ unsigned long last_idle_bklogged; -+ /* -+ * Cumulative service received from the @bfq_queue since the -+ * last transition from idle to backlogged. -+ */ -+ unsigned long service_from_backlogged; -+ /* -+ * Value of wr start time when switching to soft rt -+ */ -+ unsigned long wr_start_at_switch_to_srt; -+ -+ unsigned long split_time; /* time of last split */ -+}; -+ -+/** -+ * struct bfq_ttime - per process thinktime stats. -+ */ -+struct bfq_ttime { -+ u64 last_end_request; /* completion time of last request */ -+ -+ u64 ttime_total; /* total process thinktime */ -+ unsigned long ttime_samples; /* number of thinktime samples */ -+ u64 ttime_mean; /* average process thinktime */ -+ -+}; -+ -+/** -+ * struct bfq_io_cq - per (request_queue, io_context) structure. 
-+ */ -+struct bfq_io_cq { -+ /* associated io_cq structure */ -+ struct io_cq icq; /* must be the first member */ -+ /* array of two process queues, the sync and the async */ -+ struct bfq_queue *bfqq[2]; -+ /* associated @bfq_ttime struct */ -+ struct bfq_ttime ttime; -+ /* per (request_queue, blkcg) ioprio */ -+ int ioprio; -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ uint64_t blkcg_serial_nr; /* the current blkcg serial */ -+#endif -+ -+ /* -+ * Snapshot of the has_short_time flag before merging; taken -+ * to remember its value while the queue is merged, so as to -+ * be able to restore it in case of split. -+ */ -+ bool saved_has_short_ttime; -+ /* -+ * Same purpose as the previous two fields for the I/O bound -+ * classification of a queue. -+ */ -+ bool saved_IO_bound; -+ -+ /* -+ * Same purpose as the previous fields for the value of the -+ * field keeping the queue's belonging to a large burst -+ */ -+ bool saved_in_large_burst; -+ /* -+ * True if the queue belonged to a burst list before its merge -+ * with another cooperating queue. -+ */ -+ bool was_in_burst_list; -+ -+ /* -+ * Similar to previous fields: save wr information. -+ */ -+ unsigned long saved_wr_coeff; -+ unsigned long saved_last_wr_start_finish; -+ unsigned long saved_wr_start_at_switch_to_srt; -+ unsigned int saved_wr_cur_max_time; -+}; -+ -+enum bfq_device_speed { -+ BFQ_BFQD_FAST, -+ BFQ_BFQD_SLOW, -+}; -+ -+/** -+ * struct bfq_data - per-device data structure. -+ * -+ * All the fields are protected by the @queue lock. -+ */ -+struct bfq_data { -+ /* request queue for the device */ -+ struct request_queue *queue; -+ -+ /* root bfq_group for the device */ -+ struct bfq_group *root_group; -+ -+ /* -+ * rbtree of weight counters of @bfq_queues, sorted by -+ * weight. Used to keep track of whether all @bfq_queues have -+ * the same weight. 
The tree contains one counter for each -+ * distinct weight associated to some active and not -+ * weight-raised @bfq_queue (see the comments to the functions -+ * bfq_weights_tree_[add|remove] for further details). -+ */ -+ struct rb_root queue_weights_tree; -+ /* -+ * rbtree of non-queue @bfq_entity weight counters, sorted by -+ * weight. Used to keep track of whether all @bfq_groups have -+ * the same weight. The tree contains one counter for each -+ * distinct weight associated to some active @bfq_group (see -+ * the comments to the functions bfq_weights_tree_[add|remove] -+ * for further details). -+ */ -+ struct rb_root group_weights_tree; -+ -+ /* -+ * Number of bfq_queues containing requests (including the -+ * queue in service, even if it is idling). -+ */ -+ int busy_queues; -+ /* number of weight-raised busy @bfq_queues */ -+ int wr_busy_queues; -+ /* number of queued requests */ -+ int queued; -+ /* number of requests dispatched and waiting for completion */ -+ int rq_in_driver; -+ -+ /* -+ * Maximum number of requests in driver in the last -+ * @hw_tag_samples completed requests. -+ */ -+ int max_rq_in_driver; -+ /* number of samples used to calculate hw_tag */ -+ int hw_tag_samples; -+ /* flag set to one if the driver is showing a queueing behavior */ -+ int hw_tag; -+ -+ /* number of budgets assigned */ -+ int budgets_assigned; -+ -+ /* -+ * Timer set when idling (waiting) for the next request from -+ * the queue in service. 
-+ */ -+ struct hrtimer idle_slice_timer; -+ /* delayed work to restart dispatching on the request queue */ -+ struct work_struct unplug_work; -+ -+ /* bfq_queue in service */ -+ struct bfq_queue *in_service_queue; -+ /* bfq_io_cq (bic) associated with the @in_service_queue */ -+ struct bfq_io_cq *in_service_bic; -+ -+ /* on-disk position of the last served request */ -+ sector_t last_position; -+ -+ /* time of last request completion (ns) */ -+ u64 last_completion; -+ -+ /* time of first rq dispatch in current observation interval (ns) */ -+ u64 first_dispatch; -+ /* time of last rq dispatch in current observation interval (ns) */ -+ u64 last_dispatch; -+ -+ /* beginning of the last budget */ -+ ktime_t last_budget_start; -+ /* beginning of the last idle slice */ -+ ktime_t last_idling_start; -+ -+ /* number of samples in current observation interval */ -+ int peak_rate_samples; -+ /* num of samples of seq dispatches in current observation interval */ -+ u32 sequential_samples; -+ /* total num of sectors transferred in current observation interval */ -+ u64 tot_sectors_dispatched; -+ /* max rq size seen during current observation interval (sectors) */ -+ u32 last_rq_max_size; -+ /* time elapsed from first dispatch in current observ. interval (us) */ -+ u64 delta_from_first; -+ /* current estimate of device peak rate */ -+ u32 peak_rate; -+ -+ /* maximum budget allotted to a bfq_queue before rescheduling */ -+ int bfq_max_budget; -+ -+ /* list of all the bfq_queues active on the device */ -+ struct list_head active_list; -+ /* list of all the bfq_queues idle on the device */ -+ struct list_head idle_list; -+ -+ /* -+ * Timeout for async/sync requests; when it fires, requests -+ * are served in fifo order. 
-+ */ -+ u64 bfq_fifo_expire[2]; -+ /* weight of backward seeks wrt forward ones */ -+ unsigned int bfq_back_penalty; -+ /* maximum allowed backward seek */ -+ unsigned int bfq_back_max; -+ /* maximum idling time */ -+ u32 bfq_slice_idle; -+ -+ /* user-configured max budget value (0 for auto-tuning) */ -+ int bfq_user_max_budget; -+ /* -+ * Timeout for bfq_queues to consume their budget; used to -+ * prevent seeky queues from imposing long latencies to -+ * sequential or quasi-sequential ones (this also implies that -+ * seeky queues cannot receive guarantees in the service -+ * domain; after a timeout they are charged for the time they -+ * have been in service, to preserve fairness among them, but -+ * without service-domain guarantees). -+ */ -+ unsigned int bfq_timeout; -+ -+ /* -+ * Number of consecutive requests that must be issued within -+ * the idle time slice to set again idling to a queue which -+ * was marked as non-I/O-bound (see the definition of the -+ * IO_bound flag for further details). -+ */ -+ unsigned int bfq_requests_within_timer; -+ -+ /* -+ * Force device idling whenever needed to provide accurate -+ * service guarantees, without caring about throughput -+ * issues. CAVEAT: this may even increase latencies, in case -+ * of useless idling for processes that did stop doing I/O. -+ */ -+ bool strict_guarantees; -+ -+ /* -+ * Last time at which a queue entered the current burst of -+ * queues being activated shortly after each other; for more -+ * details about this and the following parameters related to -+ * a burst of activations, see the comments on the function -+ * bfq_handle_burst. -+ */ -+ unsigned long last_ins_in_burst; -+ /* -+ * Reference time interval used to decide whether a queue has -+ * been activated shortly after @last_ins_in_burst. 
-+ */ -+ unsigned long bfq_burst_interval; -+ /* number of queues in the current burst of queue activations */ -+ int burst_size; -+ -+ /* common parent entity for the queues in the burst */ -+ struct bfq_entity *burst_parent_entity; -+ /* Maximum burst size above which the current queue-activation -+ * burst is deemed as 'large'. -+ */ -+ unsigned long bfq_large_burst_thresh; -+ /* true if a large queue-activation burst is in progress */ -+ bool large_burst; -+ /* -+ * Head of the burst list (as for the above fields, more -+ * details in the comments on the function bfq_handle_burst). -+ */ -+ struct hlist_head burst_list; -+ -+ /* if set to true, low-latency heuristics are enabled */ -+ bool low_latency; -+ /* -+ * Maximum factor by which the weight of a weight-raised queue -+ * is multiplied. -+ */ -+ unsigned int bfq_wr_coeff; -+ /* maximum duration of a weight-raising period (jiffies) */ -+ unsigned int bfq_wr_max_time; -+ -+ /* Maximum weight-raising duration for soft real-time processes */ -+ unsigned int bfq_wr_rt_max_time; -+ /* -+ * Minimum idle period after which weight-raising may be -+ * reactivated for a queue (in jiffies). -+ */ -+ unsigned int bfq_wr_min_idle_time; -+ /* -+ * Minimum period between request arrivals after which -+ * weight-raising may be reactivated for an already busy async -+ * queue (in jiffies). -+ */ -+ unsigned long bfq_wr_min_inter_arr_async; -+ -+ /* Max service-rate for a soft real-time queue, in sectors/sec */ -+ unsigned int bfq_wr_max_softrt_rate; -+ /* -+ * Cached value of the product R*T, used for computing the -+ * maximum duration of weight raising automatically. 
-+ */ -+ u64 RT_prod; -+ /* device-speed class for the low-latency heuristic */ -+ enum bfq_device_speed device_speed; -+ -+ /* fallback dummy bfqq for extreme OOM conditions */ -+ struct bfq_queue oom_bfqq; -+}; -+ -+enum bfqq_state_flags { -+ BFQ_BFQQ_FLAG_just_created = 0, /* queue just allocated */ -+ BFQ_BFQQ_FLAG_busy, /* has requests or is in service */ -+ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ -+ BFQ_BFQQ_FLAG_non_blocking_wait_rq, /* -+ * waiting for a request -+ * without idling the device -+ */ -+ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ -+ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ -+ BFQ_BFQQ_FLAG_has_short_ttime, /* queue has a short think time */ -+ BFQ_BFQQ_FLAG_sync, /* synchronous queue */ -+ BFQ_BFQQ_FLAG_IO_bound, /* -+ * bfqq has timed-out at least once -+ * having consumed at most 2/10 of -+ * its budget -+ */ -+ BFQ_BFQQ_FLAG_in_large_burst, /* -+ * bfqq activated in a large burst, -+ * see comments to bfq_handle_burst. 
-+ */ -+ BFQ_BFQQ_FLAG_softrt_update, /* -+ * may need softrt-next-start -+ * update -+ */ -+ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ -+ BFQ_BFQQ_FLAG_split_coop /* shared bfqq will be split */ -+}; -+ -+#define BFQ_BFQQ_FNS(name) \ -+static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ -+{ \ -+ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \ -+} \ -+static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ -+{ \ -+ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \ -+} \ -+static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ -+{ \ -+ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ -+} -+ -+BFQ_BFQQ_FNS(just_created); -+BFQ_BFQQ_FNS(busy); -+BFQ_BFQQ_FNS(wait_request); -+BFQ_BFQQ_FNS(non_blocking_wait_rq); -+BFQ_BFQQ_FNS(must_alloc); -+BFQ_BFQQ_FNS(fifo_expire); -+BFQ_BFQQ_FNS(has_short_ttime); -+BFQ_BFQQ_FNS(sync); -+BFQ_BFQQ_FNS(IO_bound); -+BFQ_BFQQ_FNS(in_large_burst); -+BFQ_BFQQ_FNS(coop); -+BFQ_BFQQ_FNS(split_coop); -+BFQ_BFQQ_FNS(softrt_update); -+#undef BFQ_BFQQ_FNS -+ -+/* Logging facilities. */ -+#ifdef CONFIG_BFQ_REDIRECT_TO_CONSOLE -+ -+static const char *checked_dev_name(const struct device *dev) -+{ -+ static const char nodev[] = "nodev"; -+ -+ if (dev) -+ return dev_name(dev); -+ -+ return nodev; -+} -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); -+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ -+ char __pbuf[128]; \ -+ \ -+ assert_spin_locked((bfqd)->queue->queue_lock); \ -+ blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ -+ pr_crit("%s bfq%d%c %s " fmt "\n", \ -+ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -+ (bfqq)->pid, \ -+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ __pbuf, ##args); \ -+} while (0) -+ -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) 
do { \ -+ char __pbuf[128]; \ -+ \ -+ blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ -+ pr_crit("%s %s " fmt "\n", \ -+ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -+ __pbuf, ##args); \ -+} while (0) -+ -+#else /* BFQ_GROUP_IOSCHED_ENABLED */ -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ -+ pr_crit("%s bfq%d%c " fmt "\n", \ -+ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -+ (bfqq)->pid, bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ ##args) -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) -+ -+#endif /* BFQ_GROUP_IOSCHED_ENABLED */ -+ -+#define bfq_log(bfqd, fmt, args...) \ -+ pr_crit("%s bfq " fmt "\n", \ -+ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -+ ##args) -+ -+#else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); -+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ -+ char __pbuf[128]; \ -+ \ -+ assert_spin_locked((bfqd)->queue->queue_lock); \ -+ blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ -+ blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, \ -+ (bfqq)->pid, \ -+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ __pbuf, ##args); \ -+} while (0) -+ -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ -+ char __pbuf[128]; \ -+ \ -+ blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ -+ blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \ -+} while (0) -+ -+#else /* BFQ_GROUP_IOSCHED_ENABLED */ -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ -+ blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \ -+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ ##args) -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) -+ -+#endif /* BFQ_GROUP_IOSCHED_ENABLED */ -+ -+#define bfq_log(bfqd, fmt, args...) 
\ -+ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) -+#endif /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ -+ -+/* Expiration reasons. */ -+enum bfqq_expiration { -+ BFQ_BFQQ_TOO_IDLE = 0, /* -+ * queue has been idling for -+ * too long -+ */ -+ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ -+ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ -+ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ -+ BFQ_BFQQ_PREEMPTED /* preemption in progress */ -+}; -+ -+ -+struct bfqg_stats { -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ /* number of ios merged */ -+ struct blkg_rwstat merged; -+ /* total time spent on device in ns, may not be accurate w/ queueing */ -+ struct blkg_rwstat service_time; -+ /* total time spent waiting in scheduler queue in ns */ -+ struct blkg_rwstat wait_time; -+ /* number of IOs queued up */ -+ struct blkg_rwstat queued; -+ /* total disk time and nr sectors dispatched by this group */ -+ struct blkg_stat time; -+ /* sum of number of ios queued across all samples */ -+ struct blkg_stat avg_queue_size_sum; -+ /* count of samples taken for average */ -+ struct blkg_stat avg_queue_size_samples; -+ /* how many times this group has been removed from service tree */ -+ struct blkg_stat dequeue; -+ /* total time spent waiting for it to be assigned a timeslice. */ -+ struct blkg_stat group_wait_time; -+ /* time spent idling for this blkcg_gq */ -+ struct blkg_stat idle_time; -+ /* total time with empty current active q with other requests queued */ -+ struct blkg_stat empty_time; -+ /* fields after this shouldn't be cleared on stat reset */ -+ uint64_t start_group_wait_time; -+ uint64_t start_idle_time; -+ uint64_t start_empty_time; -+ uint16_t flags; -+#endif -+}; -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+/* -+ * struct bfq_group_data - per-blkcg storage for the blkio subsystem. 
-+ * -+ * @ps: @blkcg_policy_storage that this structure inherits -+ * @weight: weight of the bfq_group -+ */ -+struct bfq_group_data { -+ /* must be the first member */ -+ struct blkcg_policy_data pd; -+ -+ unsigned int weight; -+}; -+ -+/** -+ * struct bfq_group - per (device, cgroup) data structure. -+ * @entity: schedulable entity to insert into the parent group sched_data. -+ * @sched_data: own sched_data, to contain child entities (they may be -+ * both bfq_queues and bfq_groups). -+ * @bfqd: the bfq_data for the device this group acts upon. -+ * @async_bfqq: array of async queues for all the tasks belonging to -+ * the group, one queue per ioprio value per ioprio_class, -+ * except for the idle class that has only one queue. -+ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). -+ * @my_entity: pointer to @entity, %NULL for the toplevel group; used -+ * to avoid too many special cases during group creation/ -+ * migration. -+ * @active_entities: number of active entities belonging to the group; -+ * unused for the root group. Used to know whether there -+ * are groups with more than one active @bfq_entity -+ * (see the comments to the function -+ * bfq_bfqq_may_idle()). -+ * @rq_pos_tree: rbtree sorted by next_request position, used when -+ * determining if two or more queues have interleaving -+ * requests (see bfq_find_close_cooperator()). -+ * -+ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup -+ * there is a set of bfq_groups, each one collecting the lower-level -+ * entities belonging to the group that are acting on the same device. -+ * -+ * Locking works as follows: -+ * o @bfqd is protected by the queue lock, RCU is used to access it -+ * from the readers. -+ * o All the other fields are protected by the @bfqd queue lock. 
-+ */ -+struct bfq_group { -+ /* must be the first member */ -+ struct blkg_policy_data pd; -+ -+ struct bfq_entity entity; -+ struct bfq_sched_data sched_data; -+ -+ void *bfqd; -+ -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; -+ struct bfq_queue *async_idle_bfqq; -+ -+ struct bfq_entity *my_entity; -+ -+ int active_entities; -+ -+ struct rb_root rq_pos_tree; -+ -+ struct bfqg_stats stats; -+}; -+ -+#else -+struct bfq_group { -+ struct bfq_sched_data sched_data; -+ -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; -+ struct bfq_queue *async_idle_bfqq; -+ -+ struct rb_root rq_pos_tree; -+}; -+#endif -+ -+static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); -+ -+static unsigned int bfq_class_idx(struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ return bfqq ? bfqq->ioprio_class - 1 : -+ BFQ_DEFAULT_GRP_CLASS - 1; -+} -+ -+static struct bfq_service_tree * -+bfq_entity_service_tree(struct bfq_entity *entity) -+{ -+ struct bfq_sched_data *sched_data = entity->sched_data; -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ unsigned int idx = bfq_class_idx(entity); -+ -+ BUG_ON(idx >= BFQ_IOPRIO_CLASSES); -+ BUG_ON(sched_data == NULL); -+ -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "entity_service_tree %p %d", -+ sched_data->service_tree + idx, idx); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "entity_service_tree %p %d", -+ sched_data->service_tree + idx, idx); -+ } -+#endif -+ return sched_data->service_tree + idx; -+} -+ -+static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) -+{ -+ return bic->bfqq[is_sync]; -+} -+ -+static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, -+ bool is_sync) -+{ -+ bic->bfqq[is_sync] = bfqq; -+} -+ -+static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) -+{ -+ return 
bic->icq.q->elevator->elevator_data; -+} -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ -+static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *group_entity = bfqq->entity.parent; -+ -+ if (!group_entity) -+ group_entity = &bfqq->bfqd->root_group->entity; -+ -+ return container_of(group_entity, struct bfq_group, entity); -+} -+ -+#else -+ -+static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) -+{ -+ return bfqq->bfqd->root_group; -+} -+ -+#endif -+ -+static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); -+static void bfq_put_queue(struct bfq_queue *bfqq); -+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); -+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, -+ struct bio *bio, bool is_sync, -+ struct bfq_io_cq *bic); -+static void bfq_end_wr_async_queues(struct bfq_data *bfqd, -+ struct bfq_group *bfqg); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); -+#endif -+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); -+ -+#endif /* _BFQ_H */ - -From 0bd96428e086fd28800efdf5f0a5f62869af6e30 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Sat, 21 Jan 2017 12:41:14 +0100 -Subject: [PATCH 10/51] Move thinktime from bic to bfqq - -Prep change to make it possible to protect this field with a -scheduler lock. 
- -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 28 ++++++++++++++-------------- - block/bfq-mq.h | 30 ++++++++++++++++-------------- - 2 files changed, 30 insertions(+), 28 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index d1125aee658c..65f5dfb79417 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -698,6 +698,7 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, - if (unlikely(busy)) - old_wr_coeff = bfqq->wr_coeff; - -+ bfqq->ttime = bic->saved_ttime; - bfqq->wr_coeff = bic->saved_wr_coeff; - bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; - BUG_ON(time_is_after_jiffies(bfqq->wr_start_at_switch_to_srt)); -@@ -1287,7 +1288,7 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, - * details on the usage of the next variable. - */ - arrived_in_time = ktime_get_ns() <= -- RQ_BIC(rq)->ttime.last_end_request + -+ bfqq->ttime.last_end_request + - bfqd->bfq_slice_idle * 3; - - bfq_log_bfqq(bfqd, bfqq, -@@ -2048,6 +2049,7 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) - if (!bic) - return; - -+ bic->saved_ttime = bfqq->ttime; - bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq); - bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); - bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); -@@ -3948,11 +3950,6 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) - bfq_put_queue(bfqq); /* release process reference */ - } - --static void bfq_init_icq(struct io_cq *icq) --{ -- icq_to_bic(icq)->ttime.last_end_request = ktime_get_ns() - (1ULL<<32); --} -- - static void bfq_exit_icq(struct io_cq *icq) - { - struct bfq_io_cq *bic = icq_to_bic(icq); -@@ -4084,6 +4081,9 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bfq_mark_bfqq_just_created(bfqq); - } else - bfq_clear_bfqq_sync(bfqq); -+ -+ bfqq->ttime.last_end_request = ktime_get_ns() - (1ULL<<32); -+ 
- bfq_mark_bfqq_IO_bound(bfqq); - - /* Tentative initial value to trade off between thr and lat */ -@@ -4191,14 +4191,14 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, - } - - static void bfq_update_io_thinktime(struct bfq_data *bfqd, -- struct bfq_io_cq *bic) -+ struct bfq_queue *bfqq) - { -- struct bfq_ttime *ttime = &bic->ttime; -- u64 elapsed = ktime_get_ns() - bic->ttime.last_end_request; -+ struct bfq_ttime *ttime = &bfqq->ttime; -+ u64 elapsed = ktime_get_ns() - bfqq->ttime.last_end_request; - - elapsed = min_t(u64, elapsed, 2 * bfqd->bfq_slice_idle); - -- ttime->ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; -+ ttime->ttime_samples = (7*bfqq->ttime.ttime_samples + 256) / 8; - ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8); - ttime->ttime_mean = div64_ul(ttime->ttime_total + 128, - ttime->ttime_samples); -@@ -4240,8 +4240,8 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd, - * decide whether to mark as has_short_ttime - */ - if (atomic_read(&bic->icq.ioc->active_ref) == 0 || -- (bfq_sample_valid(bic->ttime.ttime_samples) && -- bic->ttime.ttime_mean > bfqd->bfq_slice_idle)) -+ (bfq_sample_valid(bfqq->ttime.ttime_samples) && -+ bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle)) - has_short_ttime = false; - - bfq_log_bfqq(bfqd, bfqq, "update_has_short_ttime: has_short_ttime %d", -@@ -4265,7 +4265,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, - if (rq->cmd_flags & REQ_META) - bfqq->meta_pending++; - -- bfq_update_io_thinktime(bfqd, bic); -+ bfq_update_io_thinktime(bfqd, bfqq); - bfq_update_has_short_ttime(bfqd, bfqq, bic); - bfq_update_io_seektime(bfqd, bfqq, rq); - -@@ -4436,7 +4436,7 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) - - now_ns = ktime_get_ns(); - -- RQ_BIC(rq)->ttime.last_end_request = now_ns; -+ bfqq->ttime.last_end_request = now_ns; - - /* - * Using us instead of ns, to get a reasonable precision in -diff --git 
a/block/bfq-mq.h b/block/bfq-mq.h -index 53954d1b87f8..0f51f270469c 100644 ---- a/block/bfq-mq.h -+++ b/block/bfq-mq.h -@@ -210,6 +210,18 @@ struct bfq_entity { - struct bfq_group; - - /** -+ * struct bfq_ttime - per process thinktime stats. -+ */ -+struct bfq_ttime { -+ u64 last_end_request; /* completion time of last request */ -+ -+ u64 ttime_total; /* total process thinktime */ -+ unsigned long ttime_samples; /* number of thinktime samples */ -+ u64 ttime_mean; /* average process thinktime */ -+ -+}; -+ -+/** - * struct bfq_queue - leaf schedulable entity. - * - * A bfq_queue is a leaf request queue; it can be associated with an -@@ -270,6 +282,9 @@ struct bfq_queue { - /* node for active/idle bfqq list inside parent bfqd */ - struct list_head bfqq_list; - -+ /* associated @bfq_ttime struct */ -+ struct bfq_ttime ttime; -+ - /* bit vector: a 1 for each seeky requests in history */ - u32 seek_history; - -@@ -333,18 +348,6 @@ struct bfq_queue { - }; - - /** -- * struct bfq_ttime - per process thinktime stats. -- */ --struct bfq_ttime { -- u64 last_end_request; /* completion time of last request */ -- -- u64 ttime_total; /* total process thinktime */ -- unsigned long ttime_samples; /* number of thinktime samples */ -- u64 ttime_mean; /* average process thinktime */ -- --}; -- --/** - * struct bfq_io_cq - per (request_queue, io_context) structure. 
- */ - struct bfq_io_cq { -@@ -352,8 +355,6 @@ struct bfq_io_cq { - struct io_cq icq; /* must be the first member */ - /* array of two process queues, the sync and the async */ - struct bfq_queue *bfqq[2]; -- /* associated @bfq_ttime struct */ -- struct bfq_ttime ttime; - /* per (request_queue, blkcg) ioprio */ - int ioprio; - #ifdef BFQ_GROUP_IOSCHED_ENABLED -@@ -390,6 +391,7 @@ struct bfq_io_cq { - unsigned long saved_last_wr_start_finish; - unsigned long saved_wr_start_at_switch_to_srt; - unsigned int saved_wr_cur_max_time; -+ struct bfq_ttime saved_ttime; - }; - - enum bfq_device_speed { - -From 351a9aea7c0c9c30edacdbf2a3c0d089470de1e8 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Wed, 18 Jan 2017 11:42:22 +0100 -Subject: [PATCH 11/51] Embed bfq-ioc.c and add locking on request queue - -The version of bfq-ioc.c for bfq-iosched.c is not correct any more for -bfq-mq, because, in bfq-mq, the request queue lock is not being held -when bfq_bic_lookup is invoked. That function must then take that look -on its own. This commit removes the inclusion of bfq-ioc.c, copies the -content of bfq-ioc.c into bfq-mq-iosched.c, and adds the grabbing of -the lock. - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 39 ++++++++++++++++++++++++++++++++++++--- - 1 file changed, 36 insertions(+), 3 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 65f5dfb79417..756a618d5902 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -195,7 +195,39 @@ static int device_speed_thresh[2]; - - static void bfq_schedule_dispatch(struct bfq_data *bfqd); - --#include "bfq-ioc.c" -+/** -+ * icq_to_bic - convert iocontext queue structure to bfq_io_cq. -+ * @icq: the iocontext queue. 
-+ */ -+static struct bfq_io_cq *icq_to_bic(struct io_cq *icq) -+{ -+ /* bic->icq is the first member, %NULL will convert to %NULL */ -+ return container_of(icq, struct bfq_io_cq, icq); -+} -+ -+/** -+ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd. -+ * @bfqd: the lookup key. -+ * @ioc: the io_context of the process doing I/O. -+ * @q: the request queue. -+ */ -+static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, -+ struct io_context *ioc, -+ struct request_queue *q) -+{ -+ if (ioc) { -+ struct bfq_io_cq *icq; -+ -+ spin_lock_irq(q->queue_lock); -+ icq = icq_to_bic(ioc_lookup_icq(ioc, q)); -+ spin_unlock_irq(q->queue_lock); -+ -+ return icq; -+ } -+ -+ return NULL; -+} -+ - #include "bfq-sched.c" - #include "bfq-cgroup-included.c" - -@@ -1520,13 +1552,14 @@ static void bfq_add_request(struct request *rq) - } - - static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, -- struct bio *bio) -+ struct bio *bio, -+ struct request_queue *q) - { - struct task_struct *tsk = current; - struct bfq_io_cq *bic; - struct bfq_queue *bfqq; - -- bic = bfq_bic_lookup(bfqd, tsk->io_context); -+ bic = bfq_bic_lookup(bfqd, tsk->io_context, q); - if (!bic) - return NULL; - - -From ed0d64e27b2308813a2a846139e405e0479f0849 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Tue, 20 Dec 2016 09:07:19 +0100 -Subject: [PATCH 12/51] Modify interface and operation to comply with - blk-mq-sched - -As for modifications of the operation, the major changes are the introduction -of a scheduler lock, and the moving to deferred work of the body of the hook -exit_icq. 
The latter change has been made to avoid deadlocks caused by the -combination of the following facts: 1) such a body takes the scheduler lock, -and, if not deferred, 2) it does so from inside the exit_icq hook, which is -invoked with the queue lock held, and 3) there is at least one code path, -namely that starting from bfq_bio_merge, which takes these locks in the -opposite order. - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-cgroup-included.c | 4 - - block/bfq-mq-iosched.c | 695 ++++++++++++++++++++++++-------------------- - block/bfq-mq.h | 35 +-- - 3 files changed, 394 insertions(+), 340 deletions(-) - -diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c -index 9c483b658179..8a73de76f32b 100644 ---- a/block/bfq-cgroup-included.c -+++ b/block/bfq-cgroup-included.c -@@ -472,8 +472,6 @@ static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, - struct bfq_group *bfqg, *parent; - struct bfq_entity *entity; - -- assert_spin_locked(bfqd->queue->queue_lock); -- - bfqg = bfq_lookup_bfqg(bfqd, blkcg); - - if (unlikely(!bfqg)) -@@ -602,8 +600,6 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, - struct bfq_group *bfqg; - struct bfq_entity *entity; - -- lockdep_assert_held(bfqd->queue->queue_lock); -- - bfqg = bfq_find_set_group(bfqd, blkcg); - - if (unlikely(!bfqg)) -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 756a618d5902..c963d92a32c2 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -81,7 +81,13 @@ - #include <linux/jiffies.h> - #include <linux/rbtree.h> - #include <linux/ioprio.h> -+#include <linux/sbitmap.h> -+#include <linux/delay.h> -+ - #include "blk.h" -+#include "blk-mq.h" -+#include "blk-mq-tag.h" -+#include "blk-mq-sched.h" - #undef CONFIG_BFQ_GROUP_IOSCHED /* cgroups support not yet functional */ - #include "bfq-mq.h" - -@@ -193,8 +199,6 @@ static int device_speed_thresh[2]; - #define RQ_BIC(rq) ((struct bfq_io_cq *) 
(rq)->elv.priv[0]) - #define RQ_BFQQ(rq) ((rq)->elv.priv[1]) - --static void bfq_schedule_dispatch(struct bfq_data *bfqd); -- - /** - * icq_to_bic - convert iocontext queue structure to bfq_io_cq. - * @icq: the iocontext queue. -@@ -216,11 +220,12 @@ static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, - struct request_queue *q) - { - if (ioc) { -+ unsigned long flags; - struct bfq_io_cq *icq; - -- spin_lock_irq(q->queue_lock); -+ spin_lock_irqsave(q->queue_lock, flags); - icq = icq_to_bic(ioc_lookup_icq(ioc, q)); -- spin_unlock_irq(q->queue_lock); -+ spin_unlock_irqrestore(q->queue_lock, flags); - - return icq; - } -@@ -244,7 +249,7 @@ static void bfq_schedule_dispatch(struct bfq_data *bfqd) - { - if (bfqd->queued != 0) { - bfq_log(bfqd, "schedule dispatch"); -- kblockd_schedule_work(&bfqd->unplug_work); -+ blk_mq_run_hw_queues(bfqd->queue, true); - } - } - -@@ -768,9 +773,7 @@ static int bfqq_process_refs(struct bfq_queue *bfqq) - { - int process_refs, io_refs; - -- lockdep_assert_held(bfqq->bfqd->queue->queue_lock); -- -- io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; -+ io_refs = bfqq->allocated; - process_refs = bfqq->ref - io_refs - bfqq->entity.on_st; - BUG_ON(process_refs < 0); - return process_refs; -@@ -1584,6 +1587,7 @@ static sector_t get_sdist(sector_t last_pos, struct request *rq) - return sdist; - } - -+#if 0 /* Still not clear if we can do without next two functions */ - static void bfq_activate_request(struct request_queue *q, struct request *rq) - { - struct bfq_data *bfqd = q->elevator->elevator_data; -@@ -1597,8 +1601,10 @@ static void bfq_deactivate_request(struct request_queue *q, struct request *rq) - BUG_ON(bfqd->rq_in_driver == 0); - bfqd->rq_in_driver--; - } -+#endif - --static void bfq_remove_request(struct request *rq) -+static void bfq_remove_request(struct request_queue *q, -+ struct request *rq) - { - struct bfq_queue *bfqq = RQ_BFQQ(rq); - struct bfq_data *bfqd = bfqq->bfqd; -@@ -1619,6 +1625,10 @@ static void 
bfq_remove_request(struct request *rq) - bfqd->queued--; - elv_rb_del(&bfqq->sort_list, rq); - -+ elv_rqhash_del(q, rq); -+ if (q->last_merge == rq) -+ q->last_merge = NULL; -+ - if (RB_EMPTY_ROOT(&bfqq->sort_list)) { - bfqq->next_rq = NULL; - -@@ -1659,13 +1669,36 @@ static void bfq_remove_request(struct request *rq) - bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); - } - --static enum elv_merge bfq_merge(struct request_queue *q, struct request **req, -- struct bio *bio) -+static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) -+{ -+ struct request_queue *q = hctx->queue; -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct request *free = NULL; -+ bool ret; -+ -+ spin_lock_irq(&bfqd->lock); -+ ret = blk_mq_sched_try_merge(q, bio, &free); -+ -+ /* -+ * XXX Not yet freeing without lock held, to avoid an -+ * inconsistency with respect to the lock-protected invocation -+ * of blk_mq_sched_try_insert_merge in bfq_bio_merge. Waiting -+ * for clarifications from Jens. 
-+ */ -+ if (free) -+ blk_mq_free_request(free); -+ spin_unlock_irq(&bfqd->lock); -+ -+ return ret; -+} -+ -+static int bfq_request_merge(struct request_queue *q, struct request **req, -+ struct bio *bio) - { - struct bfq_data *bfqd = q->elevator->elevator_data; - struct request *__rq; - -- __rq = bfq_find_rq_fmerge(bfqd, bio); -+ __rq = bfq_find_rq_fmerge(bfqd, bio, q); - if (__rq && elv_bio_merge_ok(__rq, bio)) { - *req = __rq; - return ELEVATOR_FRONT_MERGE; -@@ -1674,7 +1707,7 @@ static enum elv_merge bfq_merge(struct request_queue *q, struct request **req, - return ELEVATOR_NO_MERGE; - } - --static void bfq_merged_request(struct request_queue *q, struct request *req, -+static void bfq_request_merged(struct request_queue *q, struct request *req, - enum elv_merge type) - { - if (type == ELEVATOR_FRONT_MERGE && -@@ -1689,6 +1722,8 @@ static void bfq_merged_request(struct request_queue *q, struct request *req, - /* Reposition request in its sort_list */ - elv_rb_del(&bfqq->sort_list, req); - elv_rb_add(&bfqq->sort_list, req); -+ -+ spin_lock_irq(&bfqd->lock); - /* Choose next request to be served for bfqq */ - prev = bfqq->next_rq; - next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, -@@ -1704,22 +1739,19 @@ static void bfq_merged_request(struct request_queue *q, struct request *req, - bfq_updated_next_req(bfqd, bfqq); - bfq_pos_tree_add_move(bfqd, bfqq); - } -+ spin_unlock_irq(&bfqd->lock); - } - } - --#ifdef BFQ_GROUP_IOSCHED_ENABLED --static void bfq_bio_merged(struct request_queue *q, struct request *req, -- struct bio *bio) --{ -- bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio->bi_opf); --} --#endif -- --static void bfq_merged_requests(struct request_queue *q, struct request *rq, -+static void bfq_requests_merged(struct request_queue *q, struct request *rq, - struct request *next) - { - struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next); - -+ if (!RB_EMPTY_NODE(&rq->rb_node)) -+ goto end; -+ spin_lock_irq(&bfqq->bfqd->lock); -+ - /* - 
* If next and rq belong to the same bfq_queue and next is older - * than rq, then reposition rq in the fifo (by substituting next -@@ -1740,7 +1772,10 @@ static void bfq_merged_requests(struct request_queue *q, struct request *rq, - if (bfqq->next_rq == next) - bfqq->next_rq = rq; - -- bfq_remove_request(next); -+ bfq_remove_request(q, next); -+ -+ spin_unlock_irq(&bfqq->bfqd->lock); -+end: - bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); - } - -@@ -1786,7 +1821,7 @@ static void bfq_end_wr(struct bfq_data *bfqd) - { - struct bfq_queue *bfqq; - -- spin_lock_irq(bfqd->queue->queue_lock); -+ spin_lock_irq(&bfqd->lock); - - list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) - bfq_bfqq_end_wr(bfqq); -@@ -1794,7 +1829,7 @@ static void bfq_end_wr(struct bfq_data *bfqd) - bfq_bfqq_end_wr(bfqq); - bfq_end_wr_async(bfqd); - -- spin_unlock_irq(bfqd->queue->queue_lock); -+ spin_unlock_irq(&bfqd->lock); - } - - static sector_t bfq_io_struct_pos(void *io_struct, bool request) -@@ -2184,8 +2219,8 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, - bfq_put_queue(bfqq); - } - --static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq, -- struct bio *bio) -+static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, -+ struct bio *bio) - { - struct bfq_data *bfqd = q->elevator->elevator_data; - bool is_sync = op_is_sync(bio->bi_opf); -@@ -2203,7 +2238,7 @@ static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq, - * merge only if rq is queued there. - * Queue lock is held here. 
- */ -- bic = bfq_bic_lookup(bfqd, current->io_context); -+ bic = bfq_bic_lookup(bfqd, current->io_context, q); - if (!bic) - return false; - -@@ -2228,12 +2263,6 @@ static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq, - return bfqq == RQ_BFQQ(rq); - } - --static int bfq_allow_rq_merge(struct request_queue *q, struct request *rq, -- struct request *next) --{ -- return RQ_BFQQ(rq) == RQ_BFQQ(next); --} -- - /* - * Set the maximum time for the in-service queue to consume its - * budget. This prevents seeky processes from lowering the throughput. -@@ -2264,7 +2293,6 @@ static void __bfq_set_in_service_queue(struct bfq_data *bfqd, - { - if (bfqq) { - bfqg_stats_update_avg_queue_size(bfqq_group(bfqq)); -- bfq_mark_bfqq_must_alloc(bfqq); - bfq_clear_bfqq_fifo_expire(bfqq); - - bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; -@@ -2703,27 +2731,28 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) - } - - /* -- * Move request from internal lists to the dispatch list of the request queue -+ * Remove request from internal lists. - */ --static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) -+static void bfq_dispatch_remove(struct request_queue *q, struct request *rq) - { - struct bfq_queue *bfqq = RQ_BFQQ(rq); - - /* -- * For consistency, the next instruction should have been executed -- * after removing the request from the queue and dispatching it. -- * We execute instead this instruction before bfq_remove_request() -- * (and hence introduce a temporary inconsistency), for efficiency. -- * In fact, in a forced_dispatch, this prevents two counters related -- * to bfqq->dispatched to risk to be uselessly decremented if bfqq -- * is not in service, and then to be incremented again after -- * incrementing bfqq->dispatched. -+ * For consistency, the next instruction should have been -+ * executed after removing the request from the queue and -+ * dispatching it. 
We execute instead this instruction before -+ * bfq_remove_request() (and hence introduce a temporary -+ * inconsistency), for efficiency. In fact, should this -+ * dispatch occur for a non in-service bfqq, this anticipated -+ * increment prevents two counters related to bfqq->dispatched -+ * from risking to be, first, uselessly decremented, and then -+ * incremented again when the (new) value of bfqq->dispatched -+ * happens to be taken into account. - */ - bfqq->dispatched++; - bfq_update_peak_rate(q->elevator->elevator_data, rq); - -- bfq_remove_request(rq); -- elv_dispatch_sort(q, rq); -+ bfq_remove_request(q, rq); - } - - static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) -@@ -3605,7 +3634,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); - - if (bfq_may_expire_for_budg_timeout(bfqq) && -- !hrtimer_active(&bfqd->idle_slice_timer) && -+ !bfq_bfqq_wait_request(bfqq) && - !bfq_bfqq_must_idle(bfqq)) - goto expire; - -@@ -3641,7 +3670,6 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - * arrives. - */ - if (bfq_bfqq_wait_request(bfqq)) { -- BUG_ON(!hrtimer_active(&bfqd->idle_slice_timer)); - /* - * If we get here: 1) at least a new request - * has arrived but we have not disabled the -@@ -3668,7 +3696,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - * for a new request, or has requests waiting for a completion and - * may idle after their completion, then keep it anyway. - */ -- if (hrtimer_active(&bfqd->idle_slice_timer) || -+ if (bfq_bfqq_wait_request(bfqq) || - (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) { - bfqq = NULL; - goto keep_queue; -@@ -3753,13 +3781,11 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) - } - - /* -- * Dispatch one request from bfqq, moving it to the request queue -- * dispatch list. -+ * Dispatch next request from bfqq. 
- */ --static int bfq_dispatch_request(struct bfq_data *bfqd, -- struct bfq_queue *bfqq) -+static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) - { -- int dispatched = 0; - struct request *rq = bfqq->next_rq; - unsigned long service_to_charge; - -@@ -3775,7 +3801,7 @@ static int bfq_dispatch_request(struct bfq_data *bfqd, - - BUG_ON(bfqq->entity.budget < bfqq->entity.service); - -- bfq_dispatch_insert(bfqd->queue, rq); -+ bfq_dispatch_remove(bfqd->queue, rq); - - /* - * If weight raising has to terminate for bfqq, then next -@@ -3791,86 +3817,61 @@ static int bfq_dispatch_request(struct bfq_data *bfqd, - bfq_update_wr_data(bfqd, bfqq); - - bfq_log_bfqq(bfqd, bfqq, -- "dispatched %u sec req (%llu), budg left %d", -+ "dispatched %u sec req (%llu), budg left %d, new disp_nr %d", - blk_rq_sectors(rq), - (unsigned long long) blk_rq_pos(rq), -- bfq_bfqq_budget_left(bfqq)); -- -- dispatched++; -+ bfq_bfqq_budget_left(bfqq), -+ bfqq->dispatched); - - if (!bfqd->in_service_bic) { - atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); - bfqd->in_service_bic = RQ_BIC(rq); - } - -+ /* -+ * Expire bfqq, pretending that its budget expired, if bfqq -+ * belongs to CLASS_IDLE and other queues are waiting for -+ * service. -+ */ - if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq)) - goto expire; - -- return dispatched; -+ return rq; - - expire: - bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_BUDGET_EXHAUSTED); -- return dispatched; --} -- --static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq) --{ -- int dispatched = 0; -- -- while (bfqq->next_rq) { -- bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq); -- dispatched++; -- } -- -- BUG_ON(!list_empty(&bfqq->fifo)); -- return dispatched; -+ return rq; - } - --/* -- * Drain our current requests. -- * Used for barriers and when switching io schedulers on-the-fly. 
-- */ --static int bfq_forced_dispatch(struct bfq_data *bfqd) -+static bool bfq_has_work(struct blk_mq_hw_ctx *hctx) - { -- struct bfq_queue *bfqq, *n; -- struct bfq_service_tree *st; -- int dispatched = 0; -- -- bfqq = bfqd->in_service_queue; -- if (bfqq) -- __bfq_bfqq_expire(bfqd, bfqq); -+ struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; - - /* -- * Loop through classes, and be careful to leave the scheduler -- * in a consistent state, as feedback mechanisms and vtime -- * updates cannot be disabled during the process. -+ * Avoiding lock: a race on bfqd->busy_queues should cause at -+ * most a call to dispatch for nothing - */ -- list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) { -- st = bfq_entity_service_tree(&bfqq->entity); -- -- dispatched += __bfq_forced_dispatch_bfqq(bfqq); -- -- bfqq->max_budget = bfq_max_budget(bfqd); -- bfq_forget_idle(st); -- } -- -- BUG_ON(bfqd->busy_queues != 0); -- -- return dispatched; -+ return !list_empty_careful(&bfqd->dispatch) || -+ bfqd->busy_queues > 0; - } - --static int bfq_dispatch_requests(struct request_queue *q, int force) -+static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - { -- struct bfq_data *bfqd = q->elevator->elevator_data; -- struct bfq_queue *bfqq; -+ struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; -+ struct request *rq = NULL; -+ struct bfq_queue *bfqq = NULL; -+ -+ if (!list_empty(&bfqd->dispatch)) { -+ rq = list_first_entry(&bfqd->dispatch, struct request, -+ queuelist); -+ list_del_init(&rq->queuelist); -+ goto exit; -+ } - - bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); - - if (bfqd->busy_queues == 0) -- return 0; -- -- if (unlikely(force)) -- return bfq_forced_dispatch(bfqd); -+ goto exit; - - /* - * Force device to serve one request at a time if -@@ -3885,25 +3886,39 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) - * throughput. 
- */ - if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0) -- return 0; -+ goto exit; - - bfqq = bfq_select_queue(bfqd); - if (!bfqq) -- return 0; -+ goto exit; - - BUG_ON(bfqq->entity.budget < bfqq->entity.service); - - BUG_ON(bfq_bfqq_wait_request(bfqq)); - -- if (!bfq_dispatch_request(bfqd, bfqq)) -- return 0; -- -- bfq_log_bfqq(bfqd, bfqq, "dispatched %s request", -- bfq_bfqq_sync(bfqq) ? "sync" : "async"); -+ rq = bfq_dispatch_rq_from_bfqq(bfqd, bfqq); - - BUG_ON(bfqq->next_rq == NULL && - bfqq->entity.budget < bfqq->entity.service); -- return 1; -+exit: -+ if (rq) { -+ rq->rq_flags |= RQF_STARTED; -+ bfqd->rq_in_driver++; -+ } -+ -+ return rq; -+} -+ -+static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) -+{ -+ struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; -+ struct request *rq; -+ -+ spin_lock_irq(&bfqd->lock); -+ rq = __bfq_dispatch_request(hctx); -+ spin_unlock_irq(&bfqd->lock); -+ -+ return rq; - } - - /* -@@ -3921,13 +3936,14 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - - BUG_ON(bfqq->ref <= 0); - -- bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref); -+ if (bfqq->bfqd) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref); -+ - bfqq->ref--; - if (bfqq->ref) - return; - - BUG_ON(rb_first(&bfqq->sort_list)); -- BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); - BUG_ON(bfqq->entity.tree); - BUG_ON(bfq_bfqq_busy(bfqq)); - -@@ -3942,7 +3958,8 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - */ - hlist_del_init(&bfqq->burst_list_node); - -- bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); -+ if (bfqq->bfqd) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); - - kmem_cache_free(bfq_pool, bfqq); - #ifdef BFQ_GROUP_IOSCHED_ENABLED -@@ -3983,29 +4000,52 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) - bfq_put_queue(bfqq); /* release process reference */ - } - --static void bfq_exit_icq(struct io_cq *icq) 
-+static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) - { -- struct bfq_io_cq *bic = icq_to_bic(icq); -- struct bfq_data *bfqd = bic_to_bfqd(bic); -+ struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync); -+ struct bfq_data *bfqd; - -- if (bic_to_bfqq(bic, false)) { -- bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, false)); -- bic_set_bfqq(bic, NULL, false); -- } -+ if (bfqq) -+ bfqd = bfqq->bfqd; /* NULL if scheduler already exited */ - -- if (bic_to_bfqq(bic, true)) { -+ if (bfqq && bfqd) { -+ spin_lock_irq(&bfqd->lock); - /* - * If the bic is using a shared queue, put the reference - * taken on the io_context when the bic started using a - * shared bfq_queue. - */ -- if (bfq_bfqq_coop(bic_to_bfqq(bic, true))) -- put_io_context(icq->ioc); -- bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, true)); -- bic_set_bfqq(bic, NULL, true); -+ if (is_sync && bfq_bfqq_coop(bfqq)) -+ put_io_context(bic->icq.ioc); -+ bfq_exit_bfqq(bfqd, bfqq); -+ bic_set_bfqq(bic, NULL, is_sync); -+ spin_unlock_irq(&bfqd->lock); - } - } - -+static void bfq_exit_icq_body(struct work_struct *work) -+{ -+ struct bfq_io_cq *bic = -+ container_of(work, struct bfq_io_cq, exit_icq_work); -+ -+ bfq_exit_icq_bfqq(bic, true); -+ bfq_exit_icq_bfqq(bic, false); -+} -+ -+static void bfq_init_icq(struct io_cq *icq) -+{ -+ struct bfq_io_cq *bic = icq_to_bic(icq); -+ -+ INIT_WORK(&bic->exit_icq_work, bfq_exit_icq_body); -+} -+ -+static void bfq_exit_icq(struct io_cq *icq) -+{ -+ struct bfq_io_cq *bic = icq_to_bic(icq); -+ -+ kblockd_schedule_work(&bic->exit_icq_work); -+} -+ - /* - * Update the entity prio values; note that the new values will not - * be used until the next (re)activation. 
-@@ -4015,6 +4055,10 @@ static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, - { - struct task_struct *tsk = current; - int ioprio_class; -+ struct bfq_data *bfqd = bfqq->bfqd; -+ -+ if (!bfqd) -+ return; - - ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); - switch (ioprio_class) { -@@ -4095,6 +4139,8 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, - INIT_HLIST_NODE(&bfqq->burst_list_node); - BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); - -+ spin_lock_init(&bfqq->lock); -+ - bfqq->ref = 0; - bfqq->bfqd = bfqd; - -@@ -4351,22 +4397,13 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, - if (budget_timeout) - bfq_bfqq_expire(bfqd, bfqq, false, - BFQ_BFQQ_BUDGET_TIMEOUT); -- -- /* -- * Let the request rip immediately, or let a new queue be -- * selected if bfqq has just been expired. -- */ -- __blk_run_queue(bfqd->queue); - } - } - --static void bfq_insert_request(struct request_queue *q, struct request *rq) -+static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) - { -- struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; - -- assert_spin_locked(bfqd->queue->queue_lock); -- - /* - * An unplug may trigger a requeue of a request from the device - * driver: make sure we are in process context while trying to -@@ -4381,8 +4418,8 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq) - * Release the request's reference to the old bfqq - * and make sure one is taken to the shared queue. 
- */ -- new_bfqq->allocated[rq_data_dir(rq)]++; -- bfqq->allocated[rq_data_dir(rq)]--; -+ new_bfqq->allocated++; -+ bfqq->allocated--; - new_bfqq->ref++; - bfq_clear_bfqq_just_created(bfqq); - if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) -@@ -4406,6 +4443,55 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq) - bfq_rq_enqueued(bfqd, bfqq, rq); - } - -+static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, -+ bool at_head) -+{ -+ struct request_queue *q = hctx->queue; -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ -+ spin_lock_irq(&bfqd->lock); -+ if (blk_mq_sched_try_insert_merge(q, rq)) -+ goto done; -+ spin_unlock_irq(&bfqd->lock); -+ -+ blk_mq_sched_request_inserted(rq); -+ -+ spin_lock_irq(&bfqd->lock); -+ if (at_head || blk_rq_is_passthrough(rq)) { -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ -+ if (at_head) -+ list_add(&rq->queuelist, &bfqd->dispatch); -+ else -+ list_add_tail(&rq->queuelist, &bfqd->dispatch); -+ -+ if (bfqq) -+ bfqq->dispatched++; -+ } else { -+ __bfq_insert_request(bfqd, rq); -+ -+ if (rq_mergeable(rq)) { -+ elv_rqhash_add(q, rq); -+ if (!q->last_merge) -+ q->last_merge = rq; -+ } -+ } -+done: -+ spin_unlock_irq(&bfqd->lock); -+} -+ -+static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, -+ struct list_head *list, bool at_head) -+{ -+ while (!list_empty(list)) { -+ struct request *rq; -+ -+ rq = list_first_entry(list, struct request, queuelist); -+ list_del_init(&rq->queuelist); -+ bfq_insert_request(hctx, rq, at_head); -+ } -+} -+ - static void bfq_update_hw_tag(struct bfq_data *bfqd) - { - bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver, -@@ -4431,27 +4517,17 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd) - bfqd->hw_tag_samples = 0; - } - --static void bfq_completed_request(struct request_queue *q, struct request *rq) -+static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) - { -- struct bfq_queue *bfqq = RQ_BFQQ(rq); -- struct 
bfq_data *bfqd = bfqq->bfqd; - u64 now_ns; - u32 delta_us; - -- bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left", -- blk_rq_sectors(rq)); -- -- assert_spin_locked(bfqd->queue->queue_lock); - bfq_update_hw_tag(bfqd); - - BUG_ON(!bfqd->rq_in_driver); - BUG_ON(!bfqq->dispatched); - bfqd->rq_in_driver--; - bfqq->dispatched--; -- bfqg_stats_update_completion(bfqq_group(bfqq), -- rq_start_time_ns(rq), -- rq_io_start_time_ns(rq), -- rq->cmd_flags); - - if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { - BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); -@@ -4477,7 +4553,8 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) - */ - delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC); - -- bfq_log(bfqd, "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu", -+ bfq_log_bfqq(bfqd, bfqq, -+ "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu", - delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size, - (USEC_PER_SEC* - (u64)((bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us)) -@@ -4527,7 +4604,7 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) - if (bfqd->in_service_queue == bfqq) { - if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) { - bfq_arm_slice_timer(bfqd); -- goto out; -+ return; - } else if (bfq_may_expire_for_budg_timeout(bfqq)) - bfq_bfqq_expire(bfqd, bfqq, false, - BFQ_BFQQ_BUDGET_TIMEOUT); -@@ -4537,68 +4614,55 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) - bfq_bfqq_expire(bfqd, bfqq, false, - BFQ_BFQQ_NO_MORE_REQUESTS); - } -- -- if (!bfqd->rq_in_driver) -- bfq_schedule_dispatch(bfqd); -- --out: -- return; - } - --static int __bfq_may_queue(struct bfq_queue *bfqq) -+static void bfq_put_rq_priv_body(struct bfq_queue *bfqq) - { -- if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) { -- bfq_clear_bfqq_must_alloc(bfqq); -- return ELV_MQUEUE_MUST; -- } -+ bfqq->allocated--; - -- return ELV_MQUEUE_MAY; -+ 
bfq_put_queue(bfqq); - } - --static int bfq_may_queue(struct request_queue *q, unsigned int op) -+static void bfq_put_rq_private(struct request_queue *q, struct request *rq) - { -- struct bfq_data *bfqd = q->elevator->elevator_data; -- struct task_struct *tsk = current; -- struct bfq_io_cq *bic; -- struct bfq_queue *bfqq; -- -- /* -- * Don't force setup of a queue from here, as a call to may_queue -- * does not necessarily imply that a request actually will be -- * queued. So just lookup a possibly existing queue, or return -- * 'may queue' if that fails. -- */ -- bic = bfq_bic_lookup(bfqd, tsk->io_context); -- if (!bic) -- return ELV_MQUEUE_MAY; -- -- bfqq = bic_to_bfqq(bic, op_is_sync(op)); -- if (bfqq) -- return __bfq_may_queue(bfqq); -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ struct bfq_data *bfqd = bfqq->bfqd; - -- return ELV_MQUEUE_MAY; --} -+ if (rq->rq_flags & RQF_STARTED) -+ bfqg_stats_update_completion(bfqq_group(bfqq), -+ rq_start_time_ns(rq), -+ rq_io_start_time_ns(rq), -+ rq->cmd_flags); - --/* -- * Queue lock held here. -- */ --static void bfq_put_request(struct request *rq) --{ -- struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ if (likely(rq->rq_flags & RQF_STARTED)) { -+ unsigned long flags; - -- if (bfqq) { -- const int rw = rq_data_dir(rq); -+ spin_lock_irqsave(&bfqd->lock, flags); - -- BUG_ON(!bfqq->allocated[rw]); -- bfqq->allocated[rw]--; -+ bfq_completed_request(bfqq, bfqd); -+ bfq_put_rq_priv_body(bfqq); - -- rq->elv.priv[0] = NULL; -- rq->elv.priv[1] = NULL; -+ spin_unlock_irqrestore(&bfqd->lock, flags); -+ } else { -+ /* -+ * Request rq may be still/already in the scheduler, -+ * in which case we need to remove it. And we cannot -+ * defer such a check and removal, to avoid -+ * inconsistencies in the time interval from the end -+ * of this function to the start of the deferred work. 
-+ * Fortunately, this situation occurs only in process -+ * context, so taking the scheduler lock does not -+ * cause any deadlock, even if other locks are already -+ * (correctly) held by this process. -+ */ - -- bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", -- bfqq, bfqq->ref); -- bfq_put_queue(bfqq); -+ if (!RB_EMPTY_NODE(&rq->rb_node)) -+ bfq_remove_request(q, rq); -+ bfq_put_rq_priv_body(bfqq); - } -+ -+ rq->elv.priv[0] = NULL; -+ rq->elv.priv[1] = NULL; - } - - /* -@@ -4630,18 +4694,16 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) - /* - * Allocate bfq data structures associated with this request. - */ --static int bfq_set_request(struct request_queue *q, struct request *rq, -- struct bio *bio, gfp_t gfp_mask) -+static int bfq_get_rq_private(struct request_queue *q, struct request *rq, -+ struct bio *bio) - { - struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq); -- const int rw = rq_data_dir(rq); - const int is_sync = rq_is_sync(rq); - struct bfq_queue *bfqq; -- unsigned long flags; - bool bfqq_already_existing = false, split = false; - -- spin_lock_irqsave(q->queue_lock, flags); -+ spin_lock_irq(&bfqd->lock); - - if (!bic) - goto queue_fail; -@@ -4661,7 +4723,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, - bic_set_bfqq(bic, bfqq, is_sync); - if (split && is_sync) { - bfq_log_bfqq(bfqd, bfqq, -- "set_request: was_in_list %d " -+ "get_request: was_in_list %d " - "was_in_large_burst %d " - "large burst in progress %d", - bic->was_in_burst_list, -@@ -4671,12 +4733,12 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, - if ((bic->was_in_burst_list && bfqd->large_burst) || - bic->saved_in_large_burst) { - bfq_log_bfqq(bfqd, bfqq, -- "set_request: marking in " -+ "get_request: marking in " - "large burst"); - bfq_mark_bfqq_in_large_burst(bfqq); - } else { - bfq_log_bfqq(bfqd, bfqq, -- "set_request: clearing in " -+ "get_request: 
clearing in " - "large burst"); - bfq_clear_bfqq_in_large_burst(bfqq); - if (bic->was_in_burst_list) -@@ -4703,9 +4765,12 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, - } - } - -- bfqq->allocated[rw]++; -+ bfqq->allocated++; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "get_request: new allocated %d", bfqq->allocated); -+ - bfqq->ref++; -- bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, bfqq->ref); -+ bfq_log_bfqq(bfqd, bfqq, "get_request: bfqq %p, %d", bfqq, bfqq->ref); - - rq->elv.priv[0] = bic; - rq->elv.priv[1] = bfqq; -@@ -4733,26 +4798,53 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, - if (unlikely(bfq_bfqq_just_created(bfqq))) - bfq_handle_burst(bfqd, bfqq); - -- spin_unlock_irqrestore(q->queue_lock, flags); -+ spin_unlock_irq(&bfqd->lock); - - return 0; - - queue_fail: -- bfq_schedule_dispatch(bfqd); -- spin_unlock_irqrestore(q->queue_lock, flags); -+ spin_unlock_irq(&bfqd->lock); - - return 1; - } - --static void bfq_kick_queue(struct work_struct *work) -+static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) - { -- struct bfq_data *bfqd = -- container_of(work, struct bfq_data, unplug_work); -- struct request_queue *q = bfqd->queue; -+ struct bfq_data *bfqd = bfqq->bfqd; -+ enum bfqq_expiration reason; -+ unsigned long flags; -+ -+ spin_lock_irqsave(&bfqd->lock, flags); -+ bfq_clear_bfqq_wait_request(bfqq); - -- spin_lock_irq(q->queue_lock); -- __blk_run_queue(q); -- spin_unlock_irq(q->queue_lock); -+ if (bfqq != bfqd->in_service_queue) { -+ spin_unlock_irqrestore(&bfqd->lock, flags); -+ return; -+ } -+ -+ if (bfq_bfqq_budget_timeout(bfqq)) -+ /* -+ * Also here the queue can be safely expired -+ * for budget timeout without wasting -+ * guarantees -+ */ -+ reason = BFQ_BFQQ_BUDGET_TIMEOUT; -+ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) -+ /* -+ * The queue may not be empty upon timer expiration, -+ * because we may not disable the timer when the -+ * first request of the 
in-service queue arrives -+ * during disk idling. -+ */ -+ reason = BFQ_BFQQ_TOO_IDLE; -+ else -+ goto schedule_dispatch; -+ -+ bfq_bfqq_expire(bfqd, bfqq, true, reason); -+ -+schedule_dispatch: -+ spin_unlock_irqrestore(&bfqd->lock, flags); -+ bfq_schedule_dispatch(bfqd); - } - - /* -@@ -4763,59 +4855,22 @@ static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) - { - struct bfq_data *bfqd = container_of(timer, struct bfq_data, - idle_slice_timer); -- struct bfq_queue *bfqq; -- unsigned long flags; -- enum bfqq_expiration reason; -- -- spin_lock_irqsave(bfqd->queue->queue_lock, flags); -+ struct bfq_queue *bfqq = bfqd->in_service_queue; - -- bfqq = bfqd->in_service_queue; - /* - * Theoretical race here: the in-service queue can be NULL or -- * different from the queue that was idling if the timer handler -- * spins on the queue_lock and a new request arrives for the -- * current queue and there is a full dispatch cycle that changes -- * the in-service queue. This can hardly happen, but in the worst -- * case we just expire a queue too early. -+ * different from the queue that was idling if a new request -+ * arrives for the current queue and there is a full dispatch -+ * cycle that changes the in-service queue. This can hardly -+ * happen, but in the worst case we just expire a queue too -+ * early. - */ -- if (bfqq) { -- bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); -- bfq_clear_bfqq_wait_request(bfqq); -- -- if (bfq_bfqq_budget_timeout(bfqq)) -- /* -- * Also here the queue can be safely expired -- * for budget timeout without wasting -- * guarantees -- */ -- reason = BFQ_BFQQ_BUDGET_TIMEOUT; -- else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) -- /* -- * The queue may not be empty upon timer expiration, -- * because we may not disable the timer when the -- * first request of the in-service queue arrives -- * during disk idling. 
-- */ -- reason = BFQ_BFQQ_TOO_IDLE; -- else -- goto schedule_dispatch; -- -- bfq_bfqq_expire(bfqd, bfqq, true, reason); -- } -- --schedule_dispatch: -- bfq_schedule_dispatch(bfqd); -+ if (bfqq) -+ bfq_idle_slice_timer_body(bfqq); - -- spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); - return HRTIMER_NORESTART; - } - --static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) --{ -- hrtimer_cancel(&bfqd->idle_slice_timer); -- cancel_work_sync(&bfqd->unplug_work); --} -- - static void __bfq_put_async_bfqq(struct bfq_data *bfqd, - struct bfq_queue **bfqq_ptr) - { -@@ -4852,28 +4907,40 @@ static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) - static void bfq_exit_queue(struct elevator_queue *e) - { - struct bfq_data *bfqd = e->elevator_data; -- struct request_queue *q = bfqd->queue; - struct bfq_queue *bfqq, *n; - -- bfq_shutdown_timer_wq(bfqd); -- -- spin_lock_irq(q->queue_lock); -+ hrtimer_cancel(&bfqd->idle_slice_timer); - - BUG_ON(bfqd->in_service_queue); -- list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) -- bfq_deactivate_bfqq(bfqd, bfqq, false, false); - -- spin_unlock_irq(q->queue_lock); -+ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) { -+ if (bfqq->bic) /* bfqqs without bic are handled below */ -+ cancel_work_sync(&bfqq->bic->exit_icq_work); -+ } -+ -+ spin_lock_irq(&bfqd->lock); -+ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) { -+ bfq_deactivate_bfqq(bfqd, bfqq, false, false); -+ /* -+ * Make sure that deferred exit_icq_work completes -+ * without errors for bfq_queues without bic -+ */ -+ if (!bfqq->bic) -+ bfqq->bfqd = NULL; -+ } -+ spin_unlock_irq(&bfqd->lock); - -- bfq_shutdown_timer_wq(bfqd); -+ hrtimer_cancel(&bfqd->idle_slice_timer); - - BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); - - #ifdef BFQ_GROUP_IOSCHED_ENABLED -- blkcg_deactivate_policy(q, &blkcg_policy_bfq); -+ blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq); - #else -+ spin_lock_irq(&bfqd->lock); - 
bfq_put_async_queues(bfqd, bfqd->root_group); - kfree(bfqd->root_group); -+ spin_unlock_irq(&bfqd->lock); - #endif - - kfree(bfqd); -@@ -4934,10 +5001,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - - bfqd->queue = q; - -- spin_lock_irq(q->queue_lock); -- q->elevator = eq; -- spin_unlock_irq(q->queue_lock); -- - bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node); - if (!bfqd->root_group) - goto out_free; -@@ -4951,8 +5014,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - bfqd->queue_weights_tree = RB_ROOT; - bfqd->group_weights_tree = RB_ROOT; - -- INIT_WORK(&bfqd->unplug_work, bfq_kick_queue); -- - INIT_LIST_HEAD(&bfqd->active_list); - INIT_LIST_HEAD(&bfqd->idle_list); - INIT_HLIST_HEAD(&bfqd->burst_list); -@@ -5001,6 +5062,11 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3; - bfqd->device_speed = BFQ_BFQD_FAST; - -+ spin_lock_init(&bfqd->lock); -+ INIT_LIST_HEAD(&bfqd->dispatch); -+ -+ q->elevator = eq; -+ - return 0; - - out_free: -@@ -5057,7 +5123,7 @@ static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) - num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n", - bfqd->queued); - -- spin_lock_irq(bfqd->queue->queue_lock); -+ spin_lock_irq(&bfqd->lock); - - num_char += sprintf(page + num_char, "Active:\n"); - list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) { -@@ -5086,7 +5152,7 @@ static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) - jiffies_to_msecs(bfqq->wr_cur_max_time)); - } - -- spin_unlock_irq(bfqd->queue->queue_lock); -+ spin_unlock_irq(&bfqd->lock); - - return num_char; - } -@@ -5294,35 +5360,31 @@ static struct elv_fs_entry bfq_attrs[] = { - __ATTR_NULL - }; - --static struct elevator_type iosched_bfq = { -- .ops.sq = { -- .elevator_merge_fn = bfq_merge, -- .elevator_merged_fn = bfq_merged_request, -- .elevator_merge_req_fn = 
bfq_merged_requests, --#ifdef BFQ_GROUP_IOSCHED_ENABLED -- .elevator_bio_merged_fn = bfq_bio_merged, --#endif -- .elevator_allow_bio_merge_fn = bfq_allow_bio_merge, -- .elevator_allow_rq_merge_fn = bfq_allow_rq_merge, -- .elevator_dispatch_fn = bfq_dispatch_requests, -- .elevator_add_req_fn = bfq_insert_request, -- .elevator_activate_req_fn = bfq_activate_request, -- .elevator_deactivate_req_fn = bfq_deactivate_request, -- .elevator_completed_req_fn = bfq_completed_request, -- .elevator_former_req_fn = elv_rb_former_request, -- .elevator_latter_req_fn = elv_rb_latter_request, -- .elevator_init_icq_fn = bfq_init_icq, -- .elevator_exit_icq_fn = bfq_exit_icq, -- .elevator_set_req_fn = bfq_set_request, -- .elevator_put_req_fn = bfq_put_request, -- .elevator_may_queue_fn = bfq_may_queue, -- .elevator_init_fn = bfq_init_queue, -- .elevator_exit_fn = bfq_exit_queue, -+static struct elevator_type iosched_bfq_mq = { -+ .ops.mq = { -+ .get_rq_priv = bfq_get_rq_private, -+ .put_rq_priv = bfq_put_rq_private, -+ .init_icq = bfq_init_icq, -+ .exit_icq = bfq_exit_icq, -+ .insert_requests = bfq_insert_requests, -+ .dispatch_request = bfq_dispatch_request, -+ .next_request = elv_rb_latter_request, -+ .former_request = elv_rb_former_request, -+ .allow_merge = bfq_allow_bio_merge, -+ .bio_merge = bfq_bio_merge, -+ .request_merge = bfq_request_merge, -+ .requests_merged = bfq_requests_merged, -+ .request_merged = bfq_request_merged, -+ .has_work = bfq_has_work, -+ .init_sched = bfq_init_queue, -+ .exit_sched = bfq_exit_queue, - }, -+ -+ .uses_mq = true, - .icq_size = sizeof(struct bfq_io_cq), - .icq_align = __alignof__(struct bfq_io_cq), - .elevator_attrs = bfq_attrs, -- .elevator_name = "bfq-sq", -+ .elevator_name = "bfq-mq", - .elevator_owner = THIS_MODULE, - }; - -@@ -5392,7 +5454,7 @@ static int __init bfq_init(void) - device_speed_thresh[0] = (4 * R_slow[0]) / 3; - device_speed_thresh[1] = (4 * R_slow[1]) / 3; - -- ret = elv_register(&iosched_bfq); -+ ret = 
elv_register(&iosched_bfq_mq); - if (ret) - goto err_pol_unreg; - -@@ -5412,8 +5474,8 @@ static int __init bfq_init(void) - - static void __exit bfq_exit(void) - { -- elv_unregister(&iosched_bfq); --#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ elv_unregister(&iosched_bfq_mq); -+#ifdef CONFIG_BFQ_GROUP_ENABLED - blkcg_policy_unregister(&blkcg_policy_bfq); - #endif - bfq_slab_kill(); -@@ -5422,5 +5484,6 @@ static void __exit bfq_exit(void) - module_init(bfq_init); - module_exit(bfq_exit); - --MODULE_AUTHOR("Arianna Avanzini, Fabio Checconi, Paolo Valente"); -+MODULE_AUTHOR("Paolo Valente"); - MODULE_LICENSE("GPL"); -+MODULE_DESCRIPTION("MQ Budget Fair Queueing I/O Scheduler"); -diff --git a/block/bfq-mq.h b/block/bfq-mq.h -index 0f51f270469c..c3fcd5ebd735 100644 ---- a/block/bfq-mq.h -+++ b/block/bfq-mq.h -@@ -19,15 +19,8 @@ - #include <linux/hrtimer.h> - #include <linux/blk-cgroup.h> - --/* -- * Define an alternative macro to compile cgroups support. This is one -- * of the steps needed to let bfq-mq share the files bfq-sched.c and -- * bfq-cgroup.c with bfq-sq. For bfq-mq, the macro -- * BFQ_GROUP_IOSCHED_ENABLED will be defined as a function of whether -- * the configuration option CONFIG_BFQ_MQ_GROUP_IOSCHED, and not -- * CONFIG_BFQ_GROUP_IOSCHED, is defined. 
-- */ --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+/* see comments on CONFIG_BFQ_GROUP_IOSCHED in bfq.h */ -+#ifdef CONFIG_BFQ_MQ_GROUP_IOSCHED - #define BFQ_GROUP_IOSCHED_ENABLED - #endif - -@@ -259,8 +252,8 @@ struct bfq_queue { - struct request *next_rq; - /* number of sync and async requests queued */ - int queued[2]; -- /* number of sync and async requests currently allocated */ -- int allocated[2]; -+ /* number of requests currently allocated */ -+ int allocated; - /* number of pending metadata requests */ - int meta_pending; - /* fifo list of requests in sort_list */ -@@ -345,6 +338,8 @@ struct bfq_queue { - unsigned long wr_start_at_switch_to_srt; - - unsigned long split_time; /* time of last split */ -+ -+ spinlock_t lock; - }; - - /** -@@ -361,6 +356,9 @@ struct bfq_io_cq { - uint64_t blkcg_serial_nr; /* the current blkcg serial */ - #endif - -+ /* delayed work to exec the body of the the exit_icq handler */ -+ struct work_struct exit_icq_work; -+ - /* - * Snapshot of the has_short_time flag before merging; taken - * to remember its value while the queue is merged, so as to -@@ -402,11 +400,13 @@ enum bfq_device_speed { - /** - * struct bfq_data - per-device data structure. - * -- * All the fields are protected by the @queue lock. -+ * All the fields are protected by @lock. - */ - struct bfq_data { -- /* request queue for the device */ -+ /* device request queue */ - struct request_queue *queue; -+ /* dispatch queue */ -+ struct list_head dispatch; - - /* root bfq_group for the device */ - struct bfq_group *root_group; -@@ -460,8 +460,6 @@ struct bfq_data { - * the queue in service. 
- */ - struct hrtimer idle_slice_timer; -- /* delayed work to restart dispatching on the request queue */ -- struct work_struct unplug_work; - - /* bfq_queue in service */ - struct bfq_queue *in_service_queue; -@@ -612,6 +610,8 @@ struct bfq_data { - - /* fallback dummy bfqq for extreme OOM conditions */ - struct bfq_queue oom_bfqq; -+ -+ spinlock_t lock; - }; - - enum bfqq_state_flags { -@@ -622,7 +622,6 @@ enum bfqq_state_flags { - * waiting for a request - * without idling the device - */ -- BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ - BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ - BFQ_BFQQ_FLAG_has_short_ttime, /* queue has a short think time */ - BFQ_BFQQ_FLAG_sync, /* synchronous queue */ -@@ -661,7 +660,6 @@ BFQ_BFQQ_FNS(just_created); - BFQ_BFQQ_FNS(busy); - BFQ_BFQQ_FNS(wait_request); - BFQ_BFQQ_FNS(non_blocking_wait_rq); --BFQ_BFQQ_FNS(must_alloc); - BFQ_BFQQ_FNS(fifo_expire); - BFQ_BFQQ_FNS(has_short_ttime); - BFQ_BFQQ_FNS(sync); -@@ -692,7 +690,6 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ - char __pbuf[128]; \ - \ -- assert_spin_locked((bfqd)->queue->queue_lock); \ - blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ - pr_crit("%s bfq%d%c %s " fmt "\n", \ - checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -@@ -734,7 +731,6 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) 
do { \ - char __pbuf[128]; \ - \ -- assert_spin_locked((bfqd)->queue->queue_lock); \ - blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ - blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, \ - (bfqq)->pid, \ -@@ -961,7 +957,6 @@ static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) - - static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); - static void bfq_put_queue(struct bfq_queue *bfqq); --static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); - static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, - struct bio *bio, bool is_sync, - struct bfq_io_cq *bic); - -From bde5235de2241502c1c00337bd51c96d9b60b6df Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Fri, 3 Mar 2017 08:52:40 +0100 -Subject: [PATCH 13/51] Add checks and extra log messages - Part I - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 112 +++++++++++++++++++++++++++++++++++++++++++++++-- - 1 file changed, 109 insertions(+), 3 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index c963d92a32c2..40eadb3f7073 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -773,6 +773,8 @@ static int bfqq_process_refs(struct bfq_queue *bfqq) - { - int process_refs, io_refs; - -+ lockdep_assert_held(&bfqq->bfqd->lock); -+ - io_refs = bfqq->allocated; - process_refs = bfqq->ref - io_refs - bfqq->entity.on_st; - BUG_ON(process_refs < 0); -@@ -1483,6 +1485,8 @@ static void bfq_add_request(struct request *rq) - bfqq->queued[rq_is_sync(rq)]++; - bfqd->queued++; - -+ BUG_ON(!RQ_BFQQ(rq)); -+ BUG_ON(RQ_BFQQ(rq) != bfqq); - elv_rb_add(&bfqq->sort_list, rq); - - /* -@@ -1491,6 +1495,8 @@ static void bfq_add_request(struct request *rq) - prev = bfqq->next_rq; - next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); - BUG_ON(!next_rq); -+ BUG_ON(!RQ_BFQQ(next_rq)); -+ BUG_ON(RQ_BFQQ(next_rq) != bfqq); - 
bfqq->next_rq = next_rq; - - /* -@@ -1615,6 +1621,19 @@ static void bfq_remove_request(struct request_queue *q, - - if (bfqq->next_rq == rq) { - bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); -+ if (bfqq->next_rq && !RQ_BFQQ(bfqq->next_rq)) { -+ pr_crit("no bfqq! for next rq %p bfqq %p\n", -+ bfqq->next_rq, bfqq); -+ } -+ -+ BUG_ON(bfqq->next_rq && !RQ_BFQQ(bfqq->next_rq)); -+ if (bfqq->next_rq && RQ_BFQQ(bfqq->next_rq) != bfqq) { -+ pr_crit( -+ "wrong bfqq! for next rq %p, rq_bfqq %p bfqq %p\n", -+ bfqq->next_rq, RQ_BFQQ(bfqq->next_rq), bfqq); -+ } -+ BUG_ON(bfqq->next_rq && RQ_BFQQ(bfqq->next_rq) != bfqq); -+ - bfq_updated_next_req(bfqd, bfqq); - } - -@@ -1701,6 +1720,8 @@ static int bfq_request_merge(struct request_queue *q, struct request **req, - __rq = bfq_find_rq_fmerge(bfqd, bio, q); - if (__rq && elv_bio_merge_ok(__rq, bio)) { - *req = __rq; -+ bfq_log(bfqd, "request_merge: req %p", __rq); -+ - return ELEVATOR_FRONT_MERGE; - } - -@@ -1721,6 +1742,8 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, - - /* Reposition request in its sort_list */ - elv_rb_del(&bfqq->sort_list, req); -+ BUG_ON(!RQ_BFQQ(req)); -+ BUG_ON(RQ_BFQQ(req) != bfqq); - elv_rb_add(&bfqq->sort_list, req); - - spin_lock_irq(&bfqd->lock); -@@ -1729,7 +1752,13 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, - next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, - bfqd->last_position); - BUG_ON(!next_rq); -+ - bfqq->next_rq = next_rq; -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "requests_merged: req %p prev %p next_rq %p bfqq %p", -+ req, prev, next_rq, bfqq); -+ - /* - * If next_rq changes, update both the queue's budget to - * fit the new request and the queue's position in its -@@ -1748,8 +1777,16 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, - { - struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next); - -+ BUG_ON(!RQ_BFQQ(rq)); -+ BUG_ON(!RQ_BFQQ(next)); -+ - if (!RB_EMPTY_NODE(&rq->rb_node)) 
- goto end; -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "requests_merged: rq %p next %p bfqq %p next_bfqq %p", -+ rq, next, bfqq, next_bfqq); -+ - spin_lock_irq(&bfqq->bfqd->lock); - - /* -@@ -3847,6 +3884,9 @@ static bool bfq_has_work(struct blk_mq_hw_ctx *hctx) - { - struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; - -+ bfq_log(bfqd, "has_work, dispatch_non_empty %d busy_queues %d", -+ !list_empty_careful(&bfqd->dispatch), bfqd->busy_queues > 0); -+ - /* - * Avoiding lock: a race on bfqd->busy_queues should cause at - * most a call to dispatch for nothing -@@ -3865,6 +3905,8 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - rq = list_first_entry(&bfqd->dispatch, struct request, - queuelist); - list_del_init(&rq->queuelist); -+ bfq_log(bfqd, -+ "dispatch requests: picked %p from dispatch list", rq); - goto exit; - } - -@@ -3904,7 +3946,20 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - if (rq) { - rq->rq_flags |= RQF_STARTED; - bfqd->rq_in_driver++; -- } -+ if (bfqq) -+ bfq_log_bfqq(bfqd, bfqq, -+ "dispatched %s request %p, rq_in_driver %d", -+ bfq_bfqq_sync(bfqq) ? 
"sync" : "async", -+ rq, -+ bfqd->rq_in_driver); -+ else -+ bfq_log(bfqd, -+ "dispatched request %p from dispatch list, rq_in_driver %d", -+ rq, bfqd->rq_in_driver); -+ } else -+ bfq_log(bfqd, -+ "returned NULL request, rq_in_driver %d", -+ bfqd->rq_in_driver); - - return rq; - } -@@ -3944,6 +3999,7 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - return; - - BUG_ON(rb_first(&bfqq->sort_list)); -+ BUG_ON(bfqq->allocated != 0); - BUG_ON(bfqq->entity.tree); - BUG_ON(bfq_bfqq_busy(bfqq)); - -@@ -4043,6 +4099,7 @@ static void bfq_exit_icq(struct io_cq *icq) - { - struct bfq_io_cq *bic = icq_to_bic(icq); - -+ BUG_ON(!bic); - kblockd_schedule_work(&bic->exit_icq_work); - } - -@@ -4057,6 +4114,7 @@ static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, - int ioprio_class; - struct bfq_data *bfqd = bfqq->bfqd; - -+ WARN_ON(!bfqd); - if (!bfqd) - return; - -@@ -4404,6 +4462,10 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) - { - struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; - -+ assert_spin_locked(&bfqd->lock); -+ -+ bfq_log_bfqq(bfqd, bfqq, "__insert_req: rq %p bfqq %p", rq, bfqq); -+ - /* - * An unplug may trigger a requeue of a request from the device - * driver: make sure we are in process context while trying to -@@ -4420,6 +4482,12 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) - */ - new_bfqq->allocated++; - bfqq->allocated--; -+ bfq_log_bfqq(bfqd, bfqq, -+ "insert_request: new allocated %d", bfqq->allocated); -+ bfq_log_bfqq(bfqd, new_bfqq, -+ "insert_request: new_bfqq new allocated %d", -+ bfqq->allocated); -+ - new_bfqq->ref++; - bfq_clear_bfqq_just_created(bfqq); - if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) -@@ -4529,6 +4597,10 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) - bfqd->rq_in_driver--; - bfqq->dispatched--; - -+ bfq_log_bfqq(bfqd, bfqq, -+ "completed_requests: new disp %d, new rq_in_driver %d", -+ bfqq->dispatched, bfqd->rq_in_driver); -+ - 
if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { - BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); - /* -@@ -4618,6 +4690,9 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) - - static void bfq_put_rq_priv_body(struct bfq_queue *bfqq) - { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "put_request_body: allocated %d", bfqq->allocated); -+ BUG_ON(!bfqq->allocated); - bfqq->allocated--; - - bfq_put_queue(bfqq); -@@ -4625,8 +4700,27 @@ static void bfq_put_rq_priv_body(struct bfq_queue *bfqq) - - static void bfq_put_rq_private(struct request_queue *q, struct request *rq) - { -- struct bfq_queue *bfqq = RQ_BFQQ(rq); -- struct bfq_data *bfqd = bfqq->bfqd; -+ struct bfq_queue *bfqq; -+ struct bfq_data *bfqd; -+ struct bfq_io_cq *bic; -+ -+ BUG_ON(!rq); -+ bfqq = RQ_BFQQ(rq); -+ BUG_ON(!bfqq); -+ -+ bic = RQ_BIC(rq); -+ BUG_ON(!bic); -+ -+ bfqd = bfqq->bfqd; -+ BUG_ON(!bfqd); -+ -+ BUG_ON(rq->rq_flags & RQF_QUEUED); -+ BUG_ON(!(rq->rq_flags & RQF_ELVPRIV)); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "putting rq %p with %u sects left, STARTED %d", -+ rq, blk_rq_sectors(rq), -+ rq->rq_flags & RQF_STARTED); - - if (rq->rq_flags & RQF_STARTED) - bfqg_stats_update_completion(bfqq_group(bfqq), -@@ -4634,6 +4728,8 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq) - rq_io_start_time_ns(rq), - rq->cmd_flags); - -+ BUG_ON(blk_rq_sectors(rq) == 0 && !(rq->rq_flags & RQF_STARTED)); -+ - if (likely(rq->rq_flags & RQF_STARTED)) { - unsigned long flags; - -@@ -4655,7 +4751,9 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq) - * cause any deadlock, even if other locks are already - * (correctly) held by this process. 
- */ -+ BUG_ON(in_interrupt()); - -+ assert_spin_locked(&bfqd->lock); - if (!RB_EMPTY_NODE(&rq->rb_node)) - bfq_remove_request(q, rq); - bfq_put_rq_priv_body(bfqq); -@@ -4814,7 +4912,9 @@ static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) - enum bfqq_expiration reason; - unsigned long flags; - -+ BUG_ON(!bfqd); - spin_lock_irqsave(&bfqd->lock, flags); -+ bfq_log_bfqq(bfqd, bfqq, "handling slice_timer expiration"); - bfq_clear_bfqq_wait_request(bfqq); - - if (bfqq != bfqd->in_service_queue) { -@@ -4857,6 +4957,8 @@ static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) - idle_slice_timer); - struct bfq_queue *bfqq = bfqd->in_service_queue; - -+ bfq_log(bfqd, "slice_timer expired"); -+ - /* - * Theoretical race here: the in-service queue can be NULL or - * different from the queue that was idling if a new request -@@ -4909,9 +5011,12 @@ static void bfq_exit_queue(struct elevator_queue *e) - struct bfq_data *bfqd = e->elevator_data; - struct bfq_queue *bfqq, *n; - -+ bfq_log(bfqd, "exit_queue: starting ..."); -+ - hrtimer_cancel(&bfqd->idle_slice_timer); - - BUG_ON(bfqd->in_service_queue); -+ BUG_ON(!list_empty(&bfqd->active_list)); - - list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) { - if (bfqq->bic) /* bfqqs without bic are handled below */ -@@ -4943,6 +5048,7 @@ static void bfq_exit_queue(struct elevator_queue *e) - spin_unlock_irq(&bfqd->lock); - #endif - -+ bfq_log(bfqd, "exit_queue: finished ..."); - kfree(bfqd); - } - - -From 7f59486861e368d25f59d4136cf8e51a75b7edf9 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Thu, 9 Feb 2017 10:36:27 +0100 -Subject: [PATCH 14/51] Add lock check in bfq_allow_bio_merge - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 40eadb3f7073..21b876aeba16 100644 ---- a/block/bfq-mq-iosched.c -+++ 
b/block/bfq-mq-iosched.c -@@ -2279,6 +2279,7 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, - if (!bic) - return false; - -+ assert_spin_locked(&bfqd->lock); - bfqq = bic_to_bfqq(bic, is_sync); - /* - * We take advantage of this function to perform an early merge - -From a2dd19a4d95cf401268c144c79ce549c7fc4bbca Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Tue, 7 Feb 2017 15:14:29 +0100 -Subject: [PATCH 15/51] bfq-mq: execute exit_icq operations immediately - -Exploting Omar's patch that removes the taking of the queue lock in -put_io_context_active, this patch moves back the operation of the bfq_exit_icq -hook from a deferred work to the body of the function. - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 34 +++------------------------------- - block/bfq-mq.h | 3 --- - 2 files changed, 3 insertions(+), 34 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 21b876aeba16..1deb79a47181 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -4080,28 +4080,13 @@ static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) - } - } - --static void bfq_exit_icq_body(struct work_struct *work) --{ -- struct bfq_io_cq *bic = -- container_of(work, struct bfq_io_cq, exit_icq_work); -- -- bfq_exit_icq_bfqq(bic, true); -- bfq_exit_icq_bfqq(bic, false); --} -- --static void bfq_init_icq(struct io_cq *icq) --{ -- struct bfq_io_cq *bic = icq_to_bic(icq); -- -- INIT_WORK(&bic->exit_icq_work, bfq_exit_icq_body); --} -- - static void bfq_exit_icq(struct io_cq *icq) - { - struct bfq_io_cq *bic = icq_to_bic(icq); - - BUG_ON(!bic); -- kblockd_schedule_work(&bic->exit_icq_work); -+ bfq_exit_icq_bfqq(bic, true); -+ bfq_exit_icq_bfqq(bic, false); - } - - /* -@@ -5019,21 +5004,9 @@ static void bfq_exit_queue(struct elevator_queue *e) - BUG_ON(bfqd->in_service_queue); - BUG_ON(!list_empty(&bfqd->active_list)); - -- 
list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) { -- if (bfqq->bic) /* bfqqs without bic are handled below */ -- cancel_work_sync(&bfqq->bic->exit_icq_work); -- } -- - spin_lock_irq(&bfqd->lock); -- list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) { -+ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) - bfq_deactivate_bfqq(bfqd, bfqq, false, false); -- /* -- * Make sure that deferred exit_icq_work completes -- * without errors for bfq_queues without bic -- */ -- if (!bfqq->bic) -- bfqq->bfqd = NULL; -- } - spin_unlock_irq(&bfqd->lock); - - hrtimer_cancel(&bfqd->idle_slice_timer); -@@ -5471,7 +5444,6 @@ static struct elevator_type iosched_bfq_mq = { - .ops.mq = { - .get_rq_priv = bfq_get_rq_private, - .put_rq_priv = bfq_put_rq_private, -- .init_icq = bfq_init_icq, - .exit_icq = bfq_exit_icq, - .insert_requests = bfq_insert_requests, - .dispatch_request = bfq_dispatch_request, -diff --git a/block/bfq-mq.h b/block/bfq-mq.h -index c3fcd5ebd735..23744b246db6 100644 ---- a/block/bfq-mq.h -+++ b/block/bfq-mq.h -@@ -356,9 +356,6 @@ struct bfq_io_cq { - uint64_t blkcg_serial_nr; /* the current blkcg serial */ - #endif - -- /* delayed work to exec the body of the the exit_icq handler */ -- struct work_struct exit_icq_work; -- - /* - * Snapshot of the has_short_time flag before merging; taken - * to remember its value while the queue is merged, so as to - -From ab7e78a0ff095101de74e700f8743295a500bb20 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Tue, 21 Feb 2017 10:26:22 +0100 -Subject: [PATCH 16/51] Unnest request-queue and ioc locks from scheduler locks - -In some bio-merging functions, the request-queue lock needs to be -taken, to lookup for the bic associated with the process that issued -the bio that may need to be merged. In addition, put_io_context must -be invoked in some other functions, and put_io_context may cause the -lock of the involved ioc to be taken. 
In both cases, these extra -request-queue or ioc locks are taken, or might be taken, while the -scheduler lock is being held. In this respect, there are other code -paths, in part external to bfq-mq, in which the same locks are taken -(nested) in the opposite order, i.e., it is the scheduler lock to be -taken while the request-queue or the ioc lock is being held. This -leads to circular deadlocks. - -This commit addresses this issue by modifying the logic of the above -functions, so as to let the lookup and put_io_context be performed, -and thus the extra locks be taken, outside the critical sections -protected by the scheduler lock. - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-cgroup-included.c | 9 ++ - block/bfq-mq-iosched.c | 264 ++++++++++++++++++++++++++++---------------- - block/bfq-mq.h | 25 ++++- - block/bfq-sched.c | 11 ++ - 4 files changed, 213 insertions(+), 96 deletions(-) - -diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c -index 8a73de76f32b..cf59eeb7f08e 100644 ---- a/block/bfq-cgroup-included.c -+++ b/block/bfq-cgroup-included.c -@@ -716,6 +716,9 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) - struct bfq_group *bfqg; - struct bfq_data *bfqd; - struct bfq_entity *entity; -+#ifdef BFQ_MQ -+ unsigned long flags; -+#endif - int i; - - BUG_ON(!pd); -@@ -729,6 +732,9 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) - if (!entity) /* root group */ - return; - -+#ifdef BFQ_MQ -+ spin_lock_irqsave(&bfqd->lock, flags); -+#endif - /* - * Empty all service_trees belonging to this group before - * deactivating the group itself. -@@ -766,6 +772,9 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) - __bfq_deactivate_entity(entity, false); - bfq_put_async_queues(bfqd, bfqg); - -+#ifdef BFQ_MQ -+ bfq_unlock_put_ioc_restore(bfqd, flags); -+#endif - /* - * @blkg is going offline and will be ignored by - * blkg_[rw]stat_recursive_sum(). 
Transfer stats to the parent so -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 1deb79a47181..69ef3761c95d 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -233,6 +233,7 @@ static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, - return NULL; - } - -+#define BFQ_MQ - #include "bfq-sched.c" - #include "bfq-cgroup-included.c" - -@@ -1564,15 +1565,9 @@ static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, - struct bio *bio, - struct request_queue *q) - { -- struct task_struct *tsk = current; -- struct bfq_io_cq *bic; -- struct bfq_queue *bfqq; -+ struct bfq_queue *bfqq = bfqd->bio_bfqq; - -- bic = bfq_bic_lookup(bfqd, tsk->io_context, q); -- if (!bic) -- return NULL; - -- bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); - if (bfqq) - return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); - -@@ -1693,9 +1688,26 @@ static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) - struct request_queue *q = hctx->queue; - struct bfq_data *bfqd = q->elevator->elevator_data; - struct request *free = NULL; -+ /* -+ * bfq_bic_lookup grabs the queue_lock: invoke it now and -+ * store its return value for later use, to avoid nesting -+ * queue_lock inside the bfqd->lock. We assume that the bic -+ * returned by bfq_bic_lookup does not go away before -+ * bfqd->lock is taken. 
-+ */ -+ struct bfq_io_cq *bic = bfq_bic_lookup(bfqd, current->io_context, q); - bool ret; - - spin_lock_irq(&bfqd->lock); -+ -+ if (bic) -+ bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); -+ else -+ bfqd->bio_bfqq = NULL; -+ bfqd->bio_bic = bic; -+ /* Set next flag just for testing purposes */ -+ bfqd->bio_bfqq_set = true; -+ - ret = blk_mq_sched_try_merge(q, bio, &free); - - /* -@@ -1706,6 +1718,7 @@ static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) - */ - if (free) - blk_mq_free_request(free); -+ bfqd->bio_bfqq_set = false; - spin_unlock_irq(&bfqd->lock); - - return ret; -@@ -2261,8 +2274,7 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, - { - struct bfq_data *bfqd = q->elevator->elevator_data; - bool is_sync = op_is_sync(bio->bi_opf); -- struct bfq_io_cq *bic; -- struct bfq_queue *bfqq, *new_bfqq; -+ struct bfq_queue *bfqq = bfqd->bio_bfqq, *new_bfqq; - - /* - * Disallow merge of a sync bio into an async request. -@@ -2273,31 +2285,40 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, - /* - * Lookup the bfqq that this bio will be queued with. Allow - * merge only if rq is queued there. -- * Queue lock is held here. - */ -- bic = bfq_bic_lookup(bfqd, current->io_context, q); -- if (!bic) -+ if (!bfqq) - return false; - -- assert_spin_locked(&bfqd->lock); -- bfqq = bic_to_bfqq(bic, is_sync); - /* - * We take advantage of this function to perform an early merge - * of the queues of possible cooperating processes. - */ -- if (bfqq) { -- new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); -- if (new_bfqq) { -- bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); -- /* -- * If we get here, the bio will be queued in the -- * shared queue, i.e., new_bfqq, so use new_bfqq -- * to decide whether bio and rq can be merged. 
-- */ -- bfqq = new_bfqq; -- } -- } -+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); -+ if (new_bfqq) { -+ /* -+ * bic still points to bfqq, then it has not yet been -+ * redirected to some other bfq_queue, and a queue -+ * merge beween bfqq and new_bfqq can be safely -+ * fulfillled, i.e., bic can be redirected to new_bfqq -+ * and bfqq can be put. -+ */ -+ bfq_merge_bfqqs(bfqd, bfqd->bio_bic, bfqq, -+ new_bfqq); -+ /* -+ * If we get here, bio will be queued into new_queue, -+ * so use new_bfqq to decide whether bio and rq can be -+ * merged. -+ */ -+ bfqq = new_bfqq; - -+ /* -+ * Change also bqfd->bio_bfqq, as -+ * bfqd->bio_bic now points to new_bfqq, and -+ * this function may be invoked again (and then may -+ * use again bqfd->bio_bfqq). -+ */ -+ bfqd->bio_bfqq = bfqq; -+ } - return bfqq == RQ_BFQQ(rq); - } - -@@ -3965,14 +3986,43 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - return rq; - } - -+/* -+ * Next two functions release bfqd->lock and put the io context -+ * pointed by bfqd->ioc_to_put. This delayed put is used to not risk -+ * to take an ioc->lock while the scheduler lock is being held. 
-+ */ -+static void bfq_unlock_put_ioc(struct bfq_data *bfqd) -+{ -+ struct io_context *ioc_to_put = bfqd->ioc_to_put; -+ -+ bfqd->ioc_to_put = NULL; -+ spin_unlock_irq(&bfqd->lock); -+ -+ if (ioc_to_put) -+ put_io_context(ioc_to_put); -+} -+ -+static void bfq_unlock_put_ioc_restore(struct bfq_data *bfqd, -+ unsigned long flags) -+{ -+ struct io_context *ioc_to_put = bfqd->ioc_to_put; -+ -+ bfqd->ioc_to_put = NULL; -+ spin_unlock_irqrestore(&bfqd->lock, flags); -+ -+ if (ioc_to_put) -+ put_io_context(ioc_to_put); -+} -+ - static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - { - struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; - struct request *rq; - - spin_lock_irq(&bfqd->lock); -+ - rq = __bfq_dispatch_request(hctx); -- spin_unlock_irq(&bfqd->lock); -+ bfq_unlock_put_ioc(bfqd); - - return rq; - } -@@ -3981,7 +4031,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - * Task holds one reference to the queue, dropped when task exits. Each rq - * in-flight on this queue also holds a reference, dropped when rq is freed. - * -- * Queue lock must be held here. Recall not to use bfqq after calling -+ * Scheduler lock must be held here. Recall not to use bfqq after calling - * this function on it. - */ - static void bfq_put_queue(struct bfq_queue *bfqq) -@@ -4066,17 +4116,23 @@ static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) - bfqd = bfqq->bfqd; /* NULL if scheduler already exited */ - - if (bfqq && bfqd) { -- spin_lock_irq(&bfqd->lock); -+ unsigned long flags; -+ -+ spin_lock_irqsave(&bfqd->lock, flags); - /* -- * If the bic is using a shared queue, put the reference -- * taken on the io_context when the bic started using a -- * shared bfq_queue. -+ * If the bic is using a shared queue, put the -+ * reference taken on the io_context when the bic -+ * started using a shared bfq_queue. 
This put cannot -+ * make ioc->ref_count reach 0, then no ioc->lock -+ * risks to be taken (leading to possible deadlock -+ * scenarios). - */ - if (is_sync && bfq_bfqq_coop(bfqq)) - put_io_context(bic->icq.ioc); -+ - bfq_exit_bfqq(bfqd, bfqq); - bic_set_bfqq(bic, NULL, is_sync); -- spin_unlock_irq(&bfqd->lock); -+ bfq_unlock_put_ioc_restore(bfqd, flags); - } - } - -@@ -4183,8 +4239,6 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, - INIT_HLIST_NODE(&bfqq->burst_list_node); - BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); - -- spin_lock_init(&bfqq->lock); -- - bfqq->ref = 0; - bfqq->bfqd = bfqd; - -@@ -4476,6 +4530,14 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) - - new_bfqq->ref++; - bfq_clear_bfqq_just_created(bfqq); -+ /* -+ * If the bic associated with the process -+ * issuing this request still points to bfqq -+ * (and thus has not been already redirected -+ * to new_bfqq or even some other bfq_queue), -+ * then complete the merge and redirect it to -+ * new_bfqq. 
-+ */ - if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) - bfq_merge_bfqqs(bfqd, RQ_BIC(rq), - bfqq, new_bfqq); -@@ -4498,14 +4560,17 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) - } - - static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, -- bool at_head) -+ bool at_head) - { - struct request_queue *q = hctx->queue; - struct bfq_data *bfqd = q->elevator->elevator_data; - - spin_lock_irq(&bfqd->lock); -- if (blk_mq_sched_try_insert_merge(q, rq)) -- goto done; -+ if (blk_mq_sched_try_insert_merge(q, rq)) { -+ spin_unlock_irq(&bfqd->lock); -+ return; -+ } -+ - spin_unlock_irq(&bfqd->lock); - - blk_mq_sched_request_inserted(rq); -@@ -4530,8 +4595,8 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - q->last_merge = rq; - } - } --done: -- spin_unlock_irq(&bfqd->lock); -+ -+ bfq_unlock_put_ioc(bfqd); - } - - static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, -@@ -4724,7 +4789,7 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq) - bfq_completed_request(bfqq, bfqd); - bfq_put_rq_priv_body(bfqq); - -- spin_unlock_irqrestore(&bfqd->lock, flags); -+ bfq_unlock_put_ioc_restore(bfqd, flags); - } else { - /* - * Request rq may be still/already in the scheduler, -@@ -4732,10 +4797,10 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq) - * defer such a check and removal, to avoid - * inconsistencies in the time interval from the end - * of this function to the start of the deferred work. -- * Fortunately, this situation occurs only in process -- * context, so taking the scheduler lock does not -- * cause any deadlock, even if other locks are already -- * (correctly) held by this process. -+ * This situation seems to occur only in process -+ * context, as a consequence of a merge. In the -+ * current version of the code, this implies that the -+ * lock is held. 
- */ - BUG_ON(in_interrupt()); - -@@ -4758,8 +4823,6 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) - { - bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); - -- put_io_context(bic->icq.ioc); -- - if (bfqq_process_refs(bfqq) == 1) { - bfqq->pid = current->pid; - bfq_clear_bfqq_coop(bfqq); -@@ -4775,6 +4838,41 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) - return NULL; - } - -+static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, -+ struct bfq_io_cq *bic, -+ struct bio *bio, -+ bool split, bool is_sync, -+ bool *new_queue) -+{ -+ struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync); -+ -+ if (likely(bfqq && bfqq != &bfqd->oom_bfqq)) -+ return bfqq; -+ -+ if (new_queue) -+ *new_queue = true; -+ -+ if (bfqq) -+ bfq_put_queue(bfqq); -+ bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); -+ -+ bic_set_bfqq(bic, bfqq, is_sync); -+ if (split && is_sync) { -+ if ((bic->was_in_burst_list && bfqd->large_burst) || -+ bic->saved_in_large_burst) -+ bfq_mark_bfqq_in_large_burst(bfqq); -+ else { -+ bfq_clear_bfqq_in_large_burst(bfqq); -+ if (bic->was_in_burst_list) -+ hlist_add_head(&bfqq->burst_list_node, -+ &bfqd->burst_list); -+ } -+ bfqq->split_time = jiffies; -+ } -+ -+ return bfqq; -+} -+ - /* - * Allocate bfq data structures associated with this request. 
- */ -@@ -4786,6 +4884,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - const int is_sync = rq_is_sync(rq); - struct bfq_queue *bfqq; - bool bfqq_already_existing = false, split = false; -+ bool new_queue = false; - - spin_lock_irq(&bfqd->lock); - -@@ -4796,42 +4895,10 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - - bfq_bic_update_cgroup(bic, bio); - --new_queue: -- bfqq = bic_to_bfqq(bic, is_sync); -- if (!bfqq || bfqq == &bfqd->oom_bfqq) { -- if (bfqq) -- bfq_put_queue(bfqq); -- bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); -- BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); -+ bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync, -+ &new_queue); - -- bic_set_bfqq(bic, bfqq, is_sync); -- if (split && is_sync) { -- bfq_log_bfqq(bfqd, bfqq, -- "get_request: was_in_list %d " -- "was_in_large_burst %d " -- "large burst in progress %d", -- bic->was_in_burst_list, -- bic->saved_in_large_burst, -- bfqd->large_burst); -- -- if ((bic->was_in_burst_list && bfqd->large_burst) || -- bic->saved_in_large_burst) { -- bfq_log_bfqq(bfqd, bfqq, -- "get_request: marking in " -- "large burst"); -- bfq_mark_bfqq_in_large_burst(bfqq); -- } else { -- bfq_log_bfqq(bfqd, bfqq, -- "get_request: clearing in " -- "large burst"); -- bfq_clear_bfqq_in_large_burst(bfqq); -- if (bic->was_in_burst_list) -- hlist_add_head(&bfqq->burst_list_node, -- &bfqd->burst_list); -- } -- bfqq->split_time = jiffies; -- } -- } else { -+ if (unlikely(!new_queue)) { - /* If the queue was seeky for too long, break it apart. */ - if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { - bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); -@@ -4841,9 +4908,19 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - bic->saved_in_large_burst = true; - - bfqq = bfq_split_bfqq(bic, bfqq); -- split = true; -+ /* -+ * A reference to bic->icq.ioc needs to be -+ * released after a queue split. 
Do not do it -+ * immediately, to not risk to possibly take -+ * an ioc->lock while holding the scheduler -+ * lock. -+ */ -+ bfqd->ioc_to_put = bic->icq.ioc; -+ - if (!bfqq) -- goto new_queue; -+ bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, -+ true, is_sync, -+ NULL); - else - bfqq_already_existing = true; - } -@@ -4861,18 +4938,17 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - - /* - * If a bfq_queue has only one process reference, it is owned -- * by only one bfq_io_cq: we can set the bic field of the -- * bfq_queue to the address of that structure. Also, if the -- * queue has just been split, mark a flag so that the -- * information is available to the other scheduler hooks. -+ * by only this bic: we can then set bfqq->bic = bic. in -+ * addition, if the queue has also just been split, we have to -+ * resume its state. - */ - if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { - bfqq->bic = bic; -- if (split) { -+ if (bfqd->ioc_to_put) { /* if true, then there has been a split */ - /* -- * If the queue has just been split from a shared -- * queue, restore the idle window and the possible -- * weight raising period. -+ * The queue has just been split from a shared -+ * queue: restore the idle window and the -+ * possible weight raising period. 
- */ - bfq_bfqq_resume_state(bfqq, bfqd, bic, - bfqq_already_existing); -@@ -4882,7 +4958,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - if (unlikely(bfq_bfqq_just_created(bfqq))) - bfq_handle_burst(bfqd, bfqq); - -- spin_unlock_irq(&bfqd->lock); -+ bfq_unlock_put_ioc(bfqd); - - return 0; - -@@ -4929,7 +5005,7 @@ static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) - bfq_bfqq_expire(bfqd, bfqq, true, reason); - - schedule_dispatch: -- spin_unlock_irqrestore(&bfqd->lock, flags); -+ bfq_unlock_put_ioc_restore(bfqd, flags); - bfq_schedule_dispatch(bfqd); - } - -diff --git a/block/bfq-mq.h b/block/bfq-mq.h -index 23744b246db6..bd83f1c02573 100644 ---- a/block/bfq-mq.h -+++ b/block/bfq-mq.h -@@ -338,8 +338,6 @@ struct bfq_queue { - unsigned long wr_start_at_switch_to_srt; - - unsigned long split_time; /* time of last split */ -- -- spinlock_t lock; - }; - - /** -@@ -609,6 +607,29 @@ struct bfq_data { - struct bfq_queue oom_bfqq; - - spinlock_t lock; -+ -+ /* -+ * bic associated with the task issuing current bio for -+ * merging. This and the next field are used as a support to -+ * be able to perform the bic lookup, needed by bio-merge -+ * functions, before the scheduler lock is taken, and thus -+ * avoid taking the request-queue lock while the scheduler -+ * lock is being held. -+ */ -+ struct bfq_io_cq *bio_bic; -+ /* bfqq associated with the task issuing current bio for merging */ -+ struct bfq_queue *bio_bfqq; -+ /* Extra flag used only for TESTING */ -+ bool bio_bfqq_set; -+ -+ /* -+ * io context to put right after bfqd->lock is released. This -+ * filed is used to perform put_io_context, when needed, to -+ * after the scheduler lock has been released, and thus -+ * prevent an ioc->lock from being possibly taken while the -+ * scheduler lock is being held. 
-+ */ -+ struct io_context *ioc_to_put; - }; - - enum bfqq_state_flags { -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index b54a638186e3..a5c8b4acd33c 100644 ---- a/block/bfq-sched.c -+++ b/block/bfq-sched.c -@@ -1905,7 +1905,18 @@ static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) - struct bfq_entity *entity = in_serv_entity; - - if (bfqd->in_service_bic) { -+#ifdef BFQ_MQ -+ /* -+ * Schedule the release of a reference to -+ * bfqd->in_service_bic->icq.ioc to right after the -+ * scheduler lock is released. This ioc is not -+ * released immediately, to not risk to possibly take -+ * an ioc->lock while holding the scheduler lock. -+ */ -+ bfqd->ioc_to_put = bfqd->in_service_bic->icq.ioc; -+#else - put_io_context(bfqd->in_service_bic->icq.ioc); -+#endif - bfqd->in_service_bic = NULL; - } - - -From 84cc7140cb4f0574710625f51abbb076a1dd2920 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Fri, 3 Mar 2017 09:31:14 +0100 -Subject: [PATCH 17/51] Add checks and extra log messages - Part II - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 42 ++++++++++++++++++++++++++++++++++++++++-- - block/bfq-sched.c | 1 + - 2 files changed, 41 insertions(+), 2 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 69ef3761c95d..5707d42b160d 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -1567,6 +1567,7 @@ static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, - { - struct bfq_queue *bfqq = bfqd->bio_bfqq; - -+ BUG_ON(!bfqd->bio_bfqq_set); - - if (bfqq) - return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); -@@ -1719,6 +1720,7 @@ static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) - if (free) - blk_mq_free_request(free); - bfqd->bio_bfqq_set = false; -+ BUG_ON(bfqd->ioc_to_put); - spin_unlock_irq(&bfqd->lock); - - return ret; -@@ -1781,6 +1783,7 @@ static void bfq_request_merged(struct request_queue *q, struct 
request *req, - bfq_updated_next_req(bfqd, bfqq); - bfq_pos_tree_add_move(bfqd, bfqq); - } -+ BUG_ON(bfqd->ioc_to_put); - spin_unlock_irq(&bfqd->lock); - } - } -@@ -1824,6 +1827,7 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, - - bfq_remove_request(q, next); - -+ BUG_ON(bfqq->bfqd->ioc_to_put); - spin_unlock_irq(&bfqq->bfqd->lock); - end: - bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); -@@ -2195,9 +2199,11 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, - { - bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", - (unsigned long) new_bfqq->pid); -+ BUG_ON(bfqq->bic && bfqq->bic == new_bfqq->bic); - /* Save weight raising and idle window of the merged queues */ - bfq_bfqq_save_state(bfqq); - bfq_bfqq_save_state(new_bfqq); -+ - if (bfq_bfqq_IO_bound(bfqq)) - bfq_mark_bfqq_IO_bound(new_bfqq); - bfq_clear_bfqq_IO_bound(bfqq); -@@ -2276,6 +2282,7 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, - bool is_sync = op_is_sync(bio->bi_opf); - struct bfq_queue *bfqq = bfqd->bio_bfqq, *new_bfqq; - -+ assert_spin_locked(&bfqd->lock); - /* - * Disallow merge of a sync bio into an async request. - */ -@@ -2286,6 +2293,7 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, - * Lookup the bfqq that this bio will be queued with. Allow - * merge only if rq is queued there. - */ -+ BUG_ON(!bfqd->bio_bfqq_set); - if (!bfqq) - return false; - -@@ -2294,6 +2302,7 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, - * of the queues of possible cooperating processes. 
- */ - new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); -+ BUG_ON(new_bfqq == bfqq); - if (new_bfqq) { - /* - * bic still points to bfqq, then it has not yet been -@@ -4040,6 +4049,8 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - struct bfq_group *bfqg = bfqq_group(bfqq); - #endif - -+ assert_spin_locked(&bfqq->bfqd->lock); -+ - BUG_ON(bfqq->ref <= 0); - - if (bfqq->bfqd) -@@ -4119,6 +4130,7 @@ static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) - unsigned long flags; - - spin_lock_irqsave(&bfqd->lock, flags); -+ BUG_ON(bfqd->ioc_to_put); - /* - * If the bic is using a shared queue, put the - * reference taken on the io_context when the bic -@@ -4567,10 +4579,12 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - - spin_lock_irq(&bfqd->lock); - if (blk_mq_sched_try_insert_merge(q, rq)) { -+ BUG_ON(bfqd->ioc_to_put); - spin_unlock_irq(&bfqd->lock); - return; - } - -+ BUG_ON(bfqd->ioc_to_put); - spin_unlock_irq(&bfqd->lock); - - blk_mq_sched_request_inserted(rq); -@@ -4785,6 +4799,7 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq) - unsigned long flags; - - spin_lock_irqsave(&bfqd->lock, flags); -+ BUG_ON(bfqd->ioc_to_put); - - bfq_completed_request(bfqq, bfqd); - bfq_put_rq_priv_body(bfqq); -@@ -4855,13 +4870,28 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, - if (bfqq) - bfq_put_queue(bfqq); - bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); -+ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); - - bic_set_bfqq(bic, bfqq, is_sync); - if (split && is_sync) { -+ bfq_log_bfqq(bfqd, bfqq, -+ "get_request: was_in_list %d " -+ "was_in_large_burst %d " -+ "large burst in progress %d", -+ bic->was_in_burst_list, -+ bic->saved_in_large_burst, -+ bfqd->large_burst); -+ - if ((bic->was_in_burst_list && bfqd->large_burst) || -- bic->saved_in_large_burst) -+ bic->saved_in_large_burst) { -+ bfq_log_bfqq(bfqd, bfqq, -+ "get_request: marking in " -+ "large 
burst"); - bfq_mark_bfqq_in_large_burst(bfqq); -- else { -+ } else { -+ bfq_log_bfqq(bfqd, bfqq, -+ "get_request: clearing in " -+ "large burst"); - bfq_clear_bfqq_in_large_burst(bfqq); - if (bic->was_in_burst_list) - hlist_add_head(&bfqq->burst_list_node, -@@ -4897,10 +4927,12 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - - bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync, - &new_queue); -+ BUG_ON(bfqd->ioc_to_put); - - if (unlikely(!new_queue)) { - /* If the queue was seeky for too long, break it apart. */ - if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { -+ BUG_ON(!is_sync); - bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); - - /* Update bic before losing reference to bfqq */ -@@ -4923,6 +4955,9 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - NULL); - else - bfqq_already_existing = true; -+ -+ BUG_ON(!bfqq); -+ BUG_ON(bfqq == &bfqd->oom_bfqq); - } - } - -@@ -4976,6 +5011,8 @@ static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) - - BUG_ON(!bfqd); - spin_lock_irqsave(&bfqd->lock, flags); -+ BUG_ON(bfqd->ioc_to_put); -+ - bfq_log_bfqq(bfqd, bfqq, "handling slice_timer expiration"); - bfq_clear_bfqq_wait_request(bfqq); - -@@ -5083,6 +5120,7 @@ static void bfq_exit_queue(struct elevator_queue *e) - spin_lock_irq(&bfqd->lock); - list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) - bfq_deactivate_bfqq(bfqd, bfqq, false, false); -+ BUG_ON(bfqd->ioc_to_put); - spin_unlock_irq(&bfqd->lock); - - hrtimer_cancel(&bfqd->idle_slice_timer); -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index a5c8b4acd33c..85e59eeb3569 100644 ---- a/block/bfq-sched.c -+++ b/block/bfq-sched.c -@@ -1906,6 +1906,7 @@ static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) - - if (bfqd->in_service_bic) { - #ifdef BFQ_MQ -+ BUG_ON(bfqd->ioc_to_put); - /* - * Schedule the release of a reference to - * bfqd->in_service_bic->icq.ioc to right after the - -From 
3d54cb804f1db2e08ce4a6cc335868538542f587 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Wed, 22 Feb 2017 11:30:01 +0100 -Subject: [PATCH 18/51] Fix unbalanced increment of rq_in_driver - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 52 +++++++++++++++++++++++++++++++++++++++++--------- - 1 file changed, 43 insertions(+), 9 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 5707d42b160d..9cbcb8d43d81 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -3936,9 +3936,45 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - rq = list_first_entry(&bfqd->dispatch, struct request, - queuelist); - list_del_init(&rq->queuelist); -+ - bfq_log(bfqd, - "dispatch requests: picked %p from dispatch list", rq); -- goto exit; -+ bfqq = RQ_BFQQ(rq); -+ -+ if (bfqq) { -+ /* -+ * Increment counters here, because this -+ * dispatch does not follow the standard -+ * dispatch flow (where counters are -+ * incremented) -+ */ -+ bfqq->dispatched++; -+ -+ goto inc_in_driver_start_rq; -+ } -+ -+ /* -+ * We exploit the put_rq_private hook to decrement -+ * rq_in_driver, but put_rq_private will not be -+ * invoked on this request. So, to avoid unbalance, -+ * just start this request, without incrementing -+ * rq_in_driver. As a negative consequence, -+ * rq_in_driver is deceptively lower than it should be -+ * while this request is in service. This may cause -+ * bfq_schedule_dispatch to be invoked uselessly. -+ * -+ * As for implementing an exact solution, the -+ * put_request hook, if defined, is probably invoked -+ * also on this request. So, by exploiting this hook, -+ * we could 1) increment rq_in_driver here, and 2) -+ * decrement it in put_request. Such a solution would -+ * let the value of the counter be always accurate, -+ * but it would entail using an extra interface -+ * function. 
This cost seems higher than the benefit, -+ * being the frequency of non-elevator-private -+ * requests very low. -+ */ -+ goto start_rq; - } - - bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); -@@ -3973,10 +4009,12 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - - BUG_ON(bfqq->next_rq == NULL && - bfqq->entity.budget < bfqq->entity.service); --exit: -+ - if (rq) { -- rq->rq_flags |= RQF_STARTED; -+ inc_in_driver_start_rq: - bfqd->rq_in_driver++; -+ start_rq: -+ rq->rq_flags |= RQF_STARTED; - if (bfqq) - bfq_log_bfqq(bfqd, bfqq, - "dispatched %s request %p, rq_in_driver %d", -@@ -3992,6 +4030,7 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - "returned NULL request, rq_in_driver %d", - bfqd->rq_in_driver); - -+exit: - return rq; - } - -@@ -4591,15 +4630,10 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - - spin_lock_irq(&bfqd->lock); - if (at_head || blk_rq_is_passthrough(rq)) { -- struct bfq_queue *bfqq = RQ_BFQQ(rq); -- - if (at_head) - list_add(&rq->queuelist, &bfqd->dispatch); - else - list_add_tail(&rq->queuelist, &bfqd->dispatch); -- -- if (bfqq) -- bfqq->dispatched++; - } else { - __bfq_insert_request(bfqd, rq); - -@@ -4966,7 +5000,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - "get_request: new allocated %d", bfqq->allocated); - - bfqq->ref++; -- bfq_log_bfqq(bfqd, bfqq, "get_request: bfqq %p, %d", bfqq, bfqq->ref); -+ bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d", rq, bfqq, bfqq->ref); - - rq->elv.priv[0] = bic; - rq->elv.priv[1] = bfqq; - -From 7ba977d696b239569b4cd233aebc99e136ecf487 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Fri, 3 Mar 2017 09:39:35 +0100 -Subject: [PATCH 19/51] Add checks and extra log messages - Part III - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 11 +++++++++++ - 1 file changed, 11 
insertions(+) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 9cbcb8d43d81..24b529a2edc7 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -4630,10 +4630,21 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - - spin_lock_irq(&bfqd->lock); - if (at_head || blk_rq_is_passthrough(rq)) { -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ - if (at_head) - list_add(&rq->queuelist, &bfqd->dispatch); - else - list_add_tail(&rq->queuelist, &bfqd->dispatch); -+ -+ if (bfqq) -+ bfq_log_bfqq(bfqd, bfqq, -+ "insert_request %p in disp: at_head %d", -+ rq, at_head); -+ else -+ bfq_log(bfqd, -+ "insert_request %p in disp: at_head %d", -+ rq, at_head); - } else { - __bfq_insert_request(bfqd, rq); - - -From c94e47b2908600b8ba89f84b0ac7febddd313141 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Fri, 17 Feb 2017 14:28:02 +0100 -Subject: [PATCH 20/51] TESTING: Check wrong invocation of merge and - put_rq_priv functions - -Check that merge functions are not invoked on requests queued in the -dispatch queue, and that neither put_rq_private is invoked on these -requests if, in addition, they have not passed through get_rq_private. 
- -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 22 ++++++++++++++++++++++ - include/linux/blkdev.h | 2 ++ - 2 files changed, 24 insertions(+) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 24b529a2edc7..b4d40bb712d2 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -1746,6 +1746,8 @@ static int bfq_request_merge(struct request_queue *q, struct request **req, - static void bfq_request_merged(struct request_queue *q, struct request *req, - enum elv_merge type) - { -+ BUG_ON(req->rq_flags & RQF_DISP_LIST); -+ - if (type == ELEVATOR_FRONT_MERGE && - rb_prev(&req->rb_node) && - blk_rq_pos(req) < -@@ -1795,6 +1797,8 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, - - BUG_ON(!RQ_BFQQ(rq)); - BUG_ON(!RQ_BFQQ(next)); -+ BUG_ON(rq->rq_flags & RQF_DISP_LIST); -+ BUG_ON(next->rq_flags & RQF_DISP_LIST); - - if (!RB_EMPTY_NODE(&rq->rb_node)) - goto end; -@@ -3936,6 +3940,7 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - rq = list_first_entry(&bfqd->dispatch, struct request, - queuelist); - list_del_init(&rq->queuelist); -+ rq->rq_flags &= ~RQF_DISP_LIST; - - bfq_log(bfqd, - "dispatch requests: picked %p from dispatch list", rq); -@@ -3950,6 +3955,17 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - */ - bfqq->dispatched++; - -+ /* -+ * TESTING: reset DISP_LIST flag, because: 1) -+ * this rq this request has passed through -+ * get_rq_private, 2) then it will have -+ * put_rq_private invoked on it, and 3) in -+ * put_rq_private we use this flag to check -+ * that put_rq_private is not invoked on -+ * requests for which get_rq_private has been -+ * invoked. 
-+ */ -+ rq->rq_flags &= ~RQF_DISP_LIST; - goto inc_in_driver_start_rq; - } - -@@ -4637,6 +4653,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - else - list_add_tail(&rq->queuelist, &bfqd->dispatch); - -+ rq->rq_flags |= RQF_DISP_LIST; - if (bfqq) - bfq_log_bfqq(bfqd, bfqq, - "insert_request %p in disp: at_head %d", -@@ -4824,6 +4841,10 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq) - bfqd = bfqq->bfqd; - BUG_ON(!bfqd); - -+ if (rq->rq_flags & RQF_DISP_LIST) { -+ pr_crit("putting disp rq %p for %d", rq, bfqq->pid); -+ BUG(); -+ } - BUG_ON(rq->rq_flags & RQF_QUEUED); - BUG_ON(!(rq->rq_flags & RQF_ELVPRIV)); - -@@ -5015,6 +5036,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - - rq->elv.priv[0] = bic; - rq->elv.priv[1] = bfqq; -+ rq->rq_flags &= ~RQF_DISP_LIST; - - /* - * If a bfq_queue has only one process reference, it is owned -diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h -index 10f892ca585d..0048e59e6d07 100644 ---- a/include/linux/blkdev.h -+++ b/include/linux/blkdev.h -@@ -121,6 +121,8 @@ typedef __u32 __bitwise req_flags_t; - /* Look at ->special_vec for the actual data payload instead of the - bio chain. */ - #define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18)) -+/* DEBUG: rq in bfq-mq dispatch list */ -+#define RQF_DISP_LIST ((__force req_flags_t)(1 << 19)) - - /* flags that prevent us from merging requests: */ - #define RQF_NOMERGE_FLAGS \ - -From 49206f9052d13c96d49dbc36c612bed41b2d6552 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Sat, 25 Feb 2017 17:38:05 +0100 -Subject: [PATCH 21/51] Complete support for cgroups - -This commit completes cgroups support for bfq-mq. 
In particular, it deals with -a sort of circular dependency introduced in blk-mq: the function -blkcg_activate_policy, invoked during scheduler initialization, triggers the -invocation of the has_work scheduler hook (before the init function is -finished). To adress this issue, this commit moves the invocation of -blkcg_activate_policy after the initialization of all the fields that could be -initialized before invoking blkcg_activate_policy itself. This enables has_work -to correctly return false, and thus to prevent the blk-mq stack from invoking -further scheduler hooks before the init function is finished. - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/Kconfig.iosched | 9 +++++ - block/bfq-mq-iosched.c | 108 ++++++++++++++++++++++++++++--------------------- - block/bfq-mq.h | 2 +- - 3 files changed, 72 insertions(+), 47 deletions(-) - -diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched -index 2d94af3d8b0a..299a6861fb90 100644 ---- a/block/Kconfig.iosched -+++ b/block/Kconfig.iosched -@@ -106,6 +106,15 @@ config MQ_IOSCHED_BFQ - guarantees a low latency to interactive and soft real-time - applications. Details in Documentation/block/bfq-iosched.txt - -+config MQ_BFQ_GROUP_IOSCHED -+ bool "BFQ-MQ hierarchical scheduling support" -+ depends on MQ_IOSCHED_BFQ && BLK_CGROUP -+ default n -+ ---help--- -+ -+ Enable hierarchical scheduling in BFQ-MQ, using the blkio -+ (cgroups-v1) or io (cgroups-v2) controller. -+ - config MQ_IOSCHED_DEADLINE - tristate "MQ deadline I/O scheduler" - default y -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index b4d40bb712d2..02a1e7fd0ea4 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -88,7 +88,6 @@ - #include "blk-mq.h" - #include "blk-mq-tag.h" - #include "blk-mq-sched.h" --#undef CONFIG_BFQ_GROUP_IOSCHED /* cgroups support not yet functional */ - #include "bfq-mq.h" - - /* Expiration time of sync (0) and async (1) requests, in ns. 
*/ -@@ -233,15 +232,6 @@ static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, - return NULL; - } - --#define BFQ_MQ --#include "bfq-sched.c" --#include "bfq-cgroup-included.c" -- --#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE) --#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT) -- --#define bfq_sample_valid(samples) ((samples) > 80) -- - /* - * Scheduler run of queue, if there are requests pending and no one in the - * driver that will restart queueing. -@@ -255,6 +245,43 @@ static void bfq_schedule_dispatch(struct bfq_data *bfqd) - } - - /* -+ * Next two functions release bfqd->lock and put the io context -+ * pointed by bfqd->ioc_to_put. This delayed put is used to not risk -+ * to take an ioc->lock while the scheduler lock is being held. -+ */ -+static void bfq_unlock_put_ioc(struct bfq_data *bfqd) -+{ -+ struct io_context *ioc_to_put = bfqd->ioc_to_put; -+ -+ bfqd->ioc_to_put = NULL; -+ spin_unlock_irq(&bfqd->lock); -+ -+ if (ioc_to_put) -+ put_io_context(ioc_to_put); -+} -+ -+static void bfq_unlock_put_ioc_restore(struct bfq_data *bfqd, -+ unsigned long flags) -+{ -+ struct io_context *ioc_to_put = bfqd->ioc_to_put; -+ -+ bfqd->ioc_to_put = NULL; -+ spin_unlock_irqrestore(&bfqd->lock, flags); -+ -+ if (ioc_to_put) -+ put_io_context(ioc_to_put); -+} -+ -+#define BFQ_MQ -+#include "bfq-sched.c" -+#include "bfq-cgroup-included.c" -+ -+#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE) -+#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT) -+ -+#define bfq_sample_valid(samples) ((samples) > 80) -+ -+/* - * Lifted from AS - choose which of rq1 and rq2 that is best served now. - * We choose the request that is closesr to the head right now. Distance - * behind the head is penalized and only allowed to a certain extent. 
-@@ -4050,34 +4077,6 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - return rq; - } - --/* -- * Next two functions release bfqd->lock and put the io context -- * pointed by bfqd->ioc_to_put. This delayed put is used to not risk -- * to take an ioc->lock while the scheduler lock is being held. -- */ --static void bfq_unlock_put_ioc(struct bfq_data *bfqd) --{ -- struct io_context *ioc_to_put = bfqd->ioc_to_put; -- -- bfqd->ioc_to_put = NULL; -- spin_unlock_irq(&bfqd->lock); -- -- if (ioc_to_put) -- put_io_context(ioc_to_put); --} -- --static void bfq_unlock_put_ioc_restore(struct bfq_data *bfqd, -- unsigned long flags) --{ -- struct io_context *ioc_to_put = bfqd->ioc_to_put; -- -- bfqd->ioc_to_put = NULL; -- spin_unlock_irqrestore(&bfqd->lock, flags); -- -- if (ioc_to_put) -- put_io_context(ioc_to_put); --} -- - static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - { - struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; -@@ -5239,6 +5238,10 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - } - eq->elevator_data = bfqd; - -+ spin_lock_irq(q->queue_lock); -+ q->elevator = eq; -+ spin_unlock_irq(q->queue_lock); -+ - /* - * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. 
- * Grab a permanent reference to it, so that the normal code flow -@@ -5261,12 +5264,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - bfqd->oom_bfqq.entity.prio_changed = 1; - - bfqd->queue = q; -- -- bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node); -- if (!bfqd->root_group) -- goto out_free; -- bfq_init_root_group(bfqd->root_group, bfqd); -- bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); -+ INIT_LIST_HEAD(&bfqd->dispatch); - - hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); -@@ -5324,9 +5322,27 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - bfqd->device_speed = BFQ_BFQD_FAST; - - spin_lock_init(&bfqd->lock); -- INIT_LIST_HEAD(&bfqd->dispatch); - -- q->elevator = eq; -+ /* -+ * The invocation of the next bfq_create_group_hierarchy -+ * function is the head of a chain of function calls -+ * (bfq_create_group_hierarchy->blkcg_activate_policy-> -+ * blk_mq_freeze_queue) that may lead to the invocation of the -+ * has_work hook function. For this reason, -+ * bfq_create_group_hierarchy is invoked only after all -+ * scheduler data has been initialized, apart from the fields -+ * that can be initialized only after invoking -+ * bfq_create_group_hierarchy. This, in particular, enables -+ * has_work to correctly return false. Of course, to avoid -+ * other inconsistencies, the blk-mq stack must then refrain -+ * from invoking further scheduler hooks before this init -+ * function is finished. 
-+ */ -+ bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node); -+ if (!bfqd->root_group) -+ goto out_free; -+ bfq_init_root_group(bfqd->root_group, bfqd); -+ bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); - - return 0; - -diff --git a/block/bfq-mq.h b/block/bfq-mq.h -index bd83f1c02573..2c81c02bccc4 100644 ---- a/block/bfq-mq.h -+++ b/block/bfq-mq.h -@@ -20,7 +20,7 @@ - #include <linux/blk-cgroup.h> - - /* see comments on CONFIG_BFQ_GROUP_IOSCHED in bfq.h */ --#ifdef CONFIG_BFQ_MQ_GROUP_IOSCHED -+#ifdef CONFIG_MQ_BFQ_GROUP_IOSCHED - #define BFQ_GROUP_IOSCHED_ENABLED - #endif - - -From 62d12db23ce14d2716b5cff7d2635fbc817b96d0 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Fri, 17 Mar 2017 06:15:18 +0100 -Subject: [PATCH 22/51] Remove all get and put of I/O contexts - -When a bfq queue is set in service and when it is merged, a reference -to the I/O context associated with the queue is taken. This reference -is then released when the queue is deselected from service or -split. More precisely, the release of the reference is postponed to -when the scheduler lock is released, to avoid nesting between the -scheduler and the I/O-context lock. In fact, such nesting would lead -to deadlocks, because of other code paths that take the same locks in -the opposite order. This postponing of I/O-context releases does -complicate code. - -This commit addresses this issue by modifying involved operations in -such a way to not need to get the above I/O-context references any -more. Then it also removes any get and release of these references. 
- -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-cgroup-included.c | 2 +- - block/bfq-mq-iosched.c | 127 ++++++++------------------------------------ - block/bfq-mq.h | 11 ---- - block/bfq-sched.c | 17 ------ - 4 files changed, 22 insertions(+), 135 deletions(-) - -diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c -index cf59eeb7f08e..dfacca799b5e 100644 ---- a/block/bfq-cgroup-included.c -+++ b/block/bfq-cgroup-included.c -@@ -773,7 +773,7 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) - bfq_put_async_queues(bfqd, bfqg); - - #ifdef BFQ_MQ -- bfq_unlock_put_ioc_restore(bfqd, flags); -+ spin_unlock_irqrestore(&bfqd->lock, flags); - #endif - /* - * @blkg is going offline and will be ignored by -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 02a1e7fd0ea4..8e7589d3280f 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -244,34 +244,6 @@ static void bfq_schedule_dispatch(struct bfq_data *bfqd) - } - } - --/* -- * Next two functions release bfqd->lock and put the io context -- * pointed by bfqd->ioc_to_put. This delayed put is used to not risk -- * to take an ioc->lock while the scheduler lock is being held. 
-- */ --static void bfq_unlock_put_ioc(struct bfq_data *bfqd) --{ -- struct io_context *ioc_to_put = bfqd->ioc_to_put; -- -- bfqd->ioc_to_put = NULL; -- spin_unlock_irq(&bfqd->lock); -- -- if (ioc_to_put) -- put_io_context(ioc_to_put); --} -- --static void bfq_unlock_put_ioc_restore(struct bfq_data *bfqd, -- unsigned long flags) --{ -- struct io_context *ioc_to_put = bfqd->ioc_to_put; -- -- bfqd->ioc_to_put = NULL; -- spin_unlock_irqrestore(&bfqd->lock, flags); -- -- if (ioc_to_put) -- put_io_context(ioc_to_put); --} -- - #define BFQ_MQ - #include "bfq-sched.c" - #include "bfq-cgroup-included.c" -@@ -1747,7 +1719,6 @@ static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) - if (free) - blk_mq_free_request(free); - bfqd->bio_bfqq_set = false; -- BUG_ON(bfqd->ioc_to_put); - spin_unlock_irq(&bfqd->lock); - - return ret; -@@ -1812,7 +1783,6 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, - bfq_updated_next_req(bfqd, bfqq); - bfq_pos_tree_add_move(bfqd, bfqq); - } -- BUG_ON(bfqd->ioc_to_put); - spin_unlock_irq(&bfqd->lock); - } - } -@@ -1858,7 +1828,6 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, - - bfq_remove_request(q, next); - -- BUG_ON(bfqq->bfqd->ioc_to_put); - spin_unlock_irq(&bfqq->bfqd->lock); - end: - bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); -@@ -2035,20 +2004,18 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) - * first time that the requests of some process are redirected to - * it. - * -- * We redirect bfqq to new_bfqq and not the opposite, because we -- * are in the context of the process owning bfqq, hence we have -- * the io_cq of this process. So we can immediately configure this -- * io_cq to redirect the requests of the process to new_bfqq. -+ * We redirect bfqq to new_bfqq and not the opposite, because -+ * we are in the context of the process owning bfqq, thus we -+ * have the io_cq of this process. 
So we can immediately -+ * configure this io_cq to redirect the requests of the -+ * process to new_bfqq. In contrast, the io_cq of new_bfqq is -+ * not available any more (new_bfqq->bic == NULL). - * -- * NOTE, even if new_bfqq coincides with the in-service queue, the -- * io_cq of new_bfqq is not available, because, if the in-service -- * queue is shared, bfqd->in_service_bic may not point to the -- * io_cq of the in-service queue. -- * Redirecting the requests of the process owning bfqq to the -- * currently in-service queue is in any case the best option, as -- * we feed the in-service queue with new requests close to the -- * last request served and, by doing so, hopefully increase the -- * throughput. -+ * Anyway, even in case new_bfqq coincides with the in-service -+ * queue, redirecting requests the in-service queue is the -+ * best option, as we feed the in-service queue with new -+ * requests close to the last request served and, by doing so, -+ * are likely to increase the throughput. 
- */ - bfqq->new_bfqq = new_bfqq; - new_bfqq->ref += process_refs; -@@ -2147,13 +2114,13 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, - in_service_bfqq = bfqd->in_service_queue; - - if (in_service_bfqq && in_service_bfqq != bfqq && -- bfqd->in_service_bic && wr_from_too_long(in_service_bfqq) -+ wr_from_too_long(in_service_bfqq) - && likely(in_service_bfqq == &bfqd->oom_bfqq)) - bfq_log_bfqq(bfqd, bfqq, - "would have tried merge with in-service-queue, but wr"); - -- if (!in_service_bfqq || in_service_bfqq == bfqq || -- !bfqd->in_service_bic || wr_from_too_long(in_service_bfqq) || -+ if (!in_service_bfqq || in_service_bfqq == bfqq -+ || wr_from_too_long(in_service_bfqq) || - unlikely(in_service_bfqq == &bfqd->oom_bfqq)) - goto check_scheduled; - -@@ -2214,16 +2181,6 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) - BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); - } - --static void bfq_get_bic_reference(struct bfq_queue *bfqq) --{ -- /* -- * If bfqq->bic has a non-NULL value, the bic to which it belongs -- * is about to begin using a shared bfq_queue. -- */ -- if (bfqq->bic) -- atomic_long_inc(&bfqq->bic->icq.ioc->refcount); --} -- - static void - bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, - struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) -@@ -2280,12 +2237,6 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, - bfqd->wr_busy_queues); - - /* -- * Grab a reference to the bic, to prevent it from being destroyed -- * before being possibly touched by a bfq_split_bfqq(). 
-- */ -- bfq_get_bic_reference(bfqq); -- bfq_get_bic_reference(new_bfqq); -- /* - * Merge queues (that is, let bic redirect its requests to new_bfqq) - */ - bic_set_bfqq(bic, new_bfqq, 1); -@@ -2472,16 +2423,10 @@ static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) - static void bfq_arm_slice_timer(struct bfq_data *bfqd) - { - struct bfq_queue *bfqq = bfqd->in_service_queue; -- struct bfq_io_cq *bic; - u32 sl; - - BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); - -- /* Processes have exited, don't wait. */ -- bic = bfqd->in_service_bic; -- if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0) -- return; -- - bfq_mark_bfqq_wait_request(bfqq); - - /* -@@ -3922,11 +3867,6 @@ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd, - bfq_bfqq_budget_left(bfqq), - bfqq->dispatched); - -- if (!bfqd->in_service_bic) { -- atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); -- bfqd->in_service_bic = RQ_BIC(rq); -- } -- - /* - * Expire bfqq, pretending that its budget expired, if bfqq - * belongs to CLASS_IDLE and other queues are waiting for -@@ -4085,7 +4025,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - spin_lock_irq(&bfqd->lock); - - rq = __bfq_dispatch_request(hctx); -- bfq_unlock_put_ioc(bfqd); -+ spin_unlock_irq(&bfqd->lock); - - return rq; - } -@@ -4184,21 +4124,10 @@ static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) - unsigned long flags; - - spin_lock_irqsave(&bfqd->lock, flags); -- BUG_ON(bfqd->ioc_to_put); -- /* -- * If the bic is using a shared queue, put the -- * reference taken on the io_context when the bic -- * started using a shared bfq_queue. This put cannot -- * make ioc->ref_count reach 0, then no ioc->lock -- * risks to be taken (leading to possible deadlock -- * scenarios). 
-- */ -- if (is_sync && bfq_bfqq_coop(bfqq)) -- put_io_context(bic->icq.ioc); - - bfq_exit_bfqq(bfqd, bfqq); - bic_set_bfqq(bic, NULL, is_sync); -- bfq_unlock_put_ioc_restore(bfqd, flags); -+ spin_unlock_irqrestore(&bfqd->lock, flags); - } - } - -@@ -4633,12 +4562,10 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - - spin_lock_irq(&bfqd->lock); - if (blk_mq_sched_try_insert_merge(q, rq)) { -- BUG_ON(bfqd->ioc_to_put); - spin_unlock_irq(&bfqd->lock); - return; - } - -- BUG_ON(bfqd->ioc_to_put); - spin_unlock_irq(&bfqd->lock); - - blk_mq_sched_request_inserted(rq); -@@ -4671,7 +4598,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - } - } - -- bfq_unlock_put_ioc(bfqd); -+ spin_unlock_irq(&bfqd->lock); - } - - static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, -@@ -4864,12 +4791,11 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq) - unsigned long flags; - - spin_lock_irqsave(&bfqd->lock, flags); -- BUG_ON(bfqd->ioc_to_put); - - bfq_completed_request(bfqq, bfqd); - bfq_put_rq_priv_body(bfqq); - -- bfq_unlock_put_ioc_restore(bfqd, flags); -+ spin_unlock_irqrestore(&bfqd->lock, flags); - } else { - /* - * Request rq may be still/already in the scheduler, -@@ -4992,7 +4918,6 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - - bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync, - &new_queue); -- BUG_ON(bfqd->ioc_to_put); - - if (unlikely(!new_queue)) { - /* If the queue was seeky for too long, break it apart. */ -@@ -5005,14 +4930,6 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - bic->saved_in_large_burst = true; - - bfqq = bfq_split_bfqq(bic, bfqq); -- /* -- * A reference to bic->icq.ioc needs to be -- * released after a queue split. Do not do it -- * immediately, to not risk to possibly take -- * an ioc->lock while holding the scheduler -- * lock. 
-- */ -- bfqd->ioc_to_put = bic->icq.ioc; - - if (!bfqq) - bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, -@@ -5045,7 +4962,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - */ - if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { - bfqq->bic = bic; -- if (bfqd->ioc_to_put) { /* if true, then there has been a split */ -+ if (split) { - /* - * The queue has just been split from a shared - * queue: restore the idle window and the -@@ -5059,7 +4976,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - if (unlikely(bfq_bfqq_just_created(bfqq))) - bfq_handle_burst(bfqd, bfqq); - -- bfq_unlock_put_ioc(bfqd); -+ spin_unlock_irq(&bfqd->lock); - - return 0; - -@@ -5077,7 +4994,6 @@ static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) - - BUG_ON(!bfqd); - spin_lock_irqsave(&bfqd->lock, flags); -- BUG_ON(bfqd->ioc_to_put); - - bfq_log_bfqq(bfqd, bfqq, "handling slice_timer expiration"); - bfq_clear_bfqq_wait_request(bfqq); -@@ -5108,7 +5024,7 @@ static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) - bfq_bfqq_expire(bfqd, bfqq, true, reason); - - schedule_dispatch: -- bfq_unlock_put_ioc_restore(bfqd, flags); -+ spin_unlock_irqrestore(&bfqd->lock, flags); - bfq_schedule_dispatch(bfqd); - } - -@@ -5186,7 +5102,6 @@ static void bfq_exit_queue(struct elevator_queue *e) - spin_lock_irq(&bfqd->lock); - list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) - bfq_deactivate_bfqq(bfqd, bfqq, false, false); -- BUG_ON(bfqd->ioc_to_put); - spin_unlock_irq(&bfqd->lock); - - hrtimer_cancel(&bfqd->idle_slice_timer); -diff --git a/block/bfq-mq.h b/block/bfq-mq.h -index 2c81c02bccc4..36ee24a87dda 100644 ---- a/block/bfq-mq.h -+++ b/block/bfq-mq.h -@@ -458,8 +458,6 @@ struct bfq_data { - - /* bfq_queue in service */ - struct bfq_queue *in_service_queue; -- /* bfq_io_cq (bic) associated with the @in_service_queue */ -- struct bfq_io_cq *in_service_bic; - - /* on-disk position of the last 
served request */ - sector_t last_position; -@@ -621,15 +619,6 @@ struct bfq_data { - struct bfq_queue *bio_bfqq; - /* Extra flag used only for TESTING */ - bool bio_bfqq_set; -- -- /* -- * io context to put right after bfqd->lock is released. This -- * filed is used to perform put_io_context, when needed, to -- * after the scheduler lock has been released, and thus -- * prevent an ioc->lock from being possibly taken while the -- * scheduler lock is being held. -- */ -- struct io_context *ioc_to_put; - }; - - enum bfqq_state_flags { -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index 85e59eeb3569..9c4e6797d8c9 100644 ---- a/block/bfq-sched.c -+++ b/block/bfq-sched.c -@@ -1904,23 +1904,6 @@ static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) - struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity; - struct bfq_entity *entity = in_serv_entity; - -- if (bfqd->in_service_bic) { --#ifdef BFQ_MQ -- BUG_ON(bfqd->ioc_to_put); -- /* -- * Schedule the release of a reference to -- * bfqd->in_service_bic->icq.ioc to right after the -- * scheduler lock is released. This ioc is not -- * released immediately, to not risk to possibly take -- * an ioc->lock while holding the scheduler lock. 
-- */ -- bfqd->ioc_to_put = bfqd->in_service_bic->icq.ioc; --#else -- put_io_context(bfqd->in_service_bic->icq.ioc); --#endif -- bfqd->in_service_bic = NULL; -- } -- - bfq_clear_bfqq_wait_request(in_serv_bfqq); - hrtimer_try_to_cancel(&bfqd->idle_slice_timer); - bfqd->in_service_queue = NULL; - -From 1521ad11f8684cf0a1b7249249cd406fee50da6d Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Wed, 29 Mar 2017 18:41:46 +0200 -Subject: [PATCH 23/51] BUGFIX: Remove unneeded and deadlock-causing lock in - request_merged - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 2 -- - 1 file changed, 2 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 8e7589d3280f..bb046335ff4f 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -1761,7 +1761,6 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, - BUG_ON(RQ_BFQQ(req) != bfqq); - elv_rb_add(&bfqq->sort_list, req); - -- spin_lock_irq(&bfqd->lock); - /* Choose next request to be served for bfqq */ - prev = bfqq->next_rq; - next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, -@@ -1783,7 +1782,6 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, - bfq_updated_next_req(bfqd, bfqq); - bfq_pos_tree_add_move(bfqd, bfqq); - } -- spin_unlock_irq(&bfqd->lock); - } - } - - -From 9136b4c953918ea937254c57cfb787b55b5bc2c6 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Wed, 29 Mar 2017 18:55:30 +0200 -Subject: [PATCH 24/51] Fix wrong unlikely - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index bb046335ff4f..3ae9bd424b3f 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -4917,7 +4917,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request 
*rq, - bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync, - &new_queue); - -- if (unlikely(!new_queue)) { -+ if (likely(!new_queue)) { - /* If the queue was seeky for too long, break it apart. */ - if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { - BUG_ON(!is_sync); - -From 8e05f722f19645f2278f6962368ca3b7c2a81c9c Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Fri, 12 May 2017 09:51:18 +0200 -Subject: [PATCH 25/51] Change cgroup params prefix to bfq-mq for bfq-mq - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-cgroup-included.c | 54 ++++++++++++++++++++++++++------------------- - 1 file changed, 31 insertions(+), 23 deletions(-) - -diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c -index dfacca799b5e..9e9b0a09e26f 100644 ---- a/block/bfq-cgroup-included.c -+++ b/block/bfq-cgroup-included.c -@@ -995,9 +995,15 @@ bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) - return blkg_to_bfqg(bfqd->queue->root_blkg); - } - -+#ifdef BFQ_MQ -+#define BFQ_CGROUP_FNAME(param) "bfq-mq."#param -+#else -+#define BFQ_CGROUP_FNAME(param) "bfq."#param -+#endif -+ - static struct cftype bfq_blkcg_legacy_files[] = { - { -- .name = "bfq.weight", -+ .name = BFQ_CGROUP_FNAME(weight), - .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = bfq_io_show_weight, - .write_u64 = bfq_io_set_weight_legacy, -@@ -1005,106 +1011,106 @@ static struct cftype bfq_blkcg_legacy_files[] = { - - /* statistics, covers only the tasks in the bfqg */ - { -- .name = "bfq.time", -+ .name = BFQ_CGROUP_FNAME(time), - .private = offsetof(struct bfq_group, stats.time), - .seq_show = bfqg_print_stat, - }, - { -- .name = "bfq.sectors", -+ .name = BFQ_CGROUP_FNAME(sectors), - .seq_show = bfqg_print_stat_sectors, - }, - { -- .name = "bfq.io_service_bytes", -+ .name = BFQ_CGROUP_FNAME(io_service_bytes), - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_bytes, - }, - { -- .name = 
"bfq.io_serviced", -+ .name = BFQ_CGROUP_FNAME(io_serviced), - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_ios, - }, - { -- .name = "bfq.io_service_time", -+ .name = BFQ_CGROUP_FNAME(io_service_time), - .private = offsetof(struct bfq_group, stats.service_time), - .seq_show = bfqg_print_rwstat, - }, - { -- .name = "bfq.io_wait_time", -+ .name = BFQ_CGROUP_FNAME(io_wait_time), - .private = offsetof(struct bfq_group, stats.wait_time), - .seq_show = bfqg_print_rwstat, - }, - { -- .name = "bfq.io_merged", -+ .name = BFQ_CGROUP_FNAME(io_merged), - .private = offsetof(struct bfq_group, stats.merged), - .seq_show = bfqg_print_rwstat, - }, - { -- .name = "bfq.io_queued", -+ .name = BFQ_CGROUP_FNAME(io_queued), - .private = offsetof(struct bfq_group, stats.queued), - .seq_show = bfqg_print_rwstat, - }, - - /* the same statictics which cover the bfqg and its descendants */ - { -- .name = "bfq.time_recursive", -+ .name = BFQ_CGROUP_FNAME(time_recursive), - .private = offsetof(struct bfq_group, stats.time), - .seq_show = bfqg_print_stat_recursive, - }, - { -- .name = "bfq.sectors_recursive", -+ .name = BFQ_CGROUP_FNAME(sectors_recursive), - .seq_show = bfqg_print_stat_sectors_recursive, - }, - { -- .name = "bfq.io_service_bytes_recursive", -+ .name = BFQ_CGROUP_FNAME(io_service_bytes_recursive), - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_bytes_recursive, - }, - { -- .name = "bfq.io_serviced_recursive", -+ .name = BFQ_CGROUP_FNAME(io_serviced_recursive), - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_ios_recursive, - }, - { -- .name = "bfq.io_service_time_recursive", -+ .name = BFQ_CGROUP_FNAME(io_service_time_recursive), - .private = offsetof(struct bfq_group, stats.service_time), - .seq_show = bfqg_print_rwstat_recursive, - }, - { -- .name = "bfq.io_wait_time_recursive", -+ .name = BFQ_CGROUP_FNAME(io_wait_time_recursive), - .private = offsetof(struct bfq_group, stats.wait_time), - 
.seq_show = bfqg_print_rwstat_recursive, - }, - { -- .name = "bfq.io_merged_recursive", -+ .name = BFQ_CGROUP_FNAME(io_merged_recursive), - .private = offsetof(struct bfq_group, stats.merged), - .seq_show = bfqg_print_rwstat_recursive, - }, - { -- .name = "bfq.io_queued_recursive", -+ .name = BFQ_CGROUP_FNAME(io_queued_recursive), - .private = offsetof(struct bfq_group, stats.queued), - .seq_show = bfqg_print_rwstat_recursive, - }, - { -- .name = "bfq.avg_queue_size", -+ .name = BFQ_CGROUP_FNAME(avg_queue_size), - .seq_show = bfqg_print_avg_queue_size, - }, - { -- .name = "bfq.group_wait_time", -+ .name = BFQ_CGROUP_FNAME(group_wait_time), - .private = offsetof(struct bfq_group, stats.group_wait_time), - .seq_show = bfqg_print_stat, - }, - { -- .name = "bfq.idle_time", -+ .name = BFQ_CGROUP_FNAME(idle_time), - .private = offsetof(struct bfq_group, stats.idle_time), - .seq_show = bfqg_print_stat, - }, - { -- .name = "bfq.empty_time", -+ .name = BFQ_CGROUP_FNAME(empty_time), - .private = offsetof(struct bfq_group, stats.empty_time), - .seq_show = bfqg_print_stat, - }, - { -- .name = "bfq.dequeue", -+ .name = BFQ_CGROUP_FNAME(dequeue), - .private = offsetof(struct bfq_group, stats.dequeue), - .seq_show = bfqg_print_stat, - }, -@@ -1113,7 +1119,7 @@ static struct cftype bfq_blkcg_legacy_files[] = { - - static struct cftype bfq_blkg_files[] = { - { -- .name = "bfq.weight", -+ .name = BFQ_CGROUP_FNAME(weight), - .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = bfq_io_show_weight, - .write = bfq_io_set_weight, -@@ -1121,6 +1127,8 @@ static struct cftype bfq_blkg_files[] = { - {} /* terminate */ - }; - -+#undef BFQ_CGROUP_FNAME -+ - #else /* BFQ_GROUP_IOSCHED_ENABLED */ - - static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg, - -From abdf7565dadbb00e78be5f4fb2cc9b157649840e Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Fri, 12 May 2017 11:56:13 +0200 -Subject: [PATCH 26/51] Add tentative extra tests on groups, reqs and queues - 
-Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-cgroup-included.c | 1 + - block/bfq-mq-iosched.c | 5 +++++ - include/linux/blkdev.h | 2 ++ - 3 files changed, 8 insertions(+) - -diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c -index 9e9b0a09e26f..72107ad12220 100644 ---- a/block/bfq-cgroup-included.c -+++ b/block/bfq-cgroup-included.c -@@ -412,6 +412,7 @@ static void bfq_pd_init(struct blkg_policy_data *pd) - BUG_ON(!blkg); - bfqg = blkg_to_bfqg(blkg); - bfqd = blkg->q->elevator->elevator_data; -+ BUG_ON(bfqg == bfqd->root_group); - entity = &bfqg->entity; - d = blkcg_to_bfqgd(blkg->blkcg); - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 3ae9bd424b3f..a9e3406fef06 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -4494,6 +4494,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, - static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) - { - struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; -+ BUG_ON(!bfqq); - - assert_spin_locked(&bfqd->lock); - -@@ -4587,6 +4588,9 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - "insert_request %p in disp: at_head %d", - rq, at_head); - } else { -+ BUG_ON(!(rq->rq_flags & RQF_GOT)); -+ rq->rq_flags &= ~RQF_GOT; -+ - __bfq_insert_request(bfqd, rq); - - if (rq_mergeable(rq)) { -@@ -4974,6 +4978,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - if (unlikely(bfq_bfqq_just_created(bfqq))) - bfq_handle_burst(bfqd, bfqq); - -+ rq->rq_flags |= RQF_GOT; - spin_unlock_irq(&bfqd->lock); - - return 0; -diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h -index 0048e59e6d07..9ae814743095 100644 ---- a/include/linux/blkdev.h -+++ b/include/linux/blkdev.h -@@ -123,6 +123,8 @@ typedef __u32 __bitwise req_flags_t; - #define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18)) - /* DEBUG: rq in bfq-mq dispatch list */ - #define 
RQF_DISP_LIST ((__force req_flags_t)(1 << 19)) -+/* DEBUG: rq had get_rq_private executed on it */ -+#define RQF_GOT ((__force req_flags_t)(1 << 20)) - - /* flags that prevent us from merging requests: */ - #define RQF_NOMERGE_FLAGS \ - -From 9e1c1514bc947c4e04502331372b1cc58459d8d1 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Mon, 15 May 2017 22:25:03 +0200 -Subject: [PATCH 27/51] block, bfq-mq: access and cache blkg data only when - safe - -In blk-cgroup, operations on blkg objects are protected with the -request_queue lock. This is no more the lock that protects -I/O-scheduler operations in blk-mq. In fact, the latter are now -protected with a finer-grained per-scheduler-instance lock. As a -consequence, although blkg lookups are also rcu-protected, blk-mq I/O -schedulers may see inconsistent data when they access blkg and -blkg-related objects. BFQ does access these objects, and does incur -this problem, in the following case. - -The blkg_lookup performed in bfq_get_queue, being protected (only) -through rcu, may happen to return the address of a copy of the -original blkg. If this is the case, then the blkg_get performed in -bfq_get_queue, to pin down the blkg, is useless: it does not prevent -blk-cgroup code from destroying both the original blkg and all objects -directly or indirectly referred by the copy of the blkg. BFQ accesses -these objects, which typically causes a crash for NULL-pointer -dereference of memory-protection violation. - -Some additional protection mechanism should be added to blk-cgroup to -address this issue. In the meantime, this commit provides a quick -temporary fix for BFQ: cache (when safe) blkg data that might -disappear right after a blkg_lookup. - -In particular, this commit exploits the following facts to achieve its -goal without introducing further locks. Destroy operations on a blkg -invoke, as a first step, hooks of the scheduler associated with the -blkg. 
And these hooks are executed with bfqd->lock held for BFQ. As a -consequence, for any blkg associated with the request queue an -instance of BFQ is attached to, we are guaranteed that such a blkg is -not destroyed, and that all the pointers it contains are consistent, -while that instance is holding its bfqd->lock. A blkg_lookup performed -with bfqd->lock held then returns a fully consistent blkg, which -remains consistent until this lock is held. In more detail, this holds -even if the returned blkg is a copy of the original one. - -Finally, also the object describing a group inside BFQ needs to be -protected from destruction on the blkg_free of the original blkg -(which invokes bfq_pd_free). This commit adds private refcounting for -this object, to let it disappear only after no bfq_queue refers to it -any longer. - -This commit also removes or updates some stale comments on locking -issues related to blk-cgroup operations. - -Reported-by: Tomas Konir <tomas.konir@gmail.com> -Reported-by: Lee Tibbert <lee.tibbert@gmail.com> -Reported-by: Marco Piazza <mpiazza@gmail.com> -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> -Tested-by: Tomas Konir <tomas.konir@gmail.com> -Tested-by: Lee Tibbert <lee.tibbert@gmail.com> -Tested-by: Marco Piazza <mpiazza@gmail.com> ---- - block/bfq-cgroup-included.c | 149 ++++++++++++++++++++++++++++++++++++++++---- - block/bfq-mq-iosched.c | 2 +- - block/bfq-mq.h | 26 +++----- - 3 files changed, 148 insertions(+), 29 deletions(-) - -diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c -index 72107ad12220..d903393ee78a 100644 ---- a/block/bfq-cgroup-included.c -+++ b/block/bfq-cgroup-included.c -@@ -43,7 +43,11 @@ BFQG_FLAG_FNS(idling) - BFQG_FLAG_FNS(empty) - #undef BFQG_FLAG_FNS - -+#ifdef BFQ_MQ -+/* This should be called with the scheduler lock held. */ -+#else - /* This should be called with the queue_lock held. 
*/ -+#endif - static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) - { - unsigned long long now; -@@ -58,7 +62,11 @@ static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) - bfqg_stats_clear_waiting(stats); - } - -+#ifdef BFQ_MQ -+/* This should be called with the scheduler lock held. */ -+#else - /* This should be called with the queue_lock held. */ -+#endif - static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, - struct bfq_group *curr_bfqg) - { -@@ -72,7 +80,11 @@ static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, - bfqg_stats_mark_waiting(stats); - } - -+#ifdef BFQ_MQ -+/* This should be called with the scheduler lock held. */ -+#else - /* This should be called with the queue_lock held. */ -+#endif - static void bfqg_stats_end_empty_time(struct bfqg_stats *stats) - { - unsigned long long now; -@@ -198,14 +210,43 @@ static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) - - static void bfqg_get(struct bfq_group *bfqg) - { -- return blkg_get(bfqg_to_blkg(bfqg)); -+#ifdef BFQ_MQ -+ bfqg->ref++; -+#else -+ blkg_get(bfqg_to_blkg(bfqg)); -+#endif - } - - static void bfqg_put(struct bfq_group *bfqg) - { -- return blkg_put(bfqg_to_blkg(bfqg)); -+#ifdef BFQ_MQ -+ bfqg->ref--; -+ -+ BUG_ON(bfqg->ref < 0); -+ if (bfqg->ref == 0) -+ kfree(bfqg); -+#else -+ blkg_put(bfqg_to_blkg(bfqg)); -+#endif -+} -+ -+#ifdef BFQ_MQ -+static void bfqg_and_blkg_get(struct bfq_group *bfqg) -+{ -+ /* see comments in bfq_bic_update_cgroup for why refcounting bfqg */ -+ bfqg_get(bfqg); -+ -+ blkg_get(bfqg_to_blkg(bfqg)); - } - -+static void bfqg_and_blkg_put(struct bfq_group *bfqg) -+{ -+ bfqg_put(bfqg); -+ -+ blkg_put(bfqg_to_blkg(bfqg)); -+} -+#endif -+ - static void bfqg_stats_update_io_add(struct bfq_group *bfqg, - struct bfq_queue *bfqq, - unsigned int op) -@@ -310,7 +351,15 @@ static void bfq_init_entity(struct bfq_entity *entity, - if (bfqq) { - bfqq->ioprio = bfqq->new_ioprio; - bfqq->ioprio_class = 
bfqq->new_ioprio_class; -+#ifdef BFQ_MQ -+ /* -+ * Make sure that bfqg and its associated blkg do not -+ * disappear before entity. -+ */ -+ bfqg_and_blkg_get(bfqg); -+#else - bfqg_get(bfqg); -+#endif - } - entity->parent = bfqg->my_entity; /* NULL for root group */ - entity->sched_data = &bfqg->sched_data; -@@ -397,6 +446,10 @@ static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) - return NULL; - } - -+#ifdef BFQ_MQ -+ /* see comments in bfq_bic_update_cgroup for why refcounting */ -+ bfqg_get(bfqg); -+#endif - return &bfqg->pd; - } - -@@ -432,7 +485,11 @@ static void bfq_pd_free(struct blkg_policy_data *pd) - struct bfq_group *bfqg = pd_to_bfqg(pd); - - bfqg_stats_exit(&bfqg->stats); -- return kfree(bfqg); -+#ifdef BFQ_MQ -+ bfqg_put(bfqg); -+#else -+ kfree(bfqg); -+#endif - } - - static void bfq_pd_reset_stats(struct blkg_policy_data *pd) -@@ -516,9 +573,16 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, - * Move @bfqq to @bfqg, deactivating it from its old group and reactivating - * it on the new one. Avoid putting the entity on the old group idle tree. - * -+#ifdef BFQ_MQ -+ * Must be called under the scheduler lock, to make sure that the blkg -+ * owning @bfqg does not disappear (see comments in -+ * bfq_bic_update_cgroup on guaranteeing the consistency of blkg -+ * objects). -+#else - * Must be called under the queue lock; the cgroup owning @bfqg must - * not disappear (by now this just means that we are called under - * rcu_read_lock()). -+#endif - */ - static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, - struct bfq_group *bfqg) -@@ -555,16 +619,20 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, - entity->tree); - bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); - } -+#ifdef BFQ_MQ -+ bfqg_and_blkg_put(bfqq_group(bfqq)); -+#else - bfqg_put(bfqq_group(bfqq)); -+#endif - -- /* -- * Here we use a reference to bfqg. 
We don't need a refcounter -- * as the cgroup reference will not be dropped, so that its -- * destroy() callback will not be invoked. -- */ - entity->parent = bfqg->my_entity; - entity->sched_data = &bfqg->sched_data; -+#ifdef BFQ_MQ -+ /* pin down bfqg and its associated blkg */ -+ bfqg_and_blkg_get(bfqg); -+#else - bfqg_get(bfqg); -+#endif - - BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq)); - if (bfq_bfqq_busy(bfqq)) { -@@ -585,8 +653,14 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, - * @bic: the bic to move. - * @blkcg: the blk-cgroup to move to. - * -+#ifdef BFQ_MQ -+ * Move bic to blkcg, assuming that bfqd->lock is held; which makes -+ * sure that the reference to cgroup is valid across the call (see -+ * comments in bfq_bic_update_cgroup on this issue) -+#else - * Move bic to blkcg, assuming that bfqd->queue is locked; the caller - * has to make sure that the reference to cgroup is valid across the call. -+#endif - * - * NOTE: an alternative approach might have been to store the current - * cgroup in bfqq and getting a reference to it, reducing the lookup -@@ -645,6 +719,59 @@ static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) - goto out; - - bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio)); -+#ifdef BFQ_MQ -+ /* -+ * Update blkg_path for bfq_log_* functions. We cache this -+ * path, and update it here, for the following -+ * reasons. Operations on blkg objects in blk-cgroup are -+ * protected with the request_queue lock, and not with the -+ * lock that protects the instances of this scheduler -+ * (bfqd->lock). This exposes BFQ to the following sort of -+ * race. -+ * -+ * The blkg_lookup performed in bfq_get_queue, protected -+ * through rcu, may happen to return the address of a copy of -+ * the original blkg. 
If this is the case, then the -+ * bfqg_and_blkg_get performed in bfq_get_queue, to pin down -+ * the blkg, is useless: it does not prevent blk-cgroup code -+ * from destroying both the original blkg and all objects -+ * directly or indirectly referred by the copy of the -+ * blkg. -+ * -+ * On the bright side, destroy operations on a blkg invoke, as -+ * a first step, hooks of the scheduler associated with the -+ * blkg. And these hooks are executed with bfqd->lock held for -+ * BFQ. As a consequence, for any blkg associated with the -+ * request queue this instance of the scheduler is attached -+ * to, we are guaranteed that such a blkg is not destroyed, and -+ * that all the pointers it contains are consistent, while we -+ * are holding bfqd->lock. A blkg_lookup performed with -+ * bfqd->lock held then returns a fully consistent blkg, which -+ * remains consistent until this lock is held. -+ * -+ * Thanks to the last fact, and to the fact that: (1) bfqg has -+ * been obtained through a blkg_lookup in the above -+ * assignment, and (2) bfqd->lock is being held, here we can -+ * safely use the policy data for the involved blkg (i.e., the -+ * field bfqg->pd) to get to the blkg associated with bfqg, -+ * and then we can safely use any field of blkg. After we -+ * release bfqd->lock, even just getting blkg through this -+ * bfqg may cause dangling references to be traversed, as -+ * bfqg->pd may not exist any more. -+ * -+ * In view of the above facts, here we cache, in the bfqg, any -+ * blkg data we may need for this bic, and for its associated -+ * bfq_queue. As of now, we need to cache only the path of the -+ * blkg, which is used in the bfq_log_* functions. -+ * -+ * Finally, note that bfqg itself needs to be protected from -+ * destruction on the blkg_free of the original blkg (which -+ * invokes bfq_pd_free). We use an additional private -+ * refcounter for bfqg, to let it disappear only after no -+ * bfq_queue refers to it any longer. 
-+ */ -+ blkg_path(bfqg_to_blkg(bfqg), bfqg->blkg_path, sizeof(bfqg->blkg_path)); -+#endif - bic->blkcg_serial_nr = serial_nr; - out: - rcu_read_unlock(); -@@ -682,8 +809,6 @@ static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, - * @bfqd: the device data structure with the root group. - * @bfqg: the group to move from. - * @st: the service tree with the entities. -- * -- * Needs queue_lock to be taken and reference to be valid over the call. - */ - static void bfq_reparent_active_entities(struct bfq_data *bfqd, - struct bfq_group *bfqg, -@@ -736,6 +861,7 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) - #ifdef BFQ_MQ - spin_lock_irqsave(&bfqd->lock, flags); - #endif -+ - /* - * Empty all service_trees belonging to this group before - * deactivating the group itself. -@@ -746,8 +872,7 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) - /* - * The idle tree may still contain bfq_queues belonging - * to exited task because they never migrated to a different -- * cgroup from the one being destroyed now. No one else -- * can access them so it's safe to act without any lock. -+ * cgroup from the one being destroyed now. - */ - bfq_flush_idle_tree(st); - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index a9e3406fef06..4eb668eeacdc 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -4073,7 +4073,7 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - - kmem_cache_free(bfq_pool, bfqq); - #ifdef BFQ_GROUP_IOSCHED_ENABLED -- bfqg_put(bfqg); -+ bfqg_and_blkg_put(bfqg); - #endif - } - -diff --git a/block/bfq-mq.h b/block/bfq-mq.h -index 36ee24a87dda..77ab0f22ed22 100644 ---- a/block/bfq-mq.h -+++ b/block/bfq-mq.h -@@ -695,23 +695,17 @@ static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); - static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - - #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) 
do { \ -- char __pbuf[128]; \ -- \ -- blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ - pr_crit("%s bfq%d%c %s " fmt "\n", \ - checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ - (bfqq)->pid, \ - bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -- __pbuf, ##args); \ -+ bfqq_group(bfqq)->blkg_path, ##args); \ - } while (0) - - #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ -- char __pbuf[128]; \ -- \ -- blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ - pr_crit("%s %s " fmt "\n", \ - checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -- __pbuf, ##args); \ -+ bfqg->blkg_path, ##args); \ - } while (0) - - #else /* BFQ_GROUP_IOSCHED_ENABLED */ -@@ -736,20 +730,14 @@ static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); - static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - - #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ -- char __pbuf[128]; \ -- \ -- blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ - blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, \ - (bfqq)->pid, \ - bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -- __pbuf, ##args); \ -+ bfqq_group(bfqq)->blkg_path, ##args); \ - } while (0) - - #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) 
do { \ -- char __pbuf[128]; \ -- \ -- blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ -- blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \ -+ blk_add_trace_msg((bfqd)->queue, "%s " fmt, bfqg->blkg_path, ##args);\ - } while (0) - - #else /* BFQ_GROUP_IOSCHED_ENABLED */ -@@ -860,6 +848,12 @@ struct bfq_group { - /* must be the first member */ - struct blkg_policy_data pd; - -+ /* cached path for this blkg (see comments in bfq_bic_update_cgroup) */ -+ char blkg_path[128]; -+ -+ /* reference counter (see comments in bfq_bic_update_cgroup) */ -+ int ref; -+ - struct bfq_entity entity; - struct bfq_sched_data sched_data; - - -From c9137b749aceef6c2dde88e99b2fc978d5952e76 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Sat, 17 Jun 2017 11:18:11 +0200 -Subject: [PATCH 28/51] bfq-mq: fix macro name in conditional invocation of - policy_unregister - -This commit fixes the name of the macro in the conditional group that -invokes blkcg_policy_unregister in bfq_exit for bfq-mq. Because of -this error, blkcg_policy_unregister was never invoked. - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 4eb668eeacdc..bc1de3f70ea8 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -5669,7 +5669,7 @@ static int __init bfq_init(void) - static void __exit bfq_exit(void) - { - elv_unregister(&iosched_bfq_mq); --#ifdef CONFIG_BFQ_GROUP_ENABLED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - blkcg_policy_unregister(&blkcg_policy_bfq); - #endif - bfq_slab_kill(); - -From c7ceb37496f63b2dba4d06946ab85ec97b87bfb5 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Wed, 5 Jul 2017 11:48:17 +0200 -Subject: [PATCH 29/51] Port of "blk-mq-sched: unify request finished methods" - -No need to have two different callouts of bfq vs kyber. 
- -Signed-off-by: Christoph Hellwig <hch@lst.de> -Signed-off-by: Jens Axboe <axboe@kernel.dk> ---- - block/bfq-mq-iosched.c | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index bc1de3f70ea8..2598602a0b10 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -4753,7 +4753,7 @@ static void bfq_put_rq_priv_body(struct bfq_queue *bfqq) - bfq_put_queue(bfqq); - } - --static void bfq_put_rq_private(struct request_queue *q, struct request *rq) -+static void bfq_finish_request(struct request *rq) - { - struct bfq_queue *bfqq; - struct bfq_data *bfqd; -@@ -4814,7 +4814,7 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq) - - assert_spin_locked(&bfqd->lock); - if (!RB_EMPTY_NODE(&rq->rb_node)) -- bfq_remove_request(q, rq); -+ bfq_remove_request(rq->q, rq); - bfq_put_rq_priv_body(bfqq); - } - -@@ -5558,7 +5558,7 @@ static struct elv_fs_entry bfq_attrs[] = { - static struct elevator_type iosched_bfq_mq = { - .ops.mq = { - .get_rq_priv = bfq_get_rq_private, -- .put_rq_priv = bfq_put_rq_private, -+ .finish_request = bfq_finish_request, - .exit_icq = bfq_exit_icq, - .insert_requests = bfq_insert_requests, - .dispatch_request = bfq_dispatch_request, - -From 12bef026fe114ab5e2e284772ddc52a8be83fdbc Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Wed, 5 Jul 2017 11:54:57 +0200 -Subject: [PATCH 30/51] Port of "bfq-iosched: fix NULL ioc check in - bfq_get_rq_private" - -icq_to_bic is a container_of operation, so we need to check for NULL -before it. Also move the check outside the spinlock while we're at -it. 
- -Signed-off-by: Christoph Hellwig <hch@lst.de> -Signed-off-by: Jens Axboe <axboe@kernel.dk> ---- - block/bfq-mq-iosched.c | 15 +++++---------- - 1 file changed, 5 insertions(+), 10 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 2598602a0b10..c57774a60911 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -4903,16 +4903,17 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - struct bio *bio) - { - struct bfq_data *bfqd = q->elevator->elevator_data; -- struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq); -+ struct bfq_io_cq *bic; - const int is_sync = rq_is_sync(rq); - struct bfq_queue *bfqq; - bool bfqq_already_existing = false, split = false; - bool new_queue = false; - -- spin_lock_irq(&bfqd->lock); -+ if (!rq->elv.icq) -+ return 1; -+ bic = icq_to_bic(rq->elv.icq); - -- if (!bic) -- goto queue_fail; -+ spin_lock_irq(&bfqd->lock); - - bfq_check_ioprio_change(bic, bio); - -@@ -4980,13 +4981,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - - rq->rq_flags |= RQF_GOT; - spin_unlock_irq(&bfqd->lock); -- - return 0; -- --queue_fail: -- spin_unlock_irq(&bfqd->lock); -- -- return 1; - } - - static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) - -From 633e5711347df1bf4ca935fd0aa9118a0054f75d Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Wed, 5 Jul 2017 12:02:16 +0200 -Subject: [PATCH 31/51] Port of "blk-mq-sched: unify request prepare methods" - -This patch makes sure we always allocate requests in the core blk-mq -code and use a common prepare_request method to initialize them for -both mq I/O schedulers. For Kyber an additional limit_depth method -is added that is called before allocating the request. - -Also because none of the initializations can really fail the new method -does not return an error - instead the bfq finish method is hardened -to deal with the no-IOC case.
- -Last but not least this removes the abuse of RQF_QUEUE by the blk-mq -scheduling code as RQF_ELFPRIV is all that is needed now. - -Signed-off-by: Christoph Hellwig <hch@lst.de> -Signed-off-by: Jens Axboe <axboe@kernel.dk> ---- - block/bfq-mq-iosched.c | 13 ++++++++----- - 1 file changed, 8 insertions(+), 5 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index c57774a60911..49ffca1ad6e7 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -4760,6 +4760,10 @@ static void bfq_finish_request(struct request *rq) - struct bfq_io_cq *bic; - - BUG_ON(!rq); -+ -+ if (!rq->elv.icq) -+ return; -+ - bfqq = RQ_BFQQ(rq); - BUG_ON(!bfqq); - -@@ -4899,9 +4903,9 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, - /* - * Allocate bfq data structures associated with this request. - */ --static int bfq_get_rq_private(struct request_queue *q, struct request *rq, -- struct bio *bio) -+static void bfq_prepare_request(struct request *rq, struct bio *bio) - { -+ struct request_queue *q = rq->q; - struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_io_cq *bic; - const int is_sync = rq_is_sync(rq); -@@ -4910,7 +4914,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - bool new_queue = false; - - if (!rq->elv.icq) -- return 1; -+ return; - bic = icq_to_bic(rq->elv.icq); - - spin_lock_irq(&bfqd->lock); -@@ -4981,7 +4985,6 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - - rq->rq_flags |= RQF_GOT; - spin_unlock_irq(&bfqd->lock); -- return 0; - } - - static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) -@@ -5552,7 +5555,7 @@ static struct elv_fs_entry bfq_attrs[] = { - - static struct elevator_type iosched_bfq_mq = { - .ops.mq = { -- .get_rq_priv = bfq_get_rq_private, -+ .prepare_request = bfq_prepare_request, - .finish_request = bfq_finish_request, - .exit_icq = bfq_exit_icq, - .insert_requests = bfq_insert_requests, - -From 
5a321acfce282c3e58ac63582faf6f928ad17f27 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Wed, 5 Jul 2017 12:43:22 +0200 -Subject: [PATCH 32/51] Add list of bfq instances to documentation - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - Documentation/block/bfq-iosched.txt | 11 ++++++++++- - 1 file changed, 10 insertions(+), 1 deletion(-) - -diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt -index 3d6951d63489..8ce6b9a9bacd 100644 ---- a/Documentation/block/bfq-iosched.txt -+++ b/Documentation/block/bfq-iosched.txt -@@ -11,6 +11,15 @@ controllers), BFQ's main features are: - groups (switching back to time distribution when needed to keep - throughput high). - -+If bfq-mq patches have been applied, then the following three -+instances of BFQ are available (otherwise only the first instance): -+- bfq: mainline version of BFQ, for blk-mq -+- bfq-mq: development version of BFQ for blk-mq; this version contains -+ also all latest features not yet landed in mainline, plus many -+ safety checks -+- bfq: BFQ for legacy blk; also this version contains both latest -+ features and safety checks -+ - In its default configuration, BFQ privileges latency over - throughput. So, when needed for achieving a lower latency, BFQ builds - schedules that may lead to a lower throughput. If your main or only -@@ -27,7 +36,7 @@ sequential I/O (e.g., 8-12 GB/s if I/O requests are 256 KB large), and - to 120-200 MB/s with 4KB random I/O. BFQ is currently being tested on - multi-queue devices too. - --The table of contents follow. Impatients can just jump to Section 3. -+The table of contents follows. Impatients can just jump to Section 3. 
- - CONTENTS - - -From 9f2e5b27227fd9254cc258572dc2d4531838c30b Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Wed, 5 Jul 2017 16:28:00 +0200 -Subject: [PATCH 33/51] bfq-sq: fix prefix of names of cgroups parameters - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - Documentation/block/bfq-iosched.txt | 12 +++++++----- - block/bfq-cgroup-included.c | 2 +- - 2 files changed, 8 insertions(+), 6 deletions(-) - -diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt -index 8ce6b9a9bacd..965d82f94db9 100644 ---- a/Documentation/block/bfq-iosched.txt -+++ b/Documentation/block/bfq-iosched.txt -@@ -503,10 +503,12 @@ To get proportional sharing of bandwidth with BFQ for a given device, - BFQ must of course be the active scheduler for that device. - - Within each group directory, the names of the files associated with --BFQ-specific cgroup parameters and stats begin with the "bfq." --prefix. So, with cgroups-v1 or cgroups-v2, the full prefix for --BFQ-specific files is "blkio.bfq." or "io.bfq." For example, the group --parameter to set the weight of a group with BFQ is blkio.bfq.weight -+BFQ-specific cgroup parameters and stats begin with the "bfq.", -+"bfq-sq." or "bfq-mq." prefix, depending on which instance of bfq you -+want to use. So, with cgroups-v1 or cgroups-v2, the full prefix for -+BFQ-specific files is "blkio.bfqX." or "io.bfqX.", where X can be "" -+(i.e., null string), "-sq" or "-mq". For example, the group parameter -+to set the weight of a group with the mainline BFQ is blkio.bfq.weight - or io.bfq.weight. - - Parameters to set -@@ -514,7 +516,7 @@ Parameters to set - - For each group, there is only the following parameter to set. - --weight (namely blkio.bfq.weight or io.bfq-weight): the weight of the -+weight (namely blkio.bfqX.weight or io.bfqX.weight): the weight of the - group inside its parent. Available values: 1..10000 (default 100). 
The - linear mapping between ioprio and weights, described at the beginning - of the tunable section, is still valid, but all weights higher than -diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c -index d903393ee78a..631e53d9150d 100644 ---- a/block/bfq-cgroup-included.c -+++ b/block/bfq-cgroup-included.c -@@ -1124,7 +1124,7 @@ bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) - #ifdef BFQ_MQ - #define BFQ_CGROUP_FNAME(param) "bfq-mq."#param - #else --#define BFQ_CGROUP_FNAME(param) "bfq."#param -+#define BFQ_CGROUP_FNAME(param) "bfq-sq."#param - #endif - - static struct cftype bfq_blkcg_legacy_files[] = { - -From 92b42df8166939ccf26aa450125b5b575cf6d505 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Wed, 5 Jul 2017 21:08:32 +0200 -Subject: [PATCH 34/51] Add to documentation that bfq-mq and bfq-sq contain - last fixes too - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - Documentation/block/bfq-iosched.txt | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt -index 965d82f94db9..0e59f1c9d30e 100644 ---- a/Documentation/block/bfq-iosched.txt -+++ b/Documentation/block/bfq-iosched.txt -@@ -15,10 +15,10 @@ If bfq-mq patches have been applied, then the following three - instances of BFQ are available (otherwise only the first instance): - - bfq: mainline version of BFQ, for blk-mq - - bfq-mq: development version of BFQ for blk-mq; this version contains -- also all latest features not yet landed in mainline, plus many -+ also all latest features and fixes not yet landed in mainline, plus many - safety checks --- bfq: BFQ for legacy blk; also this version contains both latest -- features and safety checks -+- bfq: BFQ for legacy blk; also this version contains latest features -+ and fixes, as well as safety checks - - In its default configuration, BFQ privileges latency over - throughput. 
So, when needed for achieving a lower latency, BFQ builds - -From 7f9bdd433b848d4f53c167258bf4d0b3f1ae1923 Mon Sep 17 00:00:00 2001 -From: Lee Tibbert <lee.tibbert@gmail.com> -Date: Wed, 19 Jul 2017 10:28:32 -0400 -Subject: [PATCH 35/51] Improve most frequently used no-logging path - -This patch originated as a fix for compiler unused-variable warnings -issued when compiling bfq-mq with logging disabled (both -CONFIG_BLK_DEV_IO_TRACE and CONFIG_BFQ_REDIRECT_TO_CONSOLE -undefined). - -It turns out to also have benefits for the bfq-sq path as well. - -In most performance sensitive production builds blktrace_api logging -will probably be turned off, so it is worth making the no-logging path -compile without warnings. Any performance benefit is a bonus. - -Thank you to T. B. on the bfq-iosched@googlegroups.com list -for ((void) (bfqq)) simplification/suggestion/improvement. All bugs -and unclear descriptions are my own doing. - -The discussion below is based on the gcc compiler with optimization -level of at least O2. Lower optimization levels are unlikely to -remove no-op instruction equivalents. - -Provide three improvements in this likely case. - - 1) Fix multiple occurrences of an unused-variable warning - issued when compiling bfq-mq with no logging. The warning - occurred each time the bfq_log_bfqg macro was expanded inside - a code block such as the following snippet from - block/bfq-sched.c, line 139 and few following, lightly edited for - indentation in order to pass checkpatch.pl maximum line lengths. - -else { - struct bfq_group *bfqg = - container_of(next_in_service, - struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, - "update_next_in_service: chosen this entity"); - } - - Previously bfq-mq.h expanded bfq_log_bfqg to blk_add_trace_msg. - When both bfq console logging and blktrace_api logging are - disabled, include/linux/blktrace_api expands to - do { } while (0), leaving the code block local variable unused.
- bfq_log_bfqq() had similar behavior but is never called with - a potentially unused variable. This patch fixes that macro for - consistency. - - bfq-sq.h (single queue) with blktrace_api enabled, and the bfq - console logging macros have code paths which do not trigger this - warning. - - kernel.org (4.12 & 4.13) bfq (bfq-iosched.h) could trigger - the warning but no code does so now. This patch fixes - bfq-iosched.h for consistency. - - The style above enables a software engineering approach where - complex expressions are moved to a local variable before the - bfq_log* call. This makes it easier to read the expression and - use breakpoints to verify it. bfq-mq uses this approach in - several places. - - New bfq_log* macros are provided for the no-logging case. - I touch only the second argument, because current code never - uses the local variable approach with the first or other - arguments. I tried to balance consistency with simplicity. - - 2) For bfq-sq, reduce to zero, the number of instructions executed - when no logging is configured. No sense marshaling arguments - which are never going to be used. - - On a trial V8R11 build, this reduced the size of bfq-iosched.o - by 14.3 KiB. The size went from 70304 to 55664 bytes. - - bfq-mq and kernel.org bfq code size does not change because - existing macros already optimize to zero bytes when not logging. - The current changes maintain consistency with the bfq-sq path - and make the bfq-mq & bfq no-logging paths resistant to future - logging path macro changes which might cause generated code. - - 3) Slightly reduce compile time of all bfq variants by including - blktrace_api.h only when it will be used.
- -Signed-off-by: Lee Tibbert <lee.tibbert@gmail.com> ---- - block/bfq-mq.h | 18 +++++++++++++++++- - block/bfq.h | 18 +++++++++++++++++- - 2 files changed, 34 insertions(+), 2 deletions(-) - -diff --git a/block/bfq-mq.h b/block/bfq-mq.h -index 77ab0f22ed22..7ed2cc29be57 100644 ---- a/block/bfq-mq.h -+++ b/block/bfq-mq.h -@@ -15,7 +15,6 @@ - #ifndef _BFQ_H - #define _BFQ_H - --#include <linux/blktrace_api.h> - #include <linux/hrtimer.h> - #include <linux/blk-cgroup.h> - -@@ -725,6 +724,21 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - ##args) - - #else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ -+ -+#if !defined(CONFIG_BLK_DEV_IO_TRACE) -+ -+/* Avoid possible "unused-variable" warning. See commit message. */ -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) ((void) (bfqq)) -+ -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) ((void) (bfqg)) -+ -+#define bfq_log(bfqd, fmt, args...) do {} while (0) -+ -+#else /* CONFIG_BLK_DEV_IO_TRACE */ -+ -+#include <linux/blktrace_api.h> -+ - #ifdef BFQ_GROUP_IOSCHED_ENABLED - static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); - static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); -@@ -752,6 +766,8 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - - #define bfq_log(bfqd, fmt, args...) \ - blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) -+ -+#endif /* CONFIG_BLK_DEV_IO_TRACE */ - #endif /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ - - /* Expiration reasons. */ -diff --git a/block/bfq.h b/block/bfq.h -index 53954d1b87f8..15d326f466b7 100644 ---- a/block/bfq.h -+++ b/block/bfq.h -@@ -15,7 +15,6 @@ - #ifndef _BFQ_H - #define _BFQ_H - --#include <linux/blktrace_api.h> - #include <linux/hrtimer.h> - #include <linux/blk-cgroup.h> - -@@ -725,6 +724,21 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - ##args) - - #else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ -+ -+#if !defined(CONFIG_BLK_DEV_IO_TRACE) -+ -+/* Avoid possible "unused-variable" warning. See commit message. 
*/ -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) ((void) (bfqq)) -+ -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) ((void) (bfqg)) -+ -+#define bfq_log(bfqd, fmt, args...) do {} while (0) -+ -+#else /* CONFIG_BLK_DEV_IO_TRACE */ -+ -+#include <linux/blktrace_api.h> -+ - #ifdef BFQ_GROUP_IOSCHED_ENABLED - static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); - static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); -@@ -759,6 +773,8 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - - #define bfq_log(bfqd, fmt, args...) \ - blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) -+ -+#endif /* CONFIG_BLK_DEV_IO_TRACE */ - #endif /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ - - /* Expiration reasons. */ - -From f11a0e751e741bf94c6a48234824d50b3c0100ad Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Wed, 9 Aug 2017 16:40:39 +0200 -Subject: [PATCH 36/51] bfq-sq: fix commit "Remove all get and put of I/O - contexts" in branch bfq-mq - -The commit "Remove all get and put of I/O contexts" erroneously removed -the reset of the field in_service_bic for bfq-sq. This commit re-adds -that missing reset. 
- -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-sched.c | 7 +++++++ - block/bfq-sq-iosched.c | 1 + - 2 files changed, 8 insertions(+) - -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index 9c4e6797d8c9..7425824c26b8 100644 ---- a/block/bfq-sched.c -+++ b/block/bfq-sched.c -@@ -1904,6 +1904,13 @@ static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) - struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity; - struct bfq_entity *entity = in_serv_entity; - -+#ifndef BFQ_MQ -+ if (bfqd->in_service_bic) { -+ put_io_context(bfqd->in_service_bic->icq.ioc); -+ bfqd->in_service_bic = NULL; -+ } -+#endif -+ - bfq_clear_bfqq_wait_request(in_serv_bfqq); - hrtimer_try_to_cancel(&bfqd->idle_slice_timer); - bfqd->in_service_queue = NULL; -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index 25da0d1c0622..e1960bf149d8 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -3765,6 +3765,7 @@ static int bfq_dispatch_request(struct bfq_data *bfqd, - if (!bfqd->in_service_bic) { - atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); - bfqd->in_service_bic = RQ_BIC(rq); -+ BUG_ON(!bfqd->in_service_bic); - } - - if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq)) - -From eceae5457530df8598557767d7be258ca9384de4 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Wed, 9 Aug 2017 22:29:01 +0200 -Subject: [PATCH 37/51] bfq-sq-mq: make lookup_next_entity push up vtime on - expirations - -To provide a very smooth service, bfq starts to serve a bfq_queue -only if the queue is 'eligible', i.e., if the same queue would -have started to be served in the ideal, perfectly fair system that -bfq simulates internally. This is obtained by associating each -queue with a virtual start time, and by computing a special system -virtual time quantity: a queue is eligible only if the system -virtual time has reached the virtual start time of the -queue. 
Finally, bfq guarantees that, when a new queue must be set -in service, there is always at least one eligible entity for each -active parent entity in the scheduler. To provide this guarantee, -the function __bfq_lookup_next_entity pushes up, for each parent -entity on which it is invoked, the system virtual time to the -minimum among the virtual start times of the entities in the -active tree for the parent entity (more precisely, the push up -occurs if the system virtual time happens to be lower than all -such virtual start times). - -There is however a circumstance in which __bfq_lookup_next_entity -cannot push up the system virtual time for a parent entity, even -if the system virtual time is lower than the virtual start times -of all the child entities in the active tree. It happens if one of -the child entities is in service. In fact, in such a case, there -is already an eligible entity, the in-service one, even if it may -not be present in the active tree (because in-service entities -may be removed from the active tree). - -Unfortunately, in the last re-design of the -hierarchical-scheduling engine, the reset of the pointer to the -in-service entity for a given parent entity--reset to be done as a -consequence of the expiration of the in-service entity--always -happens after the function __bfq_lookup_next_entity has been -invoked. This causes the function to think that there is still an -entity in service for the parent entity, and then that the system -virtual time cannot be pushed up, even if actually such a -no-more-in-service entity has already been properly reinserted -into the active tree (or in some other tree if no more -active). Yet, the system virtual time *had* to be pushed up, to be -ready to correctly choose the next queue to serve.
Because of the -lack of this push up, bfq may wrongly set in service a queue that -had been speculatively pre-computed as the possible -next-in-service queue, but that would no more be the one to serve -after the expiration and the reinsertion into the active trees of -the previously in-service entities. - -This commit addresses this issue by making -__bfq_lookup_next_entity properly push up the system virtual time -if an expiration is occurring. - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 4 +-- - block/bfq-sched.c | 77 ++++++++++++++++++++++++++++++++------------------ - block/bfq-sq-iosched.c | 4 +-- - 3 files changed, 53 insertions(+), 32 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 49ffca1ad6e7..b5c848650375 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -682,7 +682,7 @@ static void bfq_updated_next_req(struct bfq_data *bfqd, - entity->budget = new_budget; - bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", - new_budget); -- bfq_requeue_bfqq(bfqd, bfqq); -+ bfq_requeue_bfqq(bfqd, bfqq, false); - } - } - -@@ -2822,7 +2822,7 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) - - bfq_del_bfqq_busy(bfqd, bfqq, true); - } else { -- bfq_requeue_bfqq(bfqd, bfqq); -+ bfq_requeue_bfqq(bfqd, bfqq, true); - /* - * Resort priority tree of potential close cooperators. 
- */ -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index 7425824c26b8..f3001af37256 100644 ---- a/block/bfq-sched.c -+++ b/block/bfq-sched.c -@@ -33,7 +33,8 @@ static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree) - return rb_entry(node, struct bfq_entity, rb_node); - } - --static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd); -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, -+ bool expiration); - - static bool bfq_update_parent_budget(struct bfq_entity *next_in_service); - -@@ -43,6 +44,8 @@ static bool bfq_update_parent_budget(struct bfq_entity *next_in_service); - * @new_entity: if not NULL, pointer to the entity whose activation, - * requeueing or repositionig triggered the invocation of - * this function. -+ * @expiration: id true, this function is being invoked after the -+ * expiration of the in-service entity - * - * This function is called to update sd->next_in_service, which, in - * its turn, may change as a consequence of the insertion or -@@ -61,7 +64,8 @@ static bool bfq_update_parent_budget(struct bfq_entity *next_in_service); - * entity. 
- */ - static bool bfq_update_next_in_service(struct bfq_sched_data *sd, -- struct bfq_entity *new_entity) -+ struct bfq_entity *new_entity, -+ bool expiration) - { - struct bfq_entity *next_in_service = sd->next_in_service; - struct bfq_queue *bfqq; -@@ -120,7 +124,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, - if (replace_next) - next_in_service = new_entity; - } else /* invoked because of a deactivation: lookup needed */ -- next_in_service = bfq_lookup_next_entity(sd); -+ next_in_service = bfq_lookup_next_entity(sd, expiration); - - if (next_in_service) { - parent_sched_may_change = !sd->next_in_service || -@@ -1291,10 +1295,12 @@ static void __bfq_activate_requeue_entity(struct bfq_entity *entity, - * @requeue: true if this is a requeue, which implies that bfqq is - * being expired; thus ALL its ancestors stop being served and must - * therefore be requeued -+ * @expiration: true if this function is being invoked in the expiration path -+ * of the in-service queue - */ - static void bfq_activate_requeue_entity(struct bfq_entity *entity, - bool non_blocking_wait_rq, -- bool requeue) -+ bool requeue, bool expiration) - { - struct bfq_sched_data *sd; - -@@ -1307,7 +1313,8 @@ static void bfq_activate_requeue_entity(struct bfq_entity *entity, - RB_EMPTY_ROOT(&(sd->service_tree+1)->active) && - RB_EMPTY_ROOT(&(sd->service_tree+2)->active)); - -- if (!bfq_update_next_in_service(sd, entity) && !requeue) { -+ if (!bfq_update_next_in_service(sd, entity, expiration) && -+ !requeue) { - BUG_ON(!sd->next_in_service); - break; - } -@@ -1373,6 +1380,8 @@ static bool __bfq_deactivate_entity(struct bfq_entity *entity, - * bfq_deactivate_entity - deactivate an entity representing a bfq_queue. - * @entity: the entity to deactivate. 
- * @ins_into_idle_tree: true if the entity can be put into the idle tree -+ * @expiration: true if this function is being invoked in the expiration path -+ * of the in-service queue - */ - static void bfq_deactivate_entity(struct bfq_entity *entity, - bool ins_into_idle_tree, -@@ -1417,7 +1426,7 @@ static void bfq_deactivate_entity(struct bfq_entity *entity, - * then, since entity has just been - * deactivated, a new one must be found. - */ -- bfq_update_next_in_service(sd, NULL); -+ bfq_update_next_in_service(sd, NULL, expiration); - - if (sd->next_in_service || sd->in_service_entity) { - /* -@@ -1495,7 +1504,7 @@ static void bfq_deactivate_entity(struct bfq_entity *entity, - "invoking udpdate_next for this entity"); - } - #endif -- if (!bfq_update_next_in_service(sd, entity) && -+ if (!bfq_update_next_in_service(sd, entity, expiration) && - !expiration) - /* - * next_in_service unchanged or not causing -@@ -1524,7 +1533,7 @@ static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st) - if (bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, - "calc_vtime_jump: new value %llu", -- root_entity->min_start); -+ ((root_entity->min_start>>10)*1000)>>12); - #ifdef BFQ_GROUP_IOSCHED_ENABLED - else { - struct bfq_group *bfqg = -@@ -1533,7 +1542,7 @@ static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st) - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, - "calc_vtime_jump: new value %llu", -- root_entity->min_start); -+ ((root_entity->min_start>>10)*1000)>>12); - } - #endif - return root_entity->min_start; -@@ -1615,17 +1624,9 @@ static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st, - * 3) is idle. 
- */ - static struct bfq_entity * --__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service --#if 0 -- , bool force --#endif -- ) -+__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service) - { -- struct bfq_entity *entity --#if 0 -- , *new_next_in_service = NULL --#endif -- ; -+ struct bfq_entity *entity; - u64 new_vtime; - struct bfq_queue *bfqq; - -@@ -1667,8 +1668,9 @@ __bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -- "__lookup_next: start %llu vtime %llu st %p", -+ "__lookup_next: start %llu vtime %llu (%llu) st %p", - ((entity->start>>10)*1000)>>12, -+ ((st->vtime>>10)*1000)>>12, - ((new_vtime>>10)*1000)>>12, st); - } - #endif -@@ -1681,12 +1683,14 @@ __bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service - /** - * bfq_lookup_next_entity - return the first eligible entity in @sd. - * @sd: the sched_data. -+ * @expiration: true if we are on the expiration path of the in-service queue - * - * This function is invoked when there has been a change in the trees -- * for sd, and we need know what is the new next entity after this -- * change. -+ * for sd, and we need to know what is the new next entity to serve -+ * after this change. - */ --static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd) -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, -+ bool expiration) - { - struct bfq_service_tree *st = sd->service_tree; - struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1); -@@ -1716,8 +1720,24 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd) - * class, unless the idle class needs to be served. - */ - for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) { -+ /* -+ * If expiration is true, then bfq_lookup_next_entity -+ * is being invoked as a part of the expiration path -+ * of the in-service queue. 
In this case, even if -+ * sd->in_service_entity is not NULL, -+ * sd->in_service_entiy at this point is actually not -+ * in service any more, and, if needed, has already -+ * been properly queued or requeued into the right -+ * tree. The reason why sd->in_service_entity is still -+ * not NULL here, even if expiration is true, is that -+ * sd->in_service_entiy is reset as a last step in the -+ * expiration path. So, if expiration is true, tell -+ * __bfq_lookup_next_entity that there is no -+ * sd->in_service_entity. -+ */ - entity = __bfq_lookup_next_entity(st + class_idx, -- sd->in_service_entity); -+ sd->in_service_entity && -+ !expiration); - - if (entity) - break; -@@ -1891,7 +1911,7 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) - for_each_entity(entity) { - struct bfq_sched_data *sd = entity->sched_data; - -- if(!bfq_update_next_in_service(sd, NULL)) -+ if (!bfq_update_next_in_service(sd, NULL, false)) - break; - } - -@@ -1951,16 +1971,17 @@ static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) - entity->on_st); - - bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq), -- false); -+ false, false); - bfq_clear_bfqq_non_blocking_wait_rq(bfqq); - } - --static void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+static void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ bool expiration) - { - struct bfq_entity *entity = &bfqq->entity; - - bfq_activate_requeue_entity(entity, false, -- bfqq == bfqd->in_service_queue); -+ bfqq == bfqd->in_service_queue, expiration); - } - - static void bfqg_stats_update_dequeue(struct bfq_group *bfqg); -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index e1960bf149d8..42393ab889a9 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -644,7 +644,7 @@ static void bfq_updated_next_req(struct bfq_data *bfqd, - entity->budget = new_budget; - bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", 
- new_budget); -- bfq_requeue_bfqq(bfqd, bfqq); -+ bfq_requeue_bfqq(bfqd, bfqq, false); - } - } - -@@ -2715,7 +2715,7 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) - - bfq_del_bfqq_busy(bfqd, bfqq, true); - } else { -- bfq_requeue_bfqq(bfqd, bfqq); -+ bfq_requeue_bfqq(bfqd, bfqq, true); - /* - * Resort priority tree of potential close cooperators. - */ - -From ee9f95b24e1d88ffba4845981c2a4684aefd0245 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Wed, 9 Aug 2017 22:53:00 +0200 -Subject: [PATCH 38/51] bfq-sq-mq: remove direct switch to an entity in higher - class - -If the function bfq_update_next_in_service is invoked as a consequence -of the activation or requeueing of an entity, say E, and finds out -that E belongs to a higher-priority class than that of the current -next-in-service entity, then it sets next_in_service directly to -E. But this may lead to anomalous schedules, because E may happen not -be eligible for service, because its virtual start time is higher than -the system virtual time for its service tree. - -This commit addresses this issue by simply removing this direct -switch. - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-sched.c | 19 +++++-------------- - 1 file changed, 5 insertions(+), 14 deletions(-) - -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index f3001af37256..b1a59088db88 100644 ---- a/block/bfq-sched.c -+++ b/block/bfq-sched.c -@@ -76,9 +76,8 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, - * or repositiong of an entity that does not coincide with - * sd->next_in_service, then a full lookup in the active tree - * can be avoided. 
In fact, it is enough to check whether the -- * just-modified entity has a higher priority than -- * sd->next_in_service, or, even if it has the same priority -- * as sd->next_in_service, is eligible and has a lower virtual -+ * just-modified entity has the same priority as -+ * sd->next_in_service, is eligible and has a lower virtual - * finish time than sd->next_in_service. If this compound - * condition holds, then the new entity becomes the new - * next_in_service. Otherwise no change is needed. -@@ -94,9 +93,8 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, - - /* - * If there is already a next_in_service candidate -- * entity, then compare class priorities or timestamps -- * to decide whether to replace sd->service_tree with -- * new_entity. -+ * entity, then compare timestamps to decide whether -+ * to replace sd->service_tree with new_entity. - */ - if (next_in_service) { - unsigned int new_entity_class_idx = -@@ -104,10 +102,6 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, - struct bfq_service_tree *st = - sd->service_tree + new_entity_class_idx; - -- /* -- * For efficiency, evaluate the most likely -- * sub-condition first. 
-- */ - replace_next = - (new_entity_class_idx == - bfq_class_idx(next_in_service) -@@ -115,10 +109,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, - !bfq_gt(new_entity->start, st->vtime) - && - bfq_gt(next_in_service->finish, -- new_entity->finish)) -- || -- new_entity_class_idx < -- bfq_class_idx(next_in_service); -+ new_entity->finish)); - } - - if (replace_next) - -From a3fdc5af40537355b68c1f0d3997c5a5fb54b9ce Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Thu, 10 Aug 2017 08:15:50 +0200 -Subject: [PATCH 39/51] bfq-sq-mq: guarantee update_next_in_service always - returns an eligible entity - -If the function bfq_update_next_in_service is invoked as a consequence -of the activation or requeueing of an entity, say E, then it doesn't -invoke bfq_lookup_next_entity to get the next-in-service entity. In -contrast, it follows a shorter path: if E happens to be eligible (see -commit "bfq-sq-mq: make lookup_next_entity push up vtime on -expirations" for details on eligibility) and to have a lower virtual -finish time than the current candidate as next-in-service entity, then -E directly becomes the next-in-service entity. Unfortunately, there is -a corner case for which this shorter path makes -bfq_update_next_in_service choose a non eligible entity: it occurs if -both E and the current next-in-service entity happen to be non -eligible when bfq_update_next_in_service is invoked. In this case, E -is not set as next-in-service, and, since bfq_lookup_next_entity is -not invoked, the state of the parent entity is not updated so as to -end up with an eligible entity as the proper next-in-service entity. 
- -In this respect, next-in-service is actually allowed to be non -eligible while some queue is in service: since no system-virtual-time -push-up can be performed in that case (see again commit "bfq-sq-mq: -make lookup_next_entity push up vtime on expirations" for details), -next-in-service is chosen, speculatively, as a function of the -possible value that the system virtual time may get after a push -up. But the correctness of the schedule breaks if next-in-service is -still a non eligible entity when it is time to set in service the next -entity. Unfortunately, this may happen in the above corner case. - -This commit fixes this problem by making bfq_update_next_in_service -invoke bfq_lookup_next_entity not only if the above shorter path -cannot be taken, but also if the shorter path is taken but fails to -yield an eligible next-in-service entity. - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-sched.c | 38 ++++++++++++++++++++++++++++---------- - 1 file changed, 28 insertions(+), 10 deletions(-) - -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index b1a59088db88..e4a2553a2d2c 100644 ---- a/block/bfq-sched.c -+++ b/block/bfq-sched.c -@@ -70,6 +70,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, - struct bfq_entity *next_in_service = sd->next_in_service; - struct bfq_queue *bfqq; - bool parent_sched_may_change = false; -+ bool change_without_lookup = false; - - /* - * If this update is triggered by the activation, requeueing -@@ -89,7 +90,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, - * set to true, and left as true if - * sd->next_in_service is NULL. 
- */ -- bool replace_next = true; -+ change_without_lookup = true; - - /* - * If there is already a next_in_service candidate -@@ -102,7 +103,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, - struct bfq_service_tree *st = - sd->service_tree + new_entity_class_idx; - -- replace_next = -+ change_without_lookup = - (new_entity_class_idx == - bfq_class_idx(next_in_service) - && -@@ -112,15 +113,32 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, - new_entity->finish)); - } - -- if (replace_next) -+ if (change_without_lookup) { - next_in_service = new_entity; -- } else /* invoked because of a deactivation: lookup needed */ -+ bfqq = bfq_entity_to_bfqq(next_in_service); -+ -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "update_next_in_service: chose without lookup"); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ else { -+ struct bfq_group *bfqg = -+ container_of(next_in_service, -+ struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data*)bfqg->bfqd, bfqg, -+ "update_next_in_service: chose without lookup"); -+ } -+#endif -+ } -+ } -+ -+ if (!change_without_lookup) /* lookup needed */ - next_in_service = bfq_lookup_next_entity(sd, expiration); - -- if (next_in_service) { -+ if (next_in_service) - parent_sched_may_change = !sd->next_in_service || - bfq_update_parent_budget(next_in_service); -- } - - sd->next_in_service = next_in_service; - -@@ -1053,7 +1071,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, - - if (bfqq) { - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "__activate_entity: new queue finish %llu", -+ "update_fin_time_enqueue: new queue finish %llu", - ((entity->finish>>10)*1000)>>12); - #ifdef BFQ_GROUP_IOSCHED_ENABLED - } else { -@@ -1061,7 +1079,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -- "__activate_entity: new group finish %llu", -+ "update_fin_time_enqueue: new group 
finish %llu", - ((entity->finish>>10)*1000)>>12); - #endif - } -@@ -1071,7 +1089,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, - - if (bfqq) { - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "__activate_entity: queue %seligible in st %p", -+ "update_fin_time_enqueue: queue %seligible in st %p", - entity->start <= st->vtime ? "" : "non ", st); - #ifdef BFQ_GROUP_IOSCHED_ENABLED - } else { -@@ -1079,7 +1097,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -- "__activate_entity: group %seligible in st %p", -+ "update_fin_time_enqueue: group %seligible in st %p", - entity->start <= st->vtime ? "" : "non ", st); - #endif - } - -From 6565e4d1aac029b6f0a5d86a4c6ef38608838eac Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Thu, 31 Aug 2017 19:24:26 +0200 -Subject: [PATCH 40/51] doc, block, bfq: fix some typos and stale sentences - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> -Reviewed-by: Jeremy Hickman <jeremywh7@gmail.com> -Reviewed-by: Laurentiu Nicola <lnicola@dend.ro> ---- - Documentation/block/bfq-iosched.txt | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt -index 0e59f1c9d30e..dcfe15523da3 100644 ---- a/Documentation/block/bfq-iosched.txt -+++ b/Documentation/block/bfq-iosched.txt -@@ -17,7 +17,7 @@ instances of BFQ are available (otherwise only the first instance): - - bfq-mq: development version of BFQ for blk-mq; this version contains - also all latest features and fixes not yet landed in mainline, plus many - safety checks --- bfq: BFQ for legacy blk; also this version contains latest features -+- bfq-sq: BFQ for legacy blk; also this version contains latest features - and fixes, as well as safety checks - - In its default configuration, BFQ privileges latency over - -From 
261ee8cc9f43e03d790a07184f0bcaa504ee6737 Mon Sep 17 00:00:00 2001 -From: Luca Miccio <lucmiccio@gmail.com> -Date: Wed, 13 Sep 2017 12:03:56 +0200 -Subject: [PATCH 41/51] bfq-mq, bfq-sq: Disable writeback throttling - -Similarly to CFQ, BFQ has its write-throttling heuristics, and it -is better not to combine them with further write-throttling -heuristics of a different nature. -So this commit disables write-back throttling for a device if BFQ -is used as I/O scheduler for that device. - -Signed-off-by: Luca Miccio <lucmiccio@gmail.com> -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> -Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name> ---- - block/bfq-mq-iosched.c | 2 ++ - block/bfq-sq-iosched.c | 7 +++++++ - 2 files changed, 9 insertions(+) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index b5c848650375..7d27d5b3befb 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -89,6 +89,7 @@ - #include "blk-mq-tag.h" - #include "blk-mq-sched.h" - #include "bfq-mq.h" -+#include "blk-wbt.h" - - /* Expiration time of sync (0) and async (1) requests, in ns. */ - static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; -@@ -5260,6 +5261,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - bfq_init_root_group(bfqd->root_group, bfqd); - bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); - -+ wbt_disable_default(q); - return 0; - - out_free: -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index 42393ab889a9..6fdc3b1d5bb8 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -83,6 +83,7 @@ - #include <linux/ioprio.h> - #include "blk.h" - #include "bfq.h" -+#include "blk-wbt.h" - - /* Expiration time of sync (0) and async (1) requests, in ns. 
*/ - static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; -@@ -4976,6 +4977,11 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - return -ENOMEM; - } - -+static void bfq_registered_queue(struct request_queue *q) -+{ -+ wbt_disable_default(q); -+} -+ - static void bfq_slab_kill(void) - { - kmem_cache_destroy(bfq_pool); -@@ -5285,6 +5291,7 @@ static struct elevator_type iosched_bfq = { - .elevator_may_queue_fn = bfq_may_queue, - .elevator_init_fn = bfq_init_queue, - .elevator_exit_fn = bfq_exit_queue, -+ .elevator_registered_fn = bfq_registered_queue, - }, - .icq_size = sizeof(struct bfq_io_cq), - .icq_align = __alignof__(struct bfq_io_cq), - -From 40ea0aed088791da27fcfa51f3b64d1f96b0d06e Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Tue, 12 Sep 2017 16:45:53 +0200 -Subject: [PATCH 42/51] bfq-mq, bfq-sq: fix wrong init of saved start time for - weight raising - -This commit fixes a bug that causes bfq to fail to guarantee a high -responsiveness on some drives, if there is heavy random read+write I/O -in the background. More precisely, such a failure allowed this bug to -be found [1], but the bug may well cause other yet unreported -anomalies. - -BFQ raises the weight of the bfq_queues associated with soft real-time -applications, to privilege the I/O, and thus reduce latency, for these -applications. This mechanism is named soft-real-time weight raising in -BFQ. A soft real-time period may happen to be nested into an -interactive weight raising period, i.e., it may happen that, when a -bfq_queue switches to a soft real-time weight-raised state, the -bfq_queue is already being weight-raised because deemed interactive -too. In this case, BFQ saves in a special variable -wr_start_at_switch_to_srt, the time instant when the interactive -weight-raising period started for the bfq_queue, i.e., the time -instant when BFQ started to deem the bfq_queue interactive. 
This value -is then used to check whether the interactive weight-raising period -would still be in progress when the soft real-time weight-raising -period ends. If so, interactive weight raising is restored for the -bfq_queue. This restore is useful, in particular, because it prevents -bfq_queues from losing their interactive weight raising prematurely, -as a consequence of spurious, short-lived soft real-time -weight-raising periods caused by wrong detections as soft real-time. - -If, instead, a bfq_queue switches to soft-real-time weight raising -while it *is not* already in an interactive weight-raising period, -then the variable wr_start_at_switch_to_srt has no meaning during the -following soft real-time weight-raising period. Unfortunately the -handling of this case is wrong in BFQ: not only the variable is not -flagged somehow as meaningless, but it is also set to the time when -the switch to soft real-time weight-raising occurs. This may cause an -interactive weight-raising period to be considered mistakenly as still -in progress, and thus a spurious interactive weight-raising period to -start for the bfq_queue, at the end of the soft-real-time -weight-raising period. In particular the spurious interactive -weight-raising period will be considered as still in progress, if the -soft-real-time weight-raising period does not last very long. The -bfq_queue will then be wrongly privileged and, if I/O bound, will -unjustly steal bandwidth to truly interactive or soft real-time -bfq_queues, harming responsiveness and low latency. - -This commit fixes this issue by just setting wr_start_at_switch_to_srt -to minus infinity (farthest past time instant according to jiffies -macros): when the soft-real-time weight-raising period ends, certainly -no interactive weight-raising period will be considered as still in -progress. 
- -[1] Background I/O Type: Random - Background I/O mix: Reads and writes -- Application to start: LibreOffice Writer in -http://www.phoronix.com/scan.php?page=news_item&px=Linux-4.13-IO-Laptop - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> -Signed-off-by: Angelo Ruocco <angeloruocco90@gmail.com> -Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name> -Tested-by: Lee Tibbert <lee.tibbert@gmail.com> -Tested-by: Mirko Montanari <mirkomontanari91@gmail.com> ---- - block/bfq-mq-iosched.c | 50 +++++++++++++++++++++++++++++++------------------- - block/bfq-sq-iosched.c | 50 +++++++++++++++++++++++++++++++------------------- - 2 files changed, 62 insertions(+), 38 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 7d27d5b3befb..f378519b6d33 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -1204,6 +1204,24 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, - return wr_or_deserves_wr; - } - -+/* -+ * Return the farthest future time instant according to jiffies -+ * macros. -+ */ -+static unsigned long bfq_greatest_from_now(void) -+{ -+ return jiffies + MAX_JIFFY_OFFSET; -+} -+ -+/* -+ * Return the farthest past time instant according to jiffies -+ * macros. 
-+ */ -+static unsigned long bfq_smallest_from_now(void) -+{ -+ return jiffies - MAX_JIFFY_OFFSET; -+} -+ - static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - unsigned int old_wr_coeff, -@@ -1218,7 +1236,19 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, - bfqq->wr_coeff = bfqd->bfq_wr_coeff; - bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); - } else { -- bfqq->wr_start_at_switch_to_srt = jiffies; -+ /* -+ * No interactive weight raising in progress -+ * here: assign minus infinity to -+ * wr_start_at_switch_to_srt, to make sure -+ * that, at the end of the soft-real-time -+ * weight raising periods that is starting -+ * now, no interactive weight-raising period -+ * may be wrongly considered as still in -+ * progress (and thus actually started by -+ * mistake). -+ */ -+ bfqq->wr_start_at_switch_to_srt = -+ bfq_smallest_from_now(); - bfqq->wr_coeff = bfqd->bfq_wr_coeff * - BFQ_SOFTRT_WEIGHT_FACTOR; - bfqq->wr_cur_max_time = -@@ -3174,24 +3204,6 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, - jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); - } - --/* -- * Return the farthest future time instant according to jiffies -- * macros. -- */ --static unsigned long bfq_greatest_from_now(void) --{ -- return jiffies + MAX_JIFFY_OFFSET; --} -- --/* -- * Return the farthest past time instant according to jiffies -- * macros. -- */ --static unsigned long bfq_smallest_from_now(void) --{ -- return jiffies - MAX_JIFFY_OFFSET; --} -- - /** - * bfq_bfqq_expire - expire a queue. - * @bfqd: device owning the queue. 
-diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index 6fdc3b1d5bb8..f4654436cd55 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -1165,6 +1165,24 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, - return wr_or_deserves_wr; - } - -+/* -+ * Return the farthest future time instant according to jiffies -+ * macros. -+ */ -+static unsigned long bfq_greatest_from_now(void) -+{ -+ return jiffies + MAX_JIFFY_OFFSET; -+} -+ -+/* -+ * Return the farthest past time instant according to jiffies -+ * macros. -+ */ -+static unsigned long bfq_smallest_from_now(void) -+{ -+ return jiffies - MAX_JIFFY_OFFSET; -+} -+ - static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - unsigned int old_wr_coeff, -@@ -1179,7 +1197,19 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, - bfqq->wr_coeff = bfqd->bfq_wr_coeff; - bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); - } else { -- bfqq->wr_start_at_switch_to_srt = jiffies; -+ /* -+ * No interactive weight raising in progress -+ * here: assign minus infinity to -+ * wr_start_at_switch_to_srt, to make sure -+ * that, at the end of the soft-real-time -+ * weight raising periods that is starting -+ * now, no interactive weight-raising period -+ * may be wrongly considered as still in -+ * progress (and thus actually started by -+ * mistake). -+ */ -+ bfqq->wr_start_at_switch_to_srt = -+ bfq_smallest_from_now(); - bfqq->wr_coeff = bfqd->bfq_wr_coeff * - BFQ_SOFTRT_WEIGHT_FACTOR; - bfqq->wr_cur_max_time = -@@ -3067,24 +3097,6 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, - jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); - } - --/* -- * Return the farthest future time instant according to jiffies -- * macros. 
-- */ --static unsigned long bfq_greatest_from_now(void) --{ -- return jiffies + MAX_JIFFY_OFFSET; --} -- --/* -- * Return the farthest past time instant according to jiffies -- * macros. -- */ --static unsigned long bfq_smallest_from_now(void) --{ -- return jiffies - MAX_JIFFY_OFFSET; --} -- - /** - * bfq_bfqq_expire - expire a queue. - * @bfqd: device owning the queue. - -From 9dbea44b6f721baeff35b9fdf628ec55fe00e09d Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Thu, 14 Sep 2017 05:12:58 -0400 -Subject: [PATCH 43/51] Fix commit "Unnest request-queue and ioc locks from - scheduler locks" - -The commit "Unnest request-queue and ioc locks from scheduler locks" -mistakenly removed the setting of the split flag in function -bfq_prepare_request. This commit puts this missing instruction back in -its place. - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 12 ++++++++++++ - 1 file changed, 12 insertions(+) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index f378519b6d33..288078e68a2a 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -744,6 +744,12 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, - bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time; - BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); - -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "[%s] bic %p wr_coeff %d start_finish %lu max_time %lu", -+ __func__, -+ bic, bfqq->wr_coeff, bfqq->last_wr_start_finish, -+ bfqq->wr_cur_max_time); -+ - if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || - time_is_before_jiffies(bfqq->last_wr_start_finish + - bfqq->wr_cur_max_time))) { -@@ -2208,6 +2214,11 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) - bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; - bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; - BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "[%s] 
bic %p wr_coeff %d start_finish %lu max_time %lu", -+ __func__, -+ bic, bfqq->wr_coeff, bfqq->last_wr_start_finish, -+ bfqq->wr_cur_max_time); - } - - static void -@@ -4950,6 +4961,7 @@ static void bfq_prepare_request(struct request *rq, struct bio *bio) - bic->saved_in_large_burst = true; - - bfqq = bfq_split_bfqq(bic, bfqq); -+ split = true; - - if (!bfqq) - bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, - -From d4ebb2a66a23dc183792088c521f2be2193b56db Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Fri, 15 Sep 2017 01:53:51 -0400 -Subject: [PATCH 44/51] bfq-sq, bfq-mq: check and switch back to interactive wr - also on queue split - -As already explained in the message of commit "bfq-mq, bfq-sq: fix -wrong init of saved start time for weight raising", if a soft -real-time weight-raising period happens to be nested in a larger -interactive weight-raising period, then BFQ restores the interactive -weight raising at the end of the soft real-time weight raising. In -particular, BFQ checks whether the latter has ended only on request -dispatches. - -Unfortunately, the above scheme fails to restore interactive weight -raising in the following corner case: if a bfq_queue, say Q, -1) Is merged with another bfq_queue while it is in a nested soft -real-time weight-raising period. The weight-raising state of Q is -then saved, and not considered any longer until a split occurs. -2) Is split from the other bfq_queue(s) at a time instant when its -soft real-time weight raising is already finished. -On the split, while resuming the previous, soft real-time -weight-raised state of the bfq_queue Q, BFQ checks whether the -current soft real-time weight-raising period is actually over. If so, -BFQ switches weight raising off for Q, *without* checking whether the -soft real-time period was actually nested in a non-yet-finished -interactive weight-raising period. 
- -This commit addresses this issue by adding the above missing check in -bfq_queue splits, and restoring interactive weight raising if needed. - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> -Tested-by: Angelo Ruocco <angeloruocco90@gmail.com> -Tested-by: Mirko Montanari <mirkomontanari91@gmail.com> ---- - block/bfq-mq-iosched.c | 29 +++++++++++++++++++++-------- - block/bfq-sq-iosched.c | 35 +++++++++++++++++++++++++++-------- - 2 files changed, 48 insertions(+), 16 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 288078e68a2a..6130a95c6497 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -716,6 +716,15 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd) - return dur; - } - -+/* switch back from soft real-time to interactive weight raising */ -+static void switch_back_to_interactive_wr(struct bfq_queue *bfqq, -+ struct bfq_data *bfqd) -+{ -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ bfqq->last_wr_start_finish = bfqq->wr_start_at_switch_to_srt; -+} -+ - static void - bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, - struct bfq_io_cq *bic, bool bfq_already_existing) -@@ -753,12 +762,20 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, - if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || - time_is_before_jiffies(bfqq->last_wr_start_finish + - bfqq->wr_cur_max_time))) { -- bfq_log_bfqq(bfqq->bfqd, bfqq, -+ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && -+ !bfq_bfqq_in_large_burst(bfqq) && -+ time_is_after_eq_jiffies(bfqq->wr_start_at_switch_to_srt + -+ bfq_wr_duration(bfqd))) { -+ switch_back_to_interactive_wr(bfqq, bfqd); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "resume state: switching back to interactive"); -+ } else { -+ bfqq->wr_coeff = 1; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, - "resume state: switching off wr (%lu + %lu < %lu)", - bfqq->last_wr_start_finish, bfqq->wr_cur_max_time, - 
jiffies); -- -- bfqq->wr_coeff = 1; -+ } - } - - /* make sure weight will be updated, however we got here */ -@@ -3820,11 +3837,7 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) - bfq_wr_duration(bfqd))) - bfq_bfqq_end_wr(bfqq); - else { -- /* switch back to interactive wr */ -- bfqq->wr_coeff = bfqd->bfq_wr_coeff; -- bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -- bfqq->last_wr_start_finish = -- bfqq->wr_start_at_switch_to_srt; -+ switch_back_to_interactive_wr(bfqq, bfqd); - BUG_ON(time_is_after_jiffies( - bfqq->last_wr_start_finish)); - bfqq->entity.prio_changed = 1; -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index f4654436cd55..e07d5d1c0d40 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -678,6 +678,15 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd) - return dur; - } - -+/* switch back from soft real-time to interactive weight raising */ -+static void switch_back_to_interactive_wr(struct bfq_queue *bfqq, -+ struct bfq_data *bfqd) -+{ -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ bfqq->last_wr_start_finish = bfqq->wr_start_at_switch_to_srt; -+} -+ - static void - bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, - struct bfq_io_cq *bic, bool bfq_already_existing) -@@ -705,15 +714,29 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, - bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time; - BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); - -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "[%s] bic %p wr_coeff %d start_finish %lu max_time %lu", -+ __func__, -+ bic, bfqq->wr_coeff, bfqq->last_wr_start_finish, -+ bfqq->wr_cur_max_time); -+ - if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || - time_is_before_jiffies(bfqq->last_wr_start_finish + - bfqq->wr_cur_max_time))) { -- bfq_log_bfqq(bfqq->bfqd, bfqq, -+ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && -+ 
!bfq_bfqq_in_large_burst(bfqq) && -+ time_is_after_eq_jiffies(bfqq->wr_start_at_switch_to_srt + -+ bfq_wr_duration(bfqd))) { -+ switch_back_to_interactive_wr(bfqq, bfqd); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "resume state: switching back to interactive"); -+ } else { -+ bfqq->wr_coeff = 1; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, - "resume state: switching off wr (%lu + %lu < %lu)", - bfqq->last_wr_start_finish, bfqq->wr_cur_max_time, - jiffies); -- -- bfqq->wr_coeff = 1; -+ } - } - - /* make sure weight will be updated, however we got here */ -@@ -3703,11 +3726,7 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) - bfq_wr_duration(bfqd))) - bfq_bfqq_end_wr(bfqq); - else { -- /* switch back to interactive wr */ -- bfqq->wr_coeff = bfqd->bfq_wr_coeff; -- bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -- bfqq->last_wr_start_finish = -- bfqq->wr_start_at_switch_to_srt; -+ switch_back_to_interactive_wr(bfqq, bfqd); - BUG_ON(time_is_after_jiffies( - bfqq->last_wr_start_finish)); - bfqq->entity.prio_changed = 1; - -From 9eaec0c3a2d675763b09da81c9117a9c43bce942 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Fri, 15 Sep 2017 04:58:33 -0400 -Subject: [PATCH 45/51] bfq-sq, bfq-mq: let early-merged queues be - weight-raised on split too - -A just-created bfq_queue, say Q, may happen to be merged with another -bfq_queue on the very first invocation of the function -__bfq_insert_request. In such a case, even if Q would clearly deserve -interactive weight raising (as it has just been created), the function -bfq_add_request does not make it to be invoked for Q, and thus to -activate weight raising for Q. As a consequence, when the state of Q -is saved for a possible future restore, after a split of Q from the -other bfq_queue(s), such a state happens to be (unjustly) -non-weight-raised. 
Then the bfq_queue will not enjoy any weight -raising on the split, even if should still be in an interactive -weight-raising period when the split occurs. - -This commit solves this problem as follows, for a just-created -bfq_queue that is being early-merged: it stores directly, in the saved -state of the bfq_queue, the weight-raising state that would have been -assigned to the bfq_queue if not early-merged. - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> -Tested-by: Angelo Ruocco <angeloruocco90@gmail.com> -Tested-by: Mirko Montanari <mirkomontanari91@gmail.com> ---- - block/bfq-mq-iosched.c | 28 +++++++++++++++++++++++----- - block/bfq-sq-iosched.c | 28 +++++++++++++++++++++++----- - 2 files changed, 46 insertions(+), 10 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 6130a95c6497..af84e506e897 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -2226,10 +2226,27 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) - bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); - bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); - bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); -- bic->saved_wr_coeff = bfqq->wr_coeff; -- bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; -- bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; -- bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; -+ if (unlikely(bfq_bfqq_just_created(bfqq) && -+ !bfq_bfqq_in_large_burst(bfqq))) { -+ /* -+ * bfqq being merged ritgh after being created: bfqq -+ * would have deserved interactive weight raising, but -+ * did not make it to be set in a weight-raised state, -+ * because of this early merge. Store directly the -+ * weight-raising state that would have been assigned -+ * to bfqq, so that to avoid that bfqq unjustly fails -+ * to enjoy weight raising if split soon. 
-+ */ -+ bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff; -+ bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd); -+ bic->saved_last_wr_start_finish = jiffies; -+ } else { -+ bic->saved_wr_coeff = bfqq->wr_coeff; -+ bic->saved_wr_start_at_switch_to_srt = -+ bfqq->wr_start_at_switch_to_srt; -+ bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; -+ bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; -+ } - BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); - bfq_log_bfqq(bfqq->bfqd, bfqq, - "[%s] bic %p wr_coeff %d start_finish %lu max_time %lu", -@@ -4560,7 +4577,6 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) - bfqq->allocated); - - new_bfqq->ref++; -- bfq_clear_bfqq_just_created(bfqq); - /* - * If the bic associated with the process - * issuing this request still points to bfqq -@@ -4572,6 +4588,8 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) - if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) - bfq_merge_bfqqs(bfqd, RQ_BIC(rq), - bfqq, new_bfqq); -+ -+ bfq_clear_bfqq_just_created(bfqq); - /* - * rq is about to be enqueued into new_bfqq, - * release rq reference on bfqq -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index e07d5d1c0d40..0c48f527fe3f 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -2105,10 +2105,27 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) - bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); - bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); - bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); -- bic->saved_wr_coeff = bfqq->wr_coeff; -- bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; -- bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; -- bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; -+ if (unlikely(bfq_bfqq_just_created(bfqq) && -+ !bfq_bfqq_in_large_burst(bfqq))) { -+ /* -+ * bfqq being merged ritgh after being created: bfqq -+ * would have deserved 
interactive weight raising, but -+ * did not make it to be set in a weight-raised state, -+ * because of this early merge. Store directly the -+ * weight-raising state that would have been assigned -+ * to bfqq, so that to avoid that bfqq unjustly fails -+ * to enjoy weight raising if split soon. -+ */ -+ bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff; -+ bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd); -+ bic->saved_last_wr_start_finish = jiffies; -+ } else { -+ bic->saved_wr_coeff = bfqq->wr_coeff; -+ bic->saved_wr_start_at_switch_to_srt = -+ bfqq->wr_start_at_switch_to_srt; -+ bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; -+ bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; -+ } - BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); - } - -@@ -4383,10 +4400,11 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq) - new_bfqq->allocated[rq_data_dir(rq)]++; - bfqq->allocated[rq_data_dir(rq)]--; - new_bfqq->ref++; -- bfq_clear_bfqq_just_created(bfqq); - if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) - bfq_merge_bfqqs(bfqd, RQ_BIC(rq), - bfqq, new_bfqq); -+ -+ bfq_clear_bfqq_just_created(bfqq); - /* - * rq is about to be enqueued into new_bfqq, - * release rq reference on bfqq - -From cb05150675095cb97ab22e4955eb82e4fe2e9dbe Mon Sep 17 00:00:00 2001 -From: omcira <omcira@gmail.com> -Date: Mon, 18 Sep 2017 10:49:48 +0200 -Subject: [PATCH 46/51] bfq-sq, bfq-mq: decrease burst size when queues in - burst exit - -If many queues belonging to the same group happen to be created -shortly after each other, then the concurrent processes associated -with these queues have typically a common goal, and they get it done -as soon as possible if not hampered by device idling. Examples are -processes spawned by git grep, or by systemd during boot. As for -device idling, this mechanism is currently necessary for weight -raising to succeed in its goal: privileging I/O. 
In view of these -facts, BFQ does not provide the above queues with either weight -raising or device idling. - -On the other hand, a burst of queue creations may be caused also by -the start-up of a complex application. In this case, these queues need -usually to be served one after the other, and as quickly as possible, -to maximise responsiveness. Therefore, in this case the best strategy -is to weight-raise all the queues created during the burst, i.e., the -exact opposite of the strategy for the above case. - -To distinguish between the two cases, BFQ uses an empirical burst-size -threshold, found through extensive tests and monitoring of daily -usage. Only large bursts, i.e., bursts with a size above this -threshold, are considered as generated by a high number of parallel -processes. In this respect, upstart-based boot proved to be rather -hard to detect as generating a large burst of queue creations, because -with upstart most of the queues created in a burst exit *before* the -next queues in the same burst are created. To address this issue, I -changed the burst-detection mechanism so as to not decrease the size -of the current burst even if one of the queues in the burst is -eliminated. - -Unfortunately, this missing decrease causes false positives on very -fast systems: on the start-up of a complex application, such as -libreoffice writer, so many queues are created, served and exited -shortly after each other, that a large burst of queue creations is -wrongly detected as occurring. These false positives just disappear if -the size of a burst is decreased when one of the queues in the burst -exits. This commit restores the missing burst-size decrease, relying -on the fact that upstart is apparently unlikely to be used on systems -running this and future versions of the kernel.
- -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> -Signed-off-by: Mauro Andreolini <mauro.andreolini@unimore.it> -Signed-off-by: Angelo Ruocco <angeloruocco90@gmail.com> -Tested-by: Mirko Montanari <mirkomontanari91@gmail.com> ---- - block/bfq-mq-iosched.c | 12 +++--------- - block/bfq-sq-iosched.c | 12 +++--------- - 2 files changed, 6 insertions(+), 18 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index af84e506e897..6e413d7236ce 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -4111,16 +4111,10 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - BUG_ON(bfqq->entity.tree); - BUG_ON(bfq_bfqq_busy(bfqq)); - -- if (bfq_bfqq_sync(bfqq)) -- /* -- * The fact that this queue is being destroyed does not -- * invalidate the fact that this queue may have been -- * activated during the current burst. As a consequence, -- * although the queue does not exist anymore, and hence -- * needs to be removed from the burst list if there, -- * the burst size has not to be decremented. -- */ -+ if (bfq_bfqq_sync(bfqq) && !hlist_unhashed(&bfqq->burst_list_node)) { - hlist_del_init(&bfqq->burst_list_node); -+ bfqq->bfqd->burst_size--; -+ } - - if (bfqq->bfqd) - bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index 0c48f527fe3f..93034dd7b801 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -3945,16 +3945,10 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - BUG_ON(bfqq->entity.tree); - BUG_ON(bfq_bfqq_busy(bfqq)); - -- if (bfq_bfqq_sync(bfqq)) -- /* -- * The fact that this queue is being destroyed does not -- * invalidate the fact that this queue may have been -- * activated during the current burst. As a consequence, -- * although the queue does not exist anymore, and hence -- * needs to be removed from the burst list if there, -- * the burst size has not to be decremented. 
-- */ -+ if (bfq_bfqq_sync(bfqq) && !hlist_unhashed(&bfqq->burst_list_node)) { - hlist_del_init(&bfqq->burst_list_node); -+ bfqq->bfqd->burst_size--; -+ } - - bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); - - -From 60de7307d5e3ed7f272f12c900f631bdfe114db2 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Fri, 6 Oct 2017 19:35:38 +0200 -Subject: [PATCH 47/51] bfq-sq, bfq-mq: fix unbalanced decrements of burst size -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -The commit "bfq-sq, bfq-mq: decrease burst size when queues in burst -exit" introduced the decrement of burst_size on the removal of a -bfq_queue from the burst list. Unfortunately, this decrement can -happen to be performed even when burst size is already equal to 0, -because of unbalanced decrements. A description follows of the cause -of these unbalanced decrements, namely a wrong assumption, and of the -way how this wrong assumption leads to unbalanced decrements. - -The wrong assumption is that a bfq_queue can exit only if the process -associated with the bfq_queue has exited. This is false, because a -bfq_queue, say Q, may exit also as a consequence of a merge with -another bfq_queue. In this case, Q exits because the I/O of its -associated process has been redirected to another bfq_queue. - -The decrement unbalance occurs because Q may then be re-created after -a split, and added back to the current burst list, *without* -incrementing burst_size. burst_size is not incremented because Q is -not a new bfq_queue added to the burst list, but a bfq_queue only -temporarily removed from the list, and, before the commit "bfq-sq, -bfq-mq: decrease burst size when queues in burst exit", burst_size was -not decremented when Q was removed. - -This commit addresses this issue by just checking whether the exiting -bfq_queue is a merged bfq_queue, and, in that case, not decrementing -burst_size. 
Unfortunately, this still leaves room for unbalanced -decrements, in the following rarer case: on a split, the bfq_queue -happens to be inserted into a different burst list than that it was -removed from when merged. If this happens, the number of elements in -the new burst list becomes higher than burst_size (by one). When the -bfq_queue then exits, it is of course not in a merged state any -longer, thus burst_size is decremented, which results in an unbalanced -decrement. To handle this sporadic, unlucky case in a simple way, -this commit also checks that burst_size is larger than 0 before -decrementing it. - -Finally, this commit removes an useless, extra check: the check that -the bfq_queue is sync, performed before checking whether the bfq_queue -is in the burst list. This extra check is redundant, because only sync -bfq_queues can be inserted into the burst list. - -Reported-by: Philip Müller <philm@manjaro.org> -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> -Signed-off-by: Angelo Ruocco <angeloruocco90@gmail.com> -Tested-by: Philip Müller <philm@manjaro.org> -Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name> -Tested-by: Lee Tibbert <lee.tibbert@gmail.com> ---- - block/bfq-mq-iosched.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++-- - block/bfq-sq-iosched.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++-- - 2 files changed, 114 insertions(+), 4 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 6e413d7236ce..816bac6cdd3d 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -4111,9 +4111,36 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - BUG_ON(bfqq->entity.tree); - BUG_ON(bfq_bfqq_busy(bfqq)); - -- if (bfq_bfqq_sync(bfqq) && !hlist_unhashed(&bfqq->burst_list_node)) { -+ if (!hlist_unhashed(&bfqq->burst_list_node)) { - hlist_del_init(&bfqq->burst_list_node); -- bfqq->bfqd->burst_size--; -+ /* -+ * Decrement also burst size after the removal, if the -+ * process 
associated with bfqq is exiting, and thus -+ * does not contribute to the burst any longer. This -+ * decrement helps filter out false positives of large -+ * bursts, when some short-lived process (often due to -+ * the execution of commands by some service) happens -+ * to start and exit while a complex application is -+ * starting, and thus spawning several processes that -+ * do I/O (and that *must not* be treated as a large -+ * burst, see comments on bfq_handle_burst). -+ * -+ * In particular, the decrement is performed only if: -+ * 1) bfqq is not a merged queue, because, if it is, -+ * then this free of bfqq is not triggered by the exit -+ * of the process bfqq is associated with, but exactly -+ * by the fact that bfqq has just been merged. -+ * 2) burst_size is greater than 0, to handle -+ * unbalanced decrements. Unbalanced decrements may -+ * happen in te following case: bfqq is inserted into -+ * the current burst list--without incrementing -+ * bust_size--because of a split, but the current -+ * burst list is not the burst list bfqq belonged to -+ * (see comments on the case of a split in -+ * bfq_set_request). -+ */ -+ if (bfqq->bic && bfqq->bfqd->burst_size > 0) -+ bfqq->bfqd->burst_size--; - } - - if (bfqq->bfqd) -@@ -4940,6 +4967,34 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, - "large burst"); - bfq_clear_bfqq_in_large_burst(bfqq); - if (bic->was_in_burst_list) -+ /* -+ * If bfqq was in the current -+ * burst list before being -+ * merged, then we have to add -+ * it back. And we do not need -+ * to increase burst_size, as -+ * we did not decrement -+ * burst_size when we removed -+ * bfqq from the burst list as -+ * a consequence of a merge -+ * (see comments in -+ * bfq_put_queue). In this -+ * respect, it would be rather -+ * costly to know whether the -+ * current burst list is still -+ * the same burst list from -+ * which bfqq was removed on -+ * the merge. 
To avoid this -+ * cost, if bfqq was in a -+ * burst list, then we add -+ * bfqq to the current burst -+ * list without any further -+ * check. This can cause -+ * inappropriate insertions, -+ * but rarely enough to not -+ * harm the detection of large -+ * bursts significantly. -+ */ - hlist_add_head(&bfqq->burst_list_node, - &bfqd->burst_list); - } -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index 93034dd7b801..4bbd7f4c0154 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -3945,9 +3945,36 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - BUG_ON(bfqq->entity.tree); - BUG_ON(bfq_bfqq_busy(bfqq)); - -- if (bfq_bfqq_sync(bfqq) && !hlist_unhashed(&bfqq->burst_list_node)) { -+ if (!hlist_unhashed(&bfqq->burst_list_node)) { - hlist_del_init(&bfqq->burst_list_node); -- bfqq->bfqd->burst_size--; -+ /* -+ * Decrement also burst size after the removal, if the -+ * process associated with bfqq is exiting, and thus -+ * does not contribute to the burst any longer. This -+ * decrement helps filter out false positives of large -+ * bursts, when some short-lived process (often due to -+ * the execution of commands by some service) happens -+ * to start and exit while a complex application is -+ * starting, and thus spawning several processes that -+ * do I/O (and that *must not* be treated as a large -+ * burst, see comments on bfq_handle_burst). -+ * -+ * In particular, the decrement is performed only if: -+ * 1) bfqq is not a merged queue, because, if it is, -+ * then this free of bfqq is not triggered by the exit -+ * of the process bfqq is associated with, but exactly -+ * by the fact that bfqq has just been merged. -+ * 2) burst_size is greater than 0, to handle -+ * unbalanced decrements. 
Unbalanced decrements may -+ * happen in te following case: bfqq is inserted into -+ * the current burst list--without incrementing -+ * bust_size--because of a split, but the current -+ * burst list is not the burst list bfqq belonged to -+ * (see comments on the case of a split in -+ * bfq_set_request). -+ */ -+ if (bfqq->bic && bfqq->bfqd->burst_size > 0) -+ bfqq->bfqd->burst_size--; - } - - bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); -@@ -4691,6 +4718,34 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, - "large burst"); - bfq_clear_bfqq_in_large_burst(bfqq); - if (bic->was_in_burst_list) -+ /* -+ * If bfqq was in the current -+ * burst list before being -+ * merged, then we have to add -+ * it back. And we do not need -+ * to increase burst_size, as -+ * we did not decrement -+ * burst_size when we removed -+ * bfqq from the burst list as -+ * a consequence of a merge -+ * (see comments in -+ * bfq_put_queue). In this -+ * respect, it would be rather -+ * costly to know whether the -+ * current burst list is still -+ * the same burst list from -+ * which bfqq was removed on -+ * the merge. To avoid this -+ * cost, if bfqq was in a -+ * burst list, then we add -+ * bfqq to the current burst -+ * list without any further -+ * check. This can cause -+ * inappropriate insertions, -+ * but rarely enough to not -+ * harm the detection of large -+ * bursts significantly. -+ */ - hlist_add_head(&bfqq->burst_list_node, - &bfqd->burst_list); - } - -From 09adbd0f46f4ba395964b35bf611b7cc3dd84b4d Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Mon, 30 Oct 2017 16:50:50 +0100 -Subject: [PATCH 48/51] doc, block, bfq-mq: update max IOPS sustainable with - BFQ - -We have investigated more deeply the performance of BFQ, in terms of -number of IOPS that can be processed by the CPU when BFQ is used as -I/O scheduler. 
In more detail, using the script [1], we have measured -the number of IOPS reached on top of a null block device configured -with zero latency, as a function of the workload (sequential read, -sequential write, random read, random write) and of the system (we -considered desktops, laptops and embedded systems). - -Basing on the resulting figures, with this commit we update the -current, conservative IOPS range reported in BFQ documentation. In -particular, the documentation now reports, for each of three different -systems, the lowest number of IOPS obtained for that system with the -above test (namely, the value obtained with the workload leading to -the lowest IOPS). - -[1] https://github.com/Algodev-github/IOSpeed - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> -Signed-off-by: Luca Miccio <lucmiccio@gmail.com> ---- - Documentation/block/bfq-iosched.txt | 19 +++++++++++++------ - 1 file changed, 13 insertions(+), 6 deletions(-) - -diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt -index dcfe15523da3..595ff7a5ff34 100644 ---- a/Documentation/block/bfq-iosched.txt -+++ b/Documentation/block/bfq-iosched.txt -@@ -29,12 +29,19 @@ for that device, by setting low_latency to 0. See Section 3 for - details on how to configure BFQ for the desired tradeoff between - latency and throughput, or on how to maximize throughput. - --On average CPUs, the current version of BFQ can handle devices --performing at most ~30K IOPS; at most ~50 KIOPS on faster CPUs. As a --reference, 30-50 KIOPS correspond to very high bandwidths with --sequential I/O (e.g., 8-12 GB/s if I/O requests are 256 KB large), and --to 120-200 MB/s with 4KB random I/O. BFQ is currently being tested on --multi-queue devices too. -+BFQ has a non-null overhead, which limits the maximum IOPS that the -+CPU can process for a device scheduled with BFQ. 
To give an idea of -+the limits on slow or average CPUs, here are BFQ limits for three -+different CPUs, on, respectively, an average laptop, an old desktop, -+and a cheap embedded system, in case full hierarchical support is -+enabled (i.e., CONFIG_BFQ_SQ_GROUP_IOSCHED is set for bfq-sq, or -+CONFIG_MQ_BFQ_GROUP_IOSCHED is set for bfq-mq, or, finally, -+CONFIG_BFQ_GROUP_IOSCHED is set for bfq): -+- Intel i7-4850HQ: 250 KIOPS -+- AMD A8-3850: 170 KIOPS -+- ARM CortexTM-A53 Octa-core: 45 KIOPS -+ -+BFQ works for multi-queue devices too (bfq and bfq-mq instances). - - The table of contents follows. Impatients can just jump to Section 3. - - -From be94f97b577dc587593185224a7718aa59ac43f7 Mon Sep 17 00:00:00 2001 -From: Luca Miccio <lucmiccio@gmail.com> -Date: Tue, 31 Oct 2017 09:50:11 +0100 -Subject: [PATCH 49/51] block, bfq-mq: add missing invocations of - bfqg_stats_update_io_add/remove - -bfqg_stats_update_io_add and bfqg_stats_update_io_remove are to be -invoked, respectively, when an I/O request enters and when an I/O -request exits the scheduler. Unfortunately, bfq-mq does not fully comply -with this scheme, because it does not invoke these functions for -requests that are inserted into or extracted from its priority -dispatch list. This commit fixes this mistake. 
- -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> -Signed-off-by: Luca Miccio <lucmiccio@gmail.com> ---- - block/bfq-mq-iosched.c | 24 +++++++++++++++++++----- - 1 file changed, 19 insertions(+), 5 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 816bac6cdd3d..fbf28804c220 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -1394,7 +1394,6 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, - BUG_ON(bfqq->entity.budget < bfqq->entity.service); - - BUG_ON(bfqq == bfqd->in_service_queue); -- bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, rq->cmd_flags); - - /* - * bfqq deserves to be weight-raised if: -@@ -1734,7 +1733,6 @@ static void bfq_remove_request(struct request_queue *q, - BUG_ON(bfqq->meta_pending == 0); - bfqq->meta_pending--; - } -- bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); - } - - static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) -@@ -1879,6 +1877,7 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, - bfqq->next_rq = rq; - - bfq_remove_request(q, next); -+ bfqg_stats_update_io_remove(bfqq_group(bfqq), next->cmd_flags); - - spin_unlock_irq(&bfqq->bfqd->lock); - end: -@@ -4077,6 +4076,10 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - spin_lock_irq(&bfqd->lock); - - rq = __bfq_dispatch_request(hctx); -+ if (rq && RQ_BFQQ(rq)) -+ bfqg_stats_update_io_remove(bfqq_group(RQ_BFQQ(rq)), -+ rq->cmd_flags); -+ - spin_unlock_irq(&bfqd->lock); - - return rq; -@@ -4634,6 +4637,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - { - struct request_queue *q = hctx->queue; - struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); - - spin_lock_irq(&bfqd->lock); - if (blk_mq_sched_try_insert_merge(q, rq)) { -@@ -4647,8 +4651,6 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - - 
spin_lock_irq(&bfqd->lock); - if (at_head || blk_rq_is_passthrough(rq)) { -- struct bfq_queue *bfqq = RQ_BFQQ(rq); -- - if (at_head) - list_add(&rq->queuelist, &bfqd->dispatch); - else -@@ -4668,6 +4670,12 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - rq->rq_flags &= ~RQF_GOT; - - __bfq_insert_request(bfqd, rq); -+ /* -+ * Update bfqq, because, if a queue merge has occurred -+ * in __bfq_insert_request, then rq has been -+ * redirected into a new queue. -+ */ -+ bfqq = RQ_BFQQ(rq); - - if (rq_mergeable(rq)) { - elv_rqhash_add(q, rq); -@@ -4676,6 +4684,9 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - } - } - -+ if (bfqq) -+ bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, rq->cmd_flags); -+ - spin_unlock_irq(&bfqd->lock); - } - -@@ -4893,8 +4904,11 @@ static void bfq_finish_request(struct request *rq) - BUG_ON(in_interrupt()); - - assert_spin_locked(&bfqd->lock); -- if (!RB_EMPTY_NODE(&rq->rb_node)) -+ if (!RB_EMPTY_NODE(&rq->rb_node)) { - bfq_remove_request(rq->q, rq); -+ bfqg_stats_update_io_remove(bfqq_group(bfqq), -+ rq->cmd_flags); -+ } - bfq_put_rq_priv_body(bfqq); - } - - -From 8659a1549d2bf241129a0f7c90429bddd9c2bc53 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Wed, 8 Nov 2017 19:07:40 +0100 -Subject: [PATCH 50/51] block, bfq-mq: update blkio stats outside the scheduler - lock - -bfq-mq invokes various blkg_*stats_* functions to update the statistics -contained in the special files blkio.bfq-mq.* in the blkio controller -groups, i.e., the I/O accounting related to the proportional-share -policy provided by bfq-mq. The execution of these functions takes a -considerable percentage, about 40%, of the total per-request execution -time of bfq-mq (i.e., of the sum of the execution time of all the bfq-mq -functions that have to be executed to process an I/O request from its -creation to its destruction). 
This reduces the request-processing -rate sustainable by bfq-mq noticeably, even on a multicore CPU. In fact, -the bfq-mq functions that invoke blkg_*stats_* functions cannot be -executed in parallel with the rest of the code of bfq-mq, because -both are executed under the same per-device scheduler lock. - -To reduce this slowdown, this commit moves, wherever possible, the -invocation of these functions (more precisely, of the bfq-mq functions -that invoke blkg_*stats_* functions) outside the critical sections -protected by the scheduler lock. - -With this change, and with all blkio.bfq-mq.* statistics enabled, the -throughput grows, e.g., from 250 to 310 KIOPS (+25%) on an Intel -i7-4850HQ, in case of 8 threads doing random I/O in parallel on -null_blk, with the latter configured with 0 latency. We obtained the -same or higher throughput boosts, up to +30%, with other processors -(some figures are reported in the documentation). For our tests, we -used the script [1], with which our results can be easily reproduced. - -NOTE. This commit still protects the invocation of blkg_*stats_* -functions with the request_queue lock, because the group these -functions are invoked on may otherwise disappear before or while these -functions are executed. Fortunately, tests without even this lock -show, by difference, that the serialization caused by this lock has a -little impact (at most ~5% of throughput reduction). 
- -[1] https://github.com/Algodev-github/IOSpeed - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> -Signed-off-by: Luca Miccio <lucmiccio@gmail.com> ---- - Documentation/block/bfq-iosched.txt | 18 ++++-- - block/bfq-mq-iosched.c | 112 +++++++++++++++++++++++++++++++----- - block/bfq-sched.c | 2 + - 3 files changed, 112 insertions(+), 20 deletions(-) - -diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt -index 595ff7a5ff34..c816c595082d 100644 ---- a/Documentation/block/bfq-iosched.txt -+++ b/Documentation/block/bfq-iosched.txt -@@ -31,16 +31,22 @@ latency and throughput, or on how to maximize throughput. - - BFQ has a non-null overhead, which limits the maximum IOPS that the - CPU can process for a device scheduled with BFQ. To give an idea of --the limits on slow or average CPUs, here are BFQ limits for three --different CPUs, on, respectively, an average laptop, an old desktop, --and a cheap embedded system, in case full hierarchical support is --enabled (i.e., CONFIG_BFQ_SQ_GROUP_IOSCHED is set for bfq-sq, or --CONFIG_MQ_BFQ_GROUP_IOSCHED is set for bfq-mq, or, finally, --CONFIG_BFQ_GROUP_IOSCHED is set for bfq): -+the limits on slow or average CPUs, here are, first, the limits of -+bfq-sq for three different CPUs, on, respectively, an average laptop, -+an old desktop, and a cheap embedded system, in case full hierarchical -+support is enabled (i.e., CONFIG_BFQ_SQ_GROUP_IOSCHED is set): - - Intel i7-4850HQ: 250 KIOPS - - AMD A8-3850: 170 KIOPS - - ARM CortexTM-A53 Octa-core: 45 KIOPS - -+bfq-mq and bfq instances reach, instead, a higher sustainable -+throughput. Their limits, on the same systems as above, are, with full -+hierarchical support enabled (i.e., CONFIG_MQ_BFQ_GROUP_IOSCHED set -+for bfq-mq, or CONFIG_BFQ_GROUP_IOSCHED set for bfq): -+- Intel i7-4850HQ: 310 KIOPS -+- AMD A8-3850: 200 KIOPS -+- ARM CortexTM-A53 Octa-core: 56 KIOPS -+ - BFQ works for multi-queue devices too (bfq and bfq-mq instances). 
- - The table of contents follows. Impatients can just jump to Section 3. -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index fbf28804c220..ab3b83d612c2 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -1822,7 +1822,7 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, - bfqq->next_rq = next_rq; - - bfq_log_bfqq(bfqd, bfqq, -- "requests_merged: req %p prev %p next_rq %p bfqq %p", -+ "request_merged: req %p prev %p next_rq %p bfqq %p", - req, prev, next_rq, bfqq); - - /* -@@ -2415,7 +2415,6 @@ static void __bfq_set_in_service_queue(struct bfq_data *bfqd, - struct bfq_queue *bfqq) - { - if (bfqq) { -- bfqg_stats_update_avg_queue_size(bfqq_group(bfqq)); - bfq_clear_bfqq_fifo_expire(bfqq); - - bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; -@@ -3784,7 +3783,6 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - */ - bfq_clear_bfqq_wait_request(bfqq); - hrtimer_try_to_cancel(&bfqd->idle_slice_timer); -- bfqg_stats_update_idle_time(bfqq_group(bfqq)); - } - goto keep_queue; - } -@@ -4072,16 +4070,67 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - { - struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; - struct request *rq; -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ struct bfq_queue *in_serv_queue, *bfqq; -+ bool waiting_rq, idle_timer_disabled; -+#endif - - spin_lock_irq(&bfqd->lock); - -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ in_serv_queue = bfqd->in_service_queue; -+ waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); -+ - rq = __bfq_dispatch_request(hctx); -- if (rq && RQ_BFQQ(rq)) -- bfqg_stats_update_io_remove(bfqq_group(RQ_BFQQ(rq)), -- rq->cmd_flags); - -+ idle_timer_disabled = -+ waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); -+ -+#else -+ rq = __bfq_dispatch_request(hctx); -+#endif - spin_unlock_irq(&bfqd->lock); - -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ bfqq = rq ? 
RQ_BFQQ(rq) : NULL; -+ if (!idle_timer_disabled && !bfqq) -+ return rq; -+ -+ /* -+ * rq and bfqq are guaranteed to exist until this function -+ * ends, for the following reasons. First, rq can be -+ * dispatched to the device, and then can be completed and -+ * freed, only after this function ends. Second, rq cannot be -+ * merged (and thus freed because of a merge) any longer, -+ * because it has already started. Thus rq cannot be freed -+ * before this function ends, and, since rq has a reference to -+ * bfqq, the same guarantee holds for bfqq too. -+ * -+ * In addition, the following queue lock guarantees that -+ * bfqq_group(bfqq) exists as well. -+ */ -+ spin_lock_irq(hctx->queue->queue_lock); -+ if (idle_timer_disabled) -+ /* -+ * Since the idle timer has been disabled, -+ * in_serv_queue contained some request when -+ * __bfq_dispatch_request was invoked above, which -+ * implies that rq was picked exactly from -+ * in_serv_queue. Thus in_serv_queue == bfqq, and is -+ * therefore guaranteed to exist because of the above -+ * arguments. 
-+ */ -+ bfqg_stats_update_idle_time(bfqq_group(in_serv_queue)); -+ if (bfqq) { -+ struct bfq_group *bfqg = bfqq_group(bfqq); -+ -+ bfqg_stats_update_avg_queue_size(bfqg); -+ bfqg_stats_set_start_empty_time(bfqg); -+ bfqg_stats_update_io_remove(bfqg, rq->cmd_flags); -+ } -+ spin_unlock_irq(hctx->queue->queue_lock); -+#endif -+ - return rq; - } - -@@ -4200,7 +4249,6 @@ static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) - unsigned long flags; - - spin_lock_irqsave(&bfqd->lock, flags); -- - bfq_exit_bfqq(bfqd, bfqq); - bic_set_bfqq(bic, NULL, is_sync); - spin_unlock_irqrestore(&bfqd->lock, flags); -@@ -4554,7 +4602,6 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, - */ - bfq_clear_bfqq_wait_request(bfqq); - hrtimer_try_to_cancel(&bfqd->idle_slice_timer); -- bfqg_stats_update_idle_time(bfqq_group(bfqq)); - - /* - * The queue is not empty, because a new request just -@@ -4569,9 +4616,11 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, - } - } - --static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) -+/* returns true if it causes the idle timer to be disabled */ -+static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) - { - struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; -+ bool waiting, idle_timer_disabled = false; - BUG_ON(!bfqq); - - assert_spin_locked(&bfqd->lock); -@@ -4624,12 +4673,16 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) - } - } - -+ waiting = bfqq && bfq_bfqq_wait_request(bfqq); - bfq_add_request(rq); -+ idle_timer_disabled = waiting && !bfq_bfqq_wait_request(bfqq); - - rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; - list_add_tail(&rq->queuelist, &bfqq->fifo); - - bfq_rq_enqueued(bfqd, bfqq, rq); -+ -+ return idle_timer_disabled; - } - - static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, -@@ -4638,6 +4691,10 @@ static void bfq_insert_request(struct 
blk_mq_hw_ctx *hctx, struct request *rq, - struct request_queue *q = hctx->queue; - struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_queue *bfqq = RQ_BFQQ(rq); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ bool idle_timer_disabled = false; -+ unsigned int cmd_flags; -+#endif - - spin_lock_irq(&bfqd->lock); - if (blk_mq_sched_try_insert_merge(q, rq)) { -@@ -4669,13 +4726,17 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - BUG_ON(!(rq->rq_flags & RQF_GOT)); - rq->rq_flags &= ~RQF_GOT; - -- __bfq_insert_request(bfqd, rq); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ idle_timer_disabled = __bfq_insert_request(bfqd, rq); - /* - * Update bfqq, because, if a queue merge has occurred - * in __bfq_insert_request, then rq has been - * redirected into a new queue. - */ - bfqq = RQ_BFQQ(rq); -+#else -+ __bfq_insert_request(bfqd, rq); -+#endif - - if (rq_mergeable(rq)) { - elv_rqhash_add(q, rq); -@@ -4683,11 +4744,34 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - q->last_merge = rq; - } - } -- -- if (bfqq) -- bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, rq->cmd_flags); -- -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ /* -+ * Cache cmd_flags before releasing scheduler lock, because rq -+ * may disappear afterwards (for example, because of a request -+ * merge). -+ */ -+ cmd_flags = rq->cmd_flags; -+#endif - spin_unlock_irq(&bfqd->lock); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ if (!bfqq) -+ return; -+ /* -+ * bfqq still exists, because it can disappear only after -+ * either it is merged with another queue, or the process it -+ * is associated with exits. But both actions must be taken by -+ * the same process currently executing this flow of -+ * instruction. -+ * -+ * In addition, the following queue lock guarantees that -+ * bfqq_group(bfqq) exists as well. 
-+ */ -+ spin_lock_irq(q->queue_lock); -+ bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags); -+ if (idle_timer_disabled) -+ bfqg_stats_update_idle_time(bfqq_group(bfqq)); -+ spin_unlock_irq(q->queue_lock); -+#endif - } - - static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index e4a2553a2d2c..616c0692335a 100644 ---- a/block/bfq-sched.c -+++ b/block/bfq-sched.c -@@ -949,9 +949,11 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) - st->vtime += bfq_delta(served, st->wsum); - bfq_forget_idle(st); - } -+#ifndef BFQ_MQ - #ifdef BFQ_GROUP_IOSCHED_ENABLED - bfqg_stats_set_start_empty_time(bfqq_group(bfqq)); - #endif -+#endif - st = bfq_entity_service_tree(&bfqq->entity); - bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs, vtime %llu on %p", - served, ((st->vtime>>10)*1000)>>12, st); - -From abdfb33a3325df55ec0261fd824ca61ddac13575 Mon Sep 17 00:00:00 2001 -From: Luca Miccio <lucmiccio@gmail.com> -Date: Wed, 8 Nov 2017 19:07:41 +0100 -Subject: [PATCH 51/51] block, bfq-sq, bfq-mq: move debug blkio stats behind - CONFIG_DEBUG_BLK_CGROUP - -BFQ (both bfq-mq and bfq-sq) currently creates, and updates, its own -instance of the whole set of blkio statistics that cfq creates. Yet, -from the comments of Tejun Heo in [1], it turned out that most of -these statistics are meant/useful only for debugging. This commit -makes BFQ create the latter, debugging statistics only if the option -CONFIG_DEBUG_BLK_CGROUP is set. - -By doing so, this commit also enables BFQ to enjoy a high perfomance -boost. The reason is that, if CONFIG_DEBUG_BLK_CGROUP is not set, then -BFQ has to update far fewer statistics, and, in particular, not the -heaviest to update. 
To give an idea of the benefits, if -CONFIG_DEBUG_BLK_CGROUP is not set, then, on an Intel i7-4850HQ, and -with 8 threads doing random I/O in parallel on null_blk (configured -with 0 latency), the throughput of bfq-mq grows from 310 to 400 KIOPS -(+30%). We have measured similar or even much higher boosts with other -CPUs: e.g., +45% with an ARM CortexTM-A53 Octa-core. Our results have -been obtained and can be reproduced very easily with the script in [1]. - -[1] https://www.spinics.net/lists/linux-block/msg18943.html - -Reported-by: Tejun Heo <tj@kernel.org> -Signed-off-by: Luca Miccio <lucmiccio@gmail.com> -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - Documentation/block/bfq-iosched.txt | 59 ++++++++++--- - block/bfq-cgroup-included.c | 163 ++++++++++++++++++++---------------- - block/bfq-mq-iosched.c | 14 ++-- - block/bfq-mq.h | 4 +- - block/bfq.h | 4 +- - 5 files changed, 147 insertions(+), 97 deletions(-) - -diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt -index c816c595082d..30ef2dba85ad 100644 ---- a/Documentation/block/bfq-iosched.txt -+++ b/Documentation/block/bfq-iosched.txt -@@ -29,24 +29,41 @@ for that device, by setting low_latency to 0. See Section 3 for - details on how to configure BFQ for the desired tradeoff between - latency and throughput, or on how to maximize throughput. - --BFQ has a non-null overhead, which limits the maximum IOPS that the --CPU can process for a device scheduled with BFQ. To give an idea of --the limits on slow or average CPUs, here are, first, the limits of --bfq-sq for three different CPUs, on, respectively, an average laptop, -+BFQ has a non-null overhead, which limits the maximum IOPS that a CPU -+can process for a device scheduled with BFQ. 
To give an idea of the -+limits on slow or average CPUs, here are, first, the limits of bfq-mq -+and bfq for three different CPUs, on, respectively, an average laptop, - an old desktop, and a cheap embedded system, in case full hierarchical --support is enabled (i.e., CONFIG_BFQ_SQ_GROUP_IOSCHED is set): --- Intel i7-4850HQ: 250 KIOPS --- AMD A8-3850: 170 KIOPS --- ARM CortexTM-A53 Octa-core: 45 KIOPS -- --bfq-mq and bfq instances reach, instead, a higher sustainable --throughput. Their limits, on the same systems as above, are, with full --hierarchical support enabled (i.e., CONFIG_MQ_BFQ_GROUP_IOSCHED set --for bfq-mq, or CONFIG_BFQ_GROUP_IOSCHED set for bfq): -+support is enabled (i.e., CONFIG_MQ_BFQ_GROUP_IOSCHED is set for -+bfq-mq, or CONFIG_BFQ_GROUP_IOSCHED is set for bfq), but -+CONFIG_DEBUG_BLK_CGROUP is not set (Section 4-2): -+- Intel i7-4850HQ: 400 KIOPS -+- AMD A8-3850: 250 KIOPS -+- ARM CortexTM-A53 Octa-core: 80 KIOPS -+ -+As for bfq-sq, it cannot reach the above IOPS, because of the -+inherent, lower parallelism of legacy blk and of the components within -+it (including bfq-sq itself). In particular, results with -+CONFIG_DEBUG_BLK_CGROUP unset are rather fluctuating. The limits -+reported below for the case CONFIG_DEBUG_BLK_CGROUP set will however -+provide a lower bound to the limits of bfq-sq. -+ -+Turning back to bfq-mq and bfq, if CONFIG_DEBUG_BLK_CGROUP is set (and -+of course full hierarchical support is enabled), then the sustainable -+throughput with bfq-mq and bfq decreases, because all blkio.bfq* -+statistics are created and updated (Section 4-2). 
For bfq-mq and bfq, -+this leads to the following maximum sustainable throughputs, on the -+same systems as above: - - Intel i7-4850HQ: 310 KIOPS - - AMD A8-3850: 200 KIOPS - - ARM CortexTM-A53 Octa-core: 56 KIOPS - -+Finally, if CONFIG_DEBUG_BLK_CGROUP is set (and full hierarchical -+support is enabled), then bfq-sq exhibits the following limits: -+- Intel i7-4850HQ: 250 KIOPS -+- AMD A8-3850: 170 KIOPS -+- ARM CortexTM-A53 Octa-core: 45 KIOPS -+ - BFQ works for multi-queue devices too (bfq and bfq-mq instances). - - The table of contents follows. Impatients can just jump to Section 3. -@@ -524,6 +541,22 @@ BFQ-specific files is "blkio.bfqX." or "io.bfqX.", where X can be "" - to set the weight of a group with the mainline BFQ is blkio.bfq.weight - or io.bfq.weight. - -+As for cgroups-v1 (blkio controller), the exact set of stat files -+created, and kept up-to-date by bfq*, depends on whether -+CONFIG_DEBUG_BLK_CGROUP is set. If it is set, then bfq* creates all -+the stat files documented in -+Documentation/cgroup-v1/blkio-controller.txt. If, instead, -+CONFIG_DEBUG_BLK_CGROUP is not set, then bfq* creates only the files -+blkio.bfq*.io_service_bytes -+blkio.bfq*.io_service_bytes_recursive -+blkio.bfq*.io_serviced -+blkio.bfq*.io_serviced_recursive -+ -+The value of CONFIG_DEBUG_BLK_CGROUP greatly influences the maximum -+throughput sustainable with bfq*, because updating the blkio.bfq* -+stats is rather costly, especially for some of the stats enabled by -+CONFIG_DEBUG_BLK_CGROUP. -+ - Parameters to set - ----------------- - -diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c -index 631e53d9150d..562b0ce581a7 100644 ---- a/block/bfq-cgroup-included.c -+++ b/block/bfq-cgroup-included.c -@@ -15,7 +15,7 @@ - * file. 
- */ - --#ifdef BFQ_GROUP_IOSCHED_ENABLED -+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) - - /* bfqg stats flags */ - enum bfqg_stats_flags { -@@ -155,6 +155,63 @@ static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) - bfqg_stats_update_group_wait_time(stats); - } - -+static void bfqg_stats_update_io_add(struct bfq_group *bfqg, -+ struct bfq_queue *bfqq, unsigned int op) -+{ -+ blkg_rwstat_add(&bfqg->stats.queued, op, 1); -+ bfqg_stats_end_empty_time(&bfqg->stats); -+ if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue)) -+ bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); -+} -+ -+static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) -+{ -+ blkg_rwstat_add(&bfqg->stats.queued, op, -1); -+} -+ -+static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) -+{ -+ blkg_rwstat_add(&bfqg->stats.merged, op, 1); -+} -+ -+static void bfqg_stats_update_completion(struct bfq_group *bfqg, -+ uint64_t start_time, uint64_t io_start_time, unsigned int op) -+{ -+ struct bfqg_stats *stats = &bfqg->stats; -+ unsigned long long now = sched_clock(); -+ -+ if (time_after64(now, io_start_time)) -+ blkg_rwstat_add(&stats->service_time, op, -+ now - io_start_time); -+ if (time_after64(io_start_time, start_time)) -+ blkg_rwstat_add(&stats->wait_time, op, -+ io_start_time - start_time); -+} -+ -+#else /* BFQ_GROUP_IOSCHED_ENABLED && CONFIG_DEBUG_BLK_CGROUP */ -+ -+static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg, -+ struct bfq_queue *bfqq, unsigned int op) { } -+static inline void -+bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { } -+static inline void -+bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { } -+static inline void bfqg_stats_update_completion(struct bfq_group *bfqg, -+ uint64_t start_time, uint64_t io_start_time, -+ unsigned int op) { } -+static inline void 
-+bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, -+ struct bfq_group *curr_bfqg) { } -+static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { } -+static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } -+static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { } -+static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { } -+static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } -+static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { } -+ -+#endif /* BFQ_GROUP_IOSCHED_ENABLED && CONFIG_DEBUG_BLK_CGROUP */ -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - static struct blkcg_policy blkcg_policy_bfq; - - /* -@@ -247,44 +304,10 @@ static void bfqg_and_blkg_put(struct bfq_group *bfqg) - } - #endif - --static void bfqg_stats_update_io_add(struct bfq_group *bfqg, -- struct bfq_queue *bfqq, -- unsigned int op) --{ -- blkg_rwstat_add(&bfqg->stats.queued, op, 1); -- bfqg_stats_end_empty_time(&bfqg->stats); -- if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue)) -- bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); --} -- --static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) --{ -- blkg_rwstat_add(&bfqg->stats.queued, op, -1); --} -- --static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) --{ -- blkg_rwstat_add(&bfqg->stats.merged, op, 1); --} -- --static void bfqg_stats_update_completion(struct bfq_group *bfqg, -- uint64_t start_time, uint64_t io_start_time, -- unsigned int op) --{ -- struct bfqg_stats *stats = &bfqg->stats; -- unsigned long long now = sched_clock(); -- -- if (time_after64(now, io_start_time)) -- blkg_rwstat_add(&stats->service_time, op, -- now - io_start_time); -- if (time_after64(io_start_time, start_time)) -- blkg_rwstat_add(&stats->wait_time, op, -- io_start_time - start_time); --} -- - /* @stats = 0 */ - static void bfqg_stats_reset(struct bfqg_stats 
*stats) - { -+#ifdef CONFIG_DEBUG_BLK_CGROUP - /* queued stats shouldn't be cleared */ - blkg_rwstat_reset(&stats->merged); - blkg_rwstat_reset(&stats->service_time); -@@ -296,6 +319,7 @@ static void bfqg_stats_reset(struct bfqg_stats *stats) - blkg_stat_reset(&stats->group_wait_time); - blkg_stat_reset(&stats->idle_time); - blkg_stat_reset(&stats->empty_time); -+#endif - } - - /* @to += @from */ -@@ -304,6 +328,7 @@ static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from) - if (!to || !from) - return; - -+#ifdef CONFIG_DEBUG_BLK_CGROUP - /* queued stats shouldn't be cleared */ - blkg_rwstat_add_aux(&to->merged, &from->merged); - blkg_rwstat_add_aux(&to->service_time, &from->service_time); -@@ -316,6 +341,7 @@ static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from) - blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time); - blkg_stat_add_aux(&to->idle_time, &from->idle_time); - blkg_stat_add_aux(&to->empty_time, &from->empty_time); -+#endif - } - - /* -@@ -367,6 +393,7 @@ static void bfq_init_entity(struct bfq_entity *entity, - - static void bfqg_stats_exit(struct bfqg_stats *stats) - { -+#ifdef CONFIG_DEBUG_BLK_CGROUP - blkg_rwstat_exit(&stats->merged); - blkg_rwstat_exit(&stats->service_time); - blkg_rwstat_exit(&stats->wait_time); -@@ -378,10 +405,12 @@ static void bfqg_stats_exit(struct bfqg_stats *stats) - blkg_stat_exit(&stats->group_wait_time); - blkg_stat_exit(&stats->idle_time); - blkg_stat_exit(&stats->empty_time); -+#endif - } - - static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) - { -+#ifdef CONFIG_DEBUG_BLK_CGROUP - if (blkg_rwstat_init(&stats->merged, gfp) || - blkg_rwstat_init(&stats->service_time, gfp) || - blkg_rwstat_init(&stats->wait_time, gfp) || -@@ -396,6 +425,7 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) - bfqg_stats_exit(stats); - return -ENOMEM; - } -+#endif - - return 0; - } -@@ -1003,6 +1033,7 @@ static ssize_t bfq_io_set_weight(struct 
kernfs_open_file *of, - return bfq_io_set_weight_legacy(of_css(of), NULL, weight); - } - -+#ifdef CONFIG_DEBUG_BLK_CGROUP - static int bfqg_print_stat(struct seq_file *sf, void *v) - { - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, -@@ -1108,6 +1139,7 @@ static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v) - 0, false); - return 0; - } -+#endif /* CONFIG_DEBUG_BLK_CGROUP */ - - static struct bfq_group * - bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) -@@ -1137,15 +1169,6 @@ static struct cftype bfq_blkcg_legacy_files[] = { - - /* statistics, covers only the tasks in the bfqg */ - { -- .name = BFQ_CGROUP_FNAME(time), -- .private = offsetof(struct bfq_group, stats.time), -- .seq_show = bfqg_print_stat, -- }, -- { -- .name = BFQ_CGROUP_FNAME(sectors), -- .seq_show = bfqg_print_stat_sectors, -- }, -- { - .name = BFQ_CGROUP_FNAME(io_service_bytes), - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_bytes, -@@ -1155,6 +1178,16 @@ static struct cftype bfq_blkcg_legacy_files[] = { - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_ios, - }, -+#ifdef CONFIG_DEBUG_BLK_CGROUP -+ { -+ .name = BFQ_CGROUP_FNAME(time), -+ .private = offsetof(struct bfq_group, stats.time), -+ .seq_show = bfqg_print_stat, -+ }, -+ { -+ .name = BFQ_CGROUP_FNAME(sectors), -+ .seq_show = bfqg_print_stat_sectors, -+ }, - { - .name = BFQ_CGROUP_FNAME(io_service_time), - .private = offsetof(struct bfq_group, stats.service_time), -@@ -1175,18 +1208,10 @@ static struct cftype bfq_blkcg_legacy_files[] = { - .private = offsetof(struct bfq_group, stats.queued), - .seq_show = bfqg_print_rwstat, - }, -+#endif /* CONFIG_DEBUG_BLK_CGROUP */ - - /* the same statictics which cover the bfqg and its descendants */ - { -- .name = BFQ_CGROUP_FNAME(time_recursive), -- .private = offsetof(struct bfq_group, stats.time), -- .seq_show = bfqg_print_stat_recursive, -- }, -- { -- .name = BFQ_CGROUP_FNAME(sectors_recursive), -- 
.seq_show = bfqg_print_stat_sectors_recursive, -- }, -- { - .name = BFQ_CGROUP_FNAME(io_service_bytes_recursive), - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_bytes_recursive, -@@ -1196,6 +1221,16 @@ static struct cftype bfq_blkcg_legacy_files[] = { - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_ios_recursive, - }, -+#ifdef CONFIG_DEBUG_BLK_CGROUP -+ { -+ .name = BFQ_CGROUP_FNAME(time_recursive), -+ .private = offsetof(struct bfq_group, stats.time), -+ .seq_show = bfqg_print_stat_recursive, -+ }, -+ { -+ .name = BFQ_CGROUP_FNAME(sectors_recursive), -+ .seq_show = bfqg_print_stat_sectors_recursive, -+ }, - { - .name = BFQ_CGROUP_FNAME(io_service_time_recursive), - .private = offsetof(struct bfq_group, stats.service_time), -@@ -1240,6 +1275,7 @@ static struct cftype bfq_blkcg_legacy_files[] = { - .private = offsetof(struct bfq_group, stats.dequeue), - .seq_show = bfqg_print_stat, - }, -+#endif /* CONFIG_DEBUG_BLK_CGROUP */ - { } /* terminate */ - }; - -@@ -1257,25 +1293,6 @@ static struct cftype bfq_blkg_files[] = { - - #else /* BFQ_GROUP_IOSCHED_ENABLED */ - --static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg, -- struct bfq_queue *bfqq, unsigned int op) { } --static inline void --bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { } --static inline void --bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { } --static inline void bfqg_stats_update_completion(struct bfq_group *bfqg, -- uint64_t start_time, uint64_t io_start_time, -- unsigned int op) { } --static inline void --bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, -- struct bfq_group *curr_bfqg) { } --static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { } --static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } --static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { } --static inline void 
bfqg_stats_update_idle_time(struct bfq_group *bfqg) { } --static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } --static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { } -- - static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, - struct bfq_group *bfqg) {} - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index ab3b83d612c2..0c09609a6099 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -4070,14 +4070,14 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - { - struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; - struct request *rq; --#ifdef BFQ_GROUP_IOSCHED_ENABLED -+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) - struct bfq_queue *in_serv_queue, *bfqq; - bool waiting_rq, idle_timer_disabled; - #endif - - spin_lock_irq(&bfqd->lock); - --#ifdef BFQ_GROUP_IOSCHED_ENABLED -+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) - in_serv_queue = bfqd->in_service_queue; - waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); - -@@ -4091,7 +4091,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - #endif - spin_unlock_irq(&bfqd->lock); - --#ifdef BFQ_GROUP_IOSCHED_ENABLED -+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) - bfqq = rq ? 
RQ_BFQQ(rq) : NULL; - if (!idle_timer_disabled && !bfqq) - return rq; -@@ -4691,7 +4691,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - struct request_queue *q = hctx->queue; - struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_queue *bfqq = RQ_BFQQ(rq); --#ifdef BFQ_GROUP_IOSCHED_ENABLED -+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) - bool idle_timer_disabled = false; - unsigned int cmd_flags; - #endif -@@ -4726,7 +4726,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - BUG_ON(!(rq->rq_flags & RQF_GOT)); - rq->rq_flags &= ~RQF_GOT; - --#ifdef BFQ_GROUP_IOSCHED_ENABLED -+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) - idle_timer_disabled = __bfq_insert_request(bfqd, rq); - /* - * Update bfqq, because, if a queue merge has occurred -@@ -4744,7 +4744,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - q->last_merge = rq; - } - } --#ifdef BFQ_GROUP_IOSCHED_ENABLED -+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) - /* - * Cache cmd_flags before releasing scheduler lock, because rq - * may disappear afterwards (for example, because of a request -@@ -4753,7 +4753,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - cmd_flags = rq->cmd_flags; - #endif - spin_unlock_irq(&bfqd->lock); --#ifdef BFQ_GROUP_IOSCHED_ENABLED -+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) - if (!bfqq) - return; - /* -diff --git a/block/bfq-mq.h b/block/bfq-mq.h -index 7ed2cc29be57..1cb05bb853d2 100644 ---- a/block/bfq-mq.h -+++ b/block/bfq-mq.h -@@ -784,7 +784,7 @@ enum bfqq_expiration { - - - struct bfqg_stats { --#ifdef BFQ_GROUP_IOSCHED_ENABLED -+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) - /* number of ios merged */ - struct blkg_rwstat merged; - /* total time spent on device in ns, may not be 
accurate w/ queueing */ -@@ -812,7 +812,7 @@ struct bfqg_stats { - uint64_t start_idle_time; - uint64_t start_empty_time; - uint16_t flags; --#endif -+#endif /* BFQ_GROUP_IOSCHED_ENABLED && CONFIG_DEBUG_BLK_CGROUP */ - }; - - #ifdef BFQ_GROUP_IOSCHED_ENABLED -diff --git a/block/bfq.h b/block/bfq.h -index 15d326f466b7..47cd4d5a8c32 100644 ---- a/block/bfq.h -+++ b/block/bfq.h -@@ -791,7 +791,7 @@ enum bfqq_expiration { - - - struct bfqg_stats { --#ifdef BFQ_GROUP_IOSCHED_ENABLED -+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) - /* number of ios merged */ - struct blkg_rwstat merged; - /* total time spent on device in ns, may not be accurate w/ queueing */ -@@ -819,7 +819,7 @@ struct bfqg_stats { - uint64_t start_idle_time; - uint64_t start_empty_time; - uint16_t flags; --#endif -+#endif /* BFQ_GROUP_IOSCHED_ENABLED && CONFIG_DEBUG_BLK_CGROUP */ - }; - - #ifdef BFQ_GROUP_IOSCHED_ENABLED diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.14-0002-BFQ-v8r12-20180404.patch b/sys-kernel/linux-sources-redcore-lts/files/4.14-0002-BFQ-v8r12-20180404.patch deleted file mode 100644 index 104325d6..00000000 --- a/sys-kernel/linux-sources-redcore-lts/files/4.14-0002-BFQ-v8r12-20180404.patch +++ /dev/null @@ -1,4611 +0,0 @@ -From 7bd365a925748767d7ed807e5498f90bae0ebc25 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Tue, 14 Nov 2017 08:28:45 +0100 -Subject: [PATCH 01/23] block, bfq-mq: turn BUG_ON on request-size into WARN_ON - -BFQ has many checks of internal and external consistency. One of them -checks that an I/O request has still sectors to serve, if it happens -to be retired without being served. If the request has no sector to -serve, a BUG_ON signals the failure and causes the kernel to -terminate. Yet, from a crash report by a user [1], this condition may -happen to hold, in apparently correct functioning, for I/O with a -CD/DVD. 
- -To address this issue, this commit turns the above BUG_ON into a -WARN_ON. This commit also adds a companion WARN_ON on request -insertion into the scheduler. - -[1] https://groups.google.com/d/msg/bfq-iosched/DDOTJBroBa4/VyU1zUFtCgAJ - -Reported-by: Alexandre Frade <admfrade@gmail.com> -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 4 +++- - 1 file changed, 3 insertions(+), 1 deletion(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 0c09609a6099..0fc757ae7a42 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -1540,6 +1540,8 @@ static void bfq_add_request(struct request *rq) - - BUG_ON(!RQ_BFQQ(rq)); - BUG_ON(RQ_BFQQ(rq) != bfqq); -+ WARN_ON(blk_rq_sectors(rq) == 0); -+ - elv_rb_add(&bfqq->sort_list, rq); - - /* -@@ -4962,7 +4964,7 @@ static void bfq_finish_request(struct request *rq) - rq_io_start_time_ns(rq), - rq->cmd_flags); - -- BUG_ON(blk_rq_sectors(rq) == 0 && !(rq->rq_flags & RQF_STARTED)); -+ WARN_ON(blk_rq_sectors(rq) == 0 && !(rq->rq_flags & RQF_STARTED)); - - if (likely(rq->rq_flags & RQF_STARTED)) { - unsigned long flags; - -From 1097d368a20456c88acd75b3184c68df38e8f7b8 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Sun, 12 Nov 2017 22:43:46 +0100 -Subject: [PATCH 02/23] block, bfq-sq, bfq-mq: consider also past I/O in soft - real-time detection - -BFQ privileges the I/O of soft real-time applications, such as video -players, to guarantee to these application a high bandwidth and a low -latency. In this respect, it is not easy to correctly detect when an -application is soft real-time. A particularly nasty false positive is -that of an I/O-bound application that occasionally happens to meet all -requirements to be deemed as soft real-time. After being detected as -soft real-time, such an application monopolizes the device. 
Fortunately, -BFQ will realize soon that the application is actually not soft -real-time and suspend every privilege. Yet, the application may happen -again to be wrongly detected as soft real-time, and so on. - -As highlighted by our tests, this problem causes BFQ to occasionally -fail to guarantee a high responsiveness, in the presence of heavy -background I/O workloads. The reason is that the background workload -happens to be detected as soft real-time, more or less frequently, -during the execution of the interactive task under test. To give an -idea, because of this problem, Libreoffice Writer occasionally takes 8 -seconds, instead of 3, to start up, if there are sequential reads and -writes in the background, on a Kingston SSDNow V300. - -This commit addresses this issue by leveraging the following facts. - -The reason why some applications are detected as soft real-time despite -all BFQ checks to avoid false positives, is simply that, during high -CPU or storage-device load, I/O-bound applications may happen to do -I/O slowly enough to meet all soft real-time requirements, and pass -all BFQ extra checks. Yet, this happens only for limited time periods: -slow-speed time intervals are usually interspersed between other time -intervals during which these applications do I/O at a very high speed. -To exploit these facts, this commit introduces a little change, in the -detection of soft real-time behavior, to systematically consider also -the recent past: the higher the speed was in the recent past, the -later next I/O should arrive for the application to be considered as -soft real-time. At the beginning of a slow-speed interval, the minimum -arrival time allowed for the next I/O usually happens to still be so -high, to fall *after* the end of the slow-speed period itself. As a -consequence, the application does not risk to be deemed as soft -real-time during the slow-speed interval. 
Then, during the next -high-speed interval, the application cannot, evidently, be deemed as -soft real-time (exactly because of its speed), and so on. - -This extra filtering proved to be rather effective: in the above test, -the frequency of false positives became so low that the start-up time -was 3 seconds in all iterations (apart from occasional outliers, -caused by page-cache-management issues, which are out of the scope of -this commit, and cannot be solved by an I/O scheduler). - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> -Signed-off-by: Angelo Ruocco <angeloruocco90@gmail.com> ---- - block/bfq-mq-iosched.c | 115 ++++++++++++++++++++++++++++++++++--------------- - block/bfq-sq-iosched.c | 115 ++++++++++++++++++++++++++++++++++--------------- - 2 files changed, 162 insertions(+), 68 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 0fc757ae7a42..4d06d900f45e 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -3201,37 +3201,78 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, - * whereas soft_rt_next_start is set to infinity for applications that do - * not. - * -- * Unfortunately, even a greedy application may happen to behave in an -- * isochronous way if the CPU load is high. In fact, the application may -- * stop issuing requests while the CPUs are busy serving other processes, -- * then restart, then stop again for a while, and so on. In addition, if -- * the disk achieves a low enough throughput with the request pattern -- * issued by the application (e.g., because the request pattern is random -- * and/or the device is slow), then the application may meet the above -- * bandwidth requirement too. 
To prevent such a greedy application to be -- * deemed as soft real-time, a further rule is used in the computation of -- * soft_rt_next_start: soft_rt_next_start must be higher than the current -- * time plus the maximum time for which the arrival of a request is waited -- * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. -- * This filters out greedy applications, as the latter issue instead their -- * next request as soon as possible after the last one has been completed -- * (in contrast, when a batch of requests is completed, a soft real-time -- * application spends some time processing data). -+ * Unfortunately, even a greedy (i.e., I/O-bound) application may -+ * happen to meet, occasionally or systematically, both the above -+ * bandwidth and isochrony requirements. This may happen at least in -+ * the following circumstances. First, if the CPU load is high. The -+ * application may stop issuing requests while the CPUs are busy -+ * serving other processes, then restart, then stop again for a while, -+ * and so on. The other circumstances are related to the storage -+ * device: the storage device is highly loaded or reaches a low-enough -+ * throughput with the I/O of the application (e.g., because the I/O -+ * is random and/or the device is slow). In all these cases, the -+ * I/O of the application may be simply slowed down enough to meet -+ * the bandwidth and isochrony requirements. To reduce the probability -+ * that greedy applications are deemed as soft real-time in these -+ * corner cases, a further rule is used in the computation of -+ * soft_rt_next_start: the return value of this function is forced to -+ * be higher than the maximum between the following two quantities. 
- * -- * Unfortunately, the last filter may easily generate false positives if -- * only bfqd->bfq_slice_idle is used as a reference time interval and one -- * or both the following cases occur: -- * 1) HZ is so low that the duration of a jiffy is comparable to or higher -- * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with -- * HZ=100. -+ * (a) Current time plus: (1) the maximum time for which the arrival -+ * of a request is waited for when a sync queue becomes idle, -+ * namely bfqd->bfq_slice_idle, and (2) a few extra jiffies. We -+ * postpone for a moment the reason for adding a few extra -+ * jiffies; we get back to it after next item (b). Lower-bounding -+ * the return value of this function with the current time plus -+ * bfqd->bfq_slice_idle tends to filter out greedy applications, -+ * because the latter issue their next request as soon as possible -+ * after the last one has been completed. In contrast, a soft -+ * real-time application spends some time processing data, after a -+ * batch of its requests has been completed. -+ * -+ * (b) Current value of bfqq->soft_rt_next_start. As pointed out -+ * above, greedy applications may happen to meet both the -+ * bandwidth and isochrony requirements under heavy CPU or -+ * storage-device load. In more detail, in these scenarios, these -+ * applications happen, only for limited time periods, to do I/O -+ * slowly enough to meet all the requirements described so far, -+ * including the filtering in above item (a). These slow-speed -+ * time intervals are usually interspersed between other time -+ * intervals during which these applications do I/O at a very high -+ * speed. Fortunately, exactly because of the high speed of the -+ * I/O in the high-speed intervals, the values returned by this -+ * function happen to be so high, near the end of any such -+ * high-speed interval, to be likely to fall *after* the end of -+ * the low-speed time interval that follows. 
These high values are -+ * stored in bfqq->soft_rt_next_start after each invocation of -+ * this function. As a consequence, if the last value of -+ * bfqq->soft_rt_next_start is constantly used to lower-bound the -+ * next value that this function may return, then, from the very -+ * beginning of a low-speed interval, bfqq->soft_rt_next_start is -+ * likely to be constantly kept so high that any I/O request -+ * issued during the low-speed interval is considered as arriving -+ * too soon for the application to be deemed as soft -+ * real-time. Then, in the high-speed interval that follows, the -+ * application will not be deemed as soft real-time, just because -+ * it will do I/O at a high speed. And so on. -+ * -+ * Getting back to the filtering in item (a), in the following two -+ * cases this filtering might be easily passed by a greedy -+ * application, if the reference quantity was just -+ * bfqd->bfq_slice_idle: -+ * 1) HZ is so low that the duration of a jiffy is comparable to or -+ * higher than bfqd->bfq_slice_idle. This happens, e.g., on slow -+ * devices with HZ=100. The time granularity may be so coarse -+ * that the approximation, in jiffies, of bfqd->bfq_slice_idle -+ * is rather lower than the exact value. - * 2) jiffies, instead of increasing at a constant rate, may stop increasing - * for a while, then suddenly 'jump' by several units to recover the lost - * increments. This seems to happen, e.g., inside virtual machines. -- * To address this issue, we do not use as a reference time interval just -- * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In -- * particular we add the minimum number of jiffies for which the filter -- * seems to be quite precise also in embedded systems and KVM/QEMU virtual -- * machines. -+ * To address this issue, in the filtering in (a) we do not use as a -+ * reference time interval just bfqd->bfq_slice_idle, but -+ * bfqd->bfq_slice_idle plus a few jiffies.
In particular, we add the -+ * minimum number of jiffies for which the filter seems to be quite -+ * precise also in embedded systems and KVM/QEMU virtual machines. - */ - static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, - struct bfq_queue *bfqq) -@@ -3243,10 +3284,11 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, - jiffies_to_msecs(HZ * bfqq->service_from_backlogged / - bfqd->bfq_wr_max_softrt_rate)); - -- return max(bfqq->last_idle_bklogged + -- HZ * bfqq->service_from_backlogged / -- bfqd->bfq_wr_max_softrt_rate, -- jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); -+ return max3(bfqq->soft_rt_next_start, -+ bfqq->last_idle_bklogged + -+ HZ * bfqq->service_from_backlogged / -+ bfqd->bfq_wr_max_softrt_rate, -+ jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); - } - - /** -@@ -4395,10 +4437,15 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bfqq->split_time = bfq_smallest_from_now(); - - /* -- * Set to the value for which bfqq will not be deemed as -- * soft rt when it becomes backlogged. -+ * To not forget the possibly high bandwidth consumed by a -+ * process/queue in the recent past, -+ * bfq_bfqq_softrt_next_start() returns a value at least equal -+ * to the current value of bfqq->soft_rt_next_start (see -+ * comments on bfq_bfqq_softrt_next_start). Set -+ * soft_rt_next_start to now, to mean that bfqq has consumed -+ * no bandwidth so far. - */ -- bfqq->soft_rt_next_start = bfq_greatest_from_now(); -+ bfqq->soft_rt_next_start = jiffies; - - /* first request is almost certainly seeky */ - bfqq->seek_history = 1; -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index 4bbd7f4c0154..987dc255c82c 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -3089,37 +3089,78 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, - * whereas soft_rt_next_start is set to infinity for applications that do - * not. 
- * -- * Unfortunately, even a greedy application may happen to behave in an -- * isochronous way if the CPU load is high. In fact, the application may -- * stop issuing requests while the CPUs are busy serving other processes, -- * then restart, then stop again for a while, and so on. In addition, if -- * the disk achieves a low enough throughput with the request pattern -- * issued by the application (e.g., because the request pattern is random -- * and/or the device is slow), then the application may meet the above -- * bandwidth requirement too. To prevent such a greedy application to be -- * deemed as soft real-time, a further rule is used in the computation of -- * soft_rt_next_start: soft_rt_next_start must be higher than the current -- * time plus the maximum time for which the arrival of a request is waited -- * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. -- * This filters out greedy applications, as the latter issue instead their -- * next request as soon as possible after the last one has been completed -- * (in contrast, when a batch of requests is completed, a soft real-time -- * application spends some time processing data). -+ * Unfortunately, even a greedy (i.e., I/O-bound) application may -+ * happen to meet, occasionally or systematically, both the above -+ * bandwidth and isochrony requirements. This may happen at least in -+ * the following circumstances. First, if the CPU load is high. The -+ * application may stop issuing requests while the CPUs are busy -+ * serving other processes, then restart, then stop again for a while, -+ * and so on. The other circumstances are related to the storage -+ * device: the storage device is highly loaded or reaches a low-enough -+ * throughput with the I/O of the application (e.g., because the I/O -+ * is random and/or the device is slow). In all these cases, the -+ * I/O of the application may be simply slowed down enough to meet -+ * the bandwidth and isochrony requirements. 
To reduce the probability -+ * that greedy applications are deemed as soft real-time in these -+ * corner cases, a further rule is used in the computation of -+ * soft_rt_next_start: the return value of this function is forced to -+ * be higher than the maximum between the following two quantities. - * -- * Unfortunately, the last filter may easily generate false positives if -- * only bfqd->bfq_slice_idle is used as a reference time interval and one -- * or both the following cases occur: -- * 1) HZ is so low that the duration of a jiffy is comparable to or higher -- * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with -- * HZ=100. -+ * (a) Current time plus: (1) the maximum time for which the arrival -+ * of a request is waited for when a sync queue becomes idle, -+ * namely bfqd->bfq_slice_idle, and (2) a few extra jiffies. We -+ * postpone for a moment the reason for adding a few extra -+ * jiffies; we get back to it after next item (b). Lower-bounding -+ * the return value of this function with the current time plus -+ * bfqd->bfq_slice_idle tends to filter out greedy applications, -+ * because the latter issue their next request as soon as possible -+ * after the last one has been completed. In contrast, a soft -+ * real-time application spends some time processing data, after a -+ * batch of its requests has been completed. -+ * -+ * (b) Current value of bfqq->soft_rt_next_start. As pointed out -+ * above, greedy applications may happen to meet both the -+ * bandwidth and isochrony requirements under heavy CPU or -+ * storage-device load. In more detail, in these scenarios, these -+ * applications happen, only for limited time periods, to do I/O -+ * slowly enough to meet all the requirements described so far, -+ * including the filtering in above item (a). These slow-speed -+ * time intervals are usually interspersed between other time -+ * intervals during which these applications do I/O at a very high -+ * speed. 
Fortunately, exactly because of the high speed of the -+ * I/O in the high-speed intervals, the values returned by this -+ * function happen to be so high, near the end of any such -+ * high-speed interval, to be likely to fall *after* the end of -+ * the low-speed time interval that follows. These high values are -+ * stored in bfqq->soft_rt_next_start after each invocation of -+ * this function. As a consequence, if the last value of -+ * bfqq->soft_rt_next_start is constantly used to lower-bound the -+ * next value that this function may return, then, from the very -+ * beginning of a low-speed interval, bfqq->soft_rt_next_start is -+ * likely to be constantly kept so high that any I/O request -+ * issued during the low-speed interval is considered as arriving -+ * too soon for the application to be deemed as soft -+ * real-time. Then, in the high-speed interval that follows, the -+ * application will not be deemed as soft real-time, just because -+ * it will do I/O at a high speed. And so on. -+ * -+ * Getting back to the filtering in item (a), in the following two -+ * cases this filtering might be easily passed by a greedy -+ * application, if the reference quantity was just -+ * bfqd->bfq_slice_idle: -+ * 1) HZ is so low that the duration of a jiffy is comparable to or -+ * higher than bfqd->bfq_slice_idle. This happens, e.g., on slow -+ * devices with HZ=100. The time granularity may be so coarse -+ * that the approximation, in jiffies, of bfqd->bfq_slice_idle -+ * is rather lower than the exact value. - * 2) jiffies, instead of increasing at a constant rate, may stop increasing - * for a while, then suddenly 'jump' by several units to recover the lost - * increments. This seems to happen, e.g., inside virtual machines. -- * To address this issue, we do not use as a reference time interval just -- * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies.
In -- * particular we add the minimum number of jiffies for which the filter -- * seems to be quite precise also in embedded systems and KVM/QEMU virtual -- * machines. -+ * To address this issue, in the filtering in (a) we do not use as a -+ * reference time interval just bfqd->bfq_slice_idle, but -+ * bfqd->bfq_slice_idle plus a few jiffies. In particular, we add the -+ * minimum number of jiffies for which the filter seems to be quite -+ * precise also in embedded systems and KVM/QEMU virtual machines. - */ - static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, - struct bfq_queue *bfqq) -@@ -3131,10 +3172,11 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, - jiffies_to_msecs(HZ * bfqq->service_from_backlogged / - bfqd->bfq_wr_max_softrt_rate)); - -- return max(bfqq->last_idle_bklogged + -- HZ * bfqq->service_from_backlogged / -- bfqd->bfq_wr_max_softrt_rate, -- jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); -+ return max3(bfqq->soft_rt_next_start, -+ bfqq->last_idle_bklogged + -+ HZ * bfqq->service_from_backlogged / -+ bfqd->bfq_wr_max_softrt_rate, -+ jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); - } - - /** -@@ -4167,10 +4209,15 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bfqq->split_time = bfq_smallest_from_now(); - - /* -- * Set to the value for which bfqq will not be deemed as -- * soft rt when it becomes backlogged. -+ * To not forget the possibly high bandwidth consumed by a -+ * process/queue in the recent past, -+ * bfq_bfqq_softrt_next_start() returns a value at least equal -+ * to the current value of bfqq->soft_rt_next_start (see -+ * comments on bfq_bfqq_softrt_next_start). Set -+ * soft_rt_next_start to now, to mean that bfqq has consumed -+ * no bandwidth so far. 
- */ -- bfqq->soft_rt_next_start = bfq_greatest_from_now(); -+ bfqq->soft_rt_next_start = jiffies; - - /* first request is almost certainly seeky */ - bfqq->seek_history = 1; - -From 2a09b505660c81dbb80a5d68c9bc558c326d041f Mon Sep 17 00:00:00 2001 -From: Chiara Bruschi <bruschi.chiara@outlook.it> -Date: Thu, 7 Dec 2017 09:57:19 +0100 -Subject: [PATCH 03/23] block, bfq-mq: fix occurrences of request - prepare/finish methods' old names - -Commits 'b01f1fa3bb19' (Port of "blk-mq-sched: unify request prepare -methods") and 'cc10d2d7d2c1' (Port of "blk-mq-sched: unify request -finished methods") changed the old names of current bfq_prepare_request -and bfq_finish_request methods, but left them unchanged elsewhere in -the code (related comments, part of function name bfq_put_rq_priv_body). - -This commit fixes every occurrence of the old names of these methods -by changing them into the current names. - -Fixes: b01f1fa3bb19 (Port of "blk-mq-sched: unify request prepare methods") -Fixes: cc10d2d7d2c1 (Port of "blk-mq-sched: unify request finished methods") -Reviewed-by: Paolo Valente <paolo.valente@linaro.org> -Signed-off-by: Federico Motta <federico@willer.it> -Signed-off-by: Chiara Bruschi <bruschi.chiara@outlook.it> ---- - block/bfq-mq-iosched.c | 38 +++++++++++++++++++------------------- - 1 file changed, 19 insertions(+), 19 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 4d06d900f45e..8f8d5eccb016 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -4018,20 +4018,20 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - /* - * TESTING: reset DISP_LIST flag, because: 1) - * this rq this request has passed through -- * get_rq_private, 2) then it will have -- * put_rq_private invoked on it, and 3) in -- * put_rq_private we use this flag to check -- * that put_rq_private is not invoked on -- * requests for which get_rq_private has been -- * invoked. 
-+ * bfq_prepare_request, 2) then it will have -+ * bfq_finish_request invoked on it, and 3) in -+ * bfq_finish_request we use this flag to check -+ * that bfq_finish_request is not invoked on -+ * requests for which bfq_prepare_request has -+ * been invoked. - */ - rq->rq_flags &= ~RQF_DISP_LIST; - goto inc_in_driver_start_rq; - } - - /* -- * We exploit the put_rq_private hook to decrement -- * rq_in_driver, but put_rq_private will not be -+ * We exploit the bfq_finish_request hook to decrement -+ * rq_in_driver, but bfq_finish_request will not be - * invoked on this request. So, to avoid unbalance, - * just start this request, without incrementing - * rq_in_driver. As a negative consequence, -@@ -4040,14 +4040,14 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - * bfq_schedule_dispatch to be invoked uselessly. - * - * As for implementing an exact solution, the -- * put_request hook, if defined, is probably invoked -- * also on this request. So, by exploiting this hook, -- * we could 1) increment rq_in_driver here, and 2) -- * decrement it in put_request. Such a solution would -- * let the value of the counter be always accurate, -- * but it would entail using an extra interface -- * function. This cost seems higher than the benefit, -- * being the frequency of non-elevator-private -+ * bfq_finish_request hook, if defined, is probably -+ * invoked also on this request. So, by exploiting -+ * this hook, we could 1) increment rq_in_driver here, -+ * and 2) decrement it in bfq_finish_request. Such a -+ * solution would let the value of the counter be -+ * always accurate, but it would entail using an extra -+ * interface function. This cost seems higher than the -+ * benefit, being the frequency of non-elevator-private - * requests very low. 
- */ - goto start_rq; -@@ -4963,7 +4963,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) - } - } - --static void bfq_put_rq_priv_body(struct bfq_queue *bfqq) -+static void bfq_finish_request_body(struct bfq_queue *bfqq) - { - bfq_log_bfqq(bfqq->bfqd, bfqq, - "put_request_body: allocated %d", bfqq->allocated); -@@ -5019,7 +5019,7 @@ static void bfq_finish_request(struct request *rq) - spin_lock_irqsave(&bfqd->lock, flags); - - bfq_completed_request(bfqq, bfqd); -- bfq_put_rq_priv_body(bfqq); -+ bfq_finish_request_body(bfqq); - - spin_unlock_irqrestore(&bfqd->lock, flags); - } else { -@@ -5042,7 +5042,7 @@ static void bfq_finish_request(struct request *rq) - bfqg_stats_update_io_remove(bfqq_group(bfqq), - rq->cmd_flags); - } -- bfq_put_rq_priv_body(bfqq); -+ bfq_finish_request_body(bfqq); - } - - rq->elv.priv[0] = NULL; - -From 4df19943c3a767df453abea3d2ac3433c3326ce0 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Thu, 16 Nov 2017 18:38:13 +0100 -Subject: [PATCH 04/23] block, bfq-sq, bfq-mq: add missing rq_pos_tree update - on rq removal - -If two processes do I/O close to each other, then BFQ merges the -bfq_queues associated with these processes, to get a more sequential -I/O, and thus a higher throughput. In this respect, to detect whether -two processes are doing I/O close to each other, BFQ keeps a list of -the head-of-line I/O requests of all active bfq_queues. The list is -ordered by initial sectors, and implemented through a red-black tree -(rq_pos_tree). - -Unfortunately, the update of the rq_pos_tree was incomplete, because -the tree was not updated on the removal of the head-of-line I/O -request of a bfq_queue, in case the queue did not remain empty. This -commit adds the missing update. 
- -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> -Signed-off-by: Angelo Ruocco <angeloruocco90@gmail.com> ---- - block/bfq-mq-iosched.c | 3 +++ - block/bfq-sq-iosched.c | 3 +++ - 2 files changed, 6 insertions(+) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 8f8d5eccb016..603191c9008f 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -1729,6 +1729,9 @@ static void bfq_remove_request(struct request_queue *q, - rb_erase(&bfqq->pos_node, bfqq->pos_root); - bfqq->pos_root = NULL; - } -+ } else { -+ BUG_ON(!bfqq->next_rq); -+ bfq_pos_tree_add_move(bfqd, bfqq); - } - - if (rq->cmd_flags & REQ_META) { -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index 987dc255c82c..ea90ace79e49 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -1669,6 +1669,9 @@ static void bfq_remove_request(struct request *rq) - rb_erase(&bfqq->pos_node, bfqq->pos_root); - bfqq->pos_root = NULL; - } -+ } else { -+ BUG_ON(!bfqq->next_rq); -+ bfq_pos_tree_add_move(bfqd, bfqq); - } - - if (rq->cmd_flags & REQ_META) { - -From b844e345140aaea957d84a21d2aa67588b020cd5 Mon Sep 17 00:00:00 2001 -From: Angelo Ruocco <angeloruocco90@gmail.com> -Date: Mon, 18 Dec 2017 08:28:08 +0100 -Subject: [PATCH 05/23] block, bfq-sq, bfq-mq: check low_latency flag in - bfq_bfqq_save_state() - -A just-created bfq_queue will certainly be deemed as interactive on -the arrival of its first I/O request, if the low_latency flag is -set. Yet, if the queue is merged with another queue on the arrival of -its first I/O request, it will not have the chance to be flagged as -interactive. Nevertheless, if the queue is then split soon enough, it -has to be flagged as interactive after the split. - -To handle this early-merge scenario correctly, BFQ saves the state of -the queue, on the merge, as if the latter had already been deemed -interactive. 
So, if the queue is split soon, it will get -weight-raised, because the previous state of the queue is resumed on -the split. - -Unfortunately, in the act of saving the state of the newly-created -queue, BFQ doesn't check whether the low_latency flag is set, and this -causes early-merged queues to be then weight-raised, on queue splits, -even if low_latency is off. This commit addresses this problem by -adding the missing check. - -Signed-off-by: Angelo Ruocco <angeloruocco90@gmail.com> -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 3 ++- - block/bfq-sq-iosched.c | 3 ++- - 2 files changed, 4 insertions(+), 2 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 603191c9008f..ff9776c8836a 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -2231,7 +2231,8 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) - bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); - bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); - if (unlikely(bfq_bfqq_just_created(bfqq) && -- !bfq_bfqq_in_large_burst(bfqq))) { -+ !bfq_bfqq_in_large_burst(bfqq) && -+ bfqq->bfqd->low_latency)) { - /* - * bfqq being merged ritgh after being created: bfqq - * would have deserved interactive weight raising, but -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index ea90ace79e49..3a2d764e760c 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -2109,7 +2109,8 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) - bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); - bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); - if (unlikely(bfq_bfqq_just_created(bfqq) && -- !bfq_bfqq_in_large_burst(bfqq))) { -+ !bfq_bfqq_in_large_burst(bfqq) && -+ bfqq->bfqd->low_latency)) { - /* - * bfqq being merged ritgh after being created: bfqq - * would have deserved interactive weight raising, but - -From 4cc6896fe1de2e0b4de151a6e70658f10b9ec2fa 
Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Fri, 27 Oct 2017 11:12:14 +0200 -Subject: [PATCH 06/23] block, bfq-sq, bfq-mq: let a queue be merged only - shortly after starting I/O - -In BFQ and CFQ, two processes are said to be cooperating if they do -I/O in such a way that the union of their I/O requests yields a -sequential I/O pattern. To get such a sequential I/O pattern out of -the non-sequential pattern of each cooperating process, BFQ and CFQ -merge the queues associated with these processes. In more detail, -cooperating processes, and thus their associated queues, usually -start, or restart, to do I/O shortly after each other. This is the -case, e.g., for the I/O threads of KVM/QEMU and of the dump -utility. Basing on this assumption, this commit allows a bfq_queue to -be merged only during a short time interval (100ms) after it starts, -or re-starts, to do I/O. This filtering provides two important -benefits. - -First, it greatly reduces the probability that two non-cooperating -processes have their queues merged by mistake, if they just happen to -do I/O close to each other for a short time interval. These spurious -merges cause loss of service guarantees. A low-weight bfq_queue may -unjustly get more than its expected share of the throughput: if such a -low-weight queue is merged with a high-weight queue, then the I/O for -the low-weight queue is served as if the queue had a high weight. This -may damage other high-weight queues unexpectedly. For instance, -because of this issue, lxterminal occasionally took 7.5 seconds to -start, instead of 6.5 seconds, when some sequential readers and -writers did I/O in the background on a FUJITSU MHX2300BT HDD. The -reason is that the bfq_queues associated with some of the readers or -the writers were merged with the high-weight queues of some processes -that had to do some urgent but little I/O. 
The readers then exploited -the inherited high weight for all or most of their I/O, during the -start-up of terminal. The filtering introduced by this commit -eliminated any outlier caused by spurious queue merges in our start-up -time tests. - -This filtering also provides a little boost of the throughput -sustainable by BFQ: 3-4%, depending on the CPU. The reason is that, -once a bfq_queue cannot be merged any longer, this commit makes BFQ -stop updating the data needed to handle merging for the queue. - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> -Signed-off-by: Angelo Ruocco <angeloruocco90@gmail.com> ---- - block/bfq-mq-iosched.c | 64 +++++++++++++++++++++++++++++++++++++++++--------- - block/bfq-mq.h | 1 + - block/bfq-sched.c | 4 ++++ - block/bfq-sq-iosched.c | 64 +++++++++++++++++++++++++++++++++++++++++--------- - block/bfq.h | 2 ++ - 5 files changed, 113 insertions(+), 22 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index ff9776c8836a..8b17b25a3c30 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -119,6 +119,20 @@ static const int bfq_async_charge_factor = 10; - /* Default timeout values, in jiffies, approximating CFQ defaults. */ - static const int bfq_timeout = (HZ / 8); - -+/* -+ * Time limit for merging (see comments in bfq_setup_cooperator). Set -+ * to the slowest value that, in our tests, proved to be effective in -+ * removing false positives, while not causing true positives to miss -+ * queue merging. -+ * -+ * As can be deduced from the low time limit below, queue merging, if -+ * successful, happens at the very beginning of the I/O of the involved -+ * cooperating processes, as a consequence of the arrival of the very -+ * first requests from each cooperator. After that, there is very -+ * little chance to find cooperators. 
-+ */ -+static const unsigned long bfq_merge_time_limit = HZ/10; -+ - static struct kmem_cache *bfq_pool; - - /* Below this threshold (in ns), we consider thinktime immediate. */ -@@ -389,6 +403,13 @@ bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, - return bfqq; - } - -+static bool bfq_too_late_for_merging(struct bfq_queue *bfqq) -+{ -+ return bfqq->service_from_backlogged > 0 && -+ time_is_before_jiffies(bfqq->first_IO_time + -+ bfq_merge_time_limit); -+} -+ - static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) - { - struct rb_node **p, *parent; -@@ -399,6 +420,14 @@ static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) - bfqq->pos_root = NULL; - } - -+ /* -+ * bfqq cannot be merged any longer (see comments in -+ * bfq_setup_cooperator): no point in adding bfqq into the -+ * position tree. -+ */ -+ if (bfq_too_late_for_merging(bfqq)) -+ return; -+ - if (bfq_class_idle(bfqq)) - return; - if (!bfqq->next_rq) -@@ -2081,6 +2110,13 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) - static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, - struct bfq_queue *new_bfqq) - { -+ if (bfq_too_late_for_merging(new_bfqq)) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "[%s] too late for bfq%d to be merged", -+ __func__, new_bfqq->pid); -+ return false; -+ } -+ - if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) || - (bfqq->ioprio_class != new_bfqq->ioprio_class)) - return false; -@@ -2149,6 +2185,23 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, - { - struct bfq_queue *in_service_bfqq, *new_bfqq; - -+ /* -+ * Prevent bfqq from being merged if it has been created too -+ * long ago. The idea is that true cooperating processes, and -+ * thus their associated bfq_queues, are supposed to be -+ * created shortly after each other. This is the case, e.g., -+ * for KVM/QEMU and dump I/O threads. 
Basing on this -+ * assumption, the following filtering greatly reduces the -+ * probability that two non-cooperating processes, which just -+ * happen to do close I/O for some short time interval, have -+ * their queues merged by mistake. -+ */ -+ if (bfq_too_late_for_merging(bfqq)) { -+ bfq_log_bfqq(bfqd, bfqq, -+ "would have looked for coop, but too late"); -+ return NULL; -+ } -+ - if (bfqq->new_bfqq) - return bfqq->new_bfqq; - -@@ -3338,17 +3391,6 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, - */ - slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); - -- /* -- * Increase service_from_backlogged before next statement, -- * because the possible next invocation of -- * bfq_bfqq_charge_time would likely inflate -- * entity->service. In contrast, service_from_backlogged must -- * contain real service, to enable the soft real-time -- * heuristic to correctly compute the bandwidth consumed by -- * bfqq. -- */ -- bfqq->service_from_backlogged += entity->service; -- - /* - * As above explained, charge slow (typically seeky) and - * timed-out queues with the time and not the service -diff --git a/block/bfq-mq.h b/block/bfq-mq.h -index 1cb05bb853d2..a5947b203ef2 100644 ---- a/block/bfq-mq.h -+++ b/block/bfq-mq.h -@@ -337,6 +337,7 @@ struct bfq_queue { - unsigned long wr_start_at_switch_to_srt; - - unsigned long split_time; /* time of last split */ -+ unsigned long first_IO_time; /* time of first I/O for this queue */ - }; - - /** -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index 616c0692335a..9d261dd428e4 100644 ---- a/block/bfq-sched.c -+++ b/block/bfq-sched.c -@@ -939,6 +939,10 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) - struct bfq_entity *entity = &bfqq->entity; - struct bfq_service_tree *st; - -+ if (!bfqq->service_from_backlogged) -+ bfqq->first_IO_time = jiffies; -+ -+ bfqq->service_from_backlogged += served; - for_each_entity(entity) { - st = bfq_entity_service_tree(entity); - -diff --git 
a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index 3a2d764e760c..cd00a41ca35d 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -113,6 +113,20 @@ static const int bfq_async_charge_factor = 10; - /* Default timeout values, in jiffies, approximating CFQ defaults. */ - static const int bfq_timeout = (HZ / 8); - -+/* -+ * Time limit for merging (see comments in bfq_setup_cooperator). Set -+ * to the slowest value that, in our tests, proved to be effective in -+ * removing false positives, while not causing true positives to miss -+ * queue merging. -+ * -+ * As can be deduced from the low time limit below, queue merging, if -+ * successful, happens at the very beginning of the I/O of the involved -+ * cooperating processes, as a consequence of the arrival of the very -+ * first requests from each cooperator. After that, there is very -+ * little chance to find cooperators. -+ */ -+static const unsigned long bfq_merge_time_limit = HZ/10; -+ - static struct kmem_cache *bfq_pool; - - /* Below this threshold (in ns), we consider thinktime immediate. */ -@@ -351,6 +365,13 @@ bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, - return bfqq; - } - -+static bool bfq_too_late_for_merging(struct bfq_queue *bfqq) -+{ -+ return bfqq->service_from_backlogged > 0 && -+ time_is_before_jiffies(bfqq->first_IO_time + -+ bfq_merge_time_limit); -+} -+ - static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) - { - struct rb_node **p, *parent; -@@ -361,6 +382,14 @@ static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) - bfqq->pos_root = NULL; - } - -+ /* -+ * bfqq cannot be merged any longer (see comments in -+ * bfq_setup_cooperator): no point in adding bfqq into the -+ * position tree. 
-+ */ -+ if (bfq_too_late_for_merging(bfqq)) -+ return; -+ - if (bfq_class_idle(bfqq)) - return; - if (!bfqq->next_rq) -@@ -1960,6 +1989,13 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) - static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, - struct bfq_queue *new_bfqq) - { -+ if (bfq_too_late_for_merging(new_bfqq)) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "[%s] too late for bfq%d to be merged", -+ __func__, new_bfqq->pid); -+ return false; -+ } -+ - if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) || - (bfqq->ioprio_class != new_bfqq->ioprio_class)) - return false; -@@ -2028,6 +2064,23 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, - { - struct bfq_queue *in_service_bfqq, *new_bfqq; - -+ /* -+ * Prevent bfqq from being merged if it has been created too -+ * long ago. The idea is that true cooperating processes, and -+ * thus their associated bfq_queues, are supposed to be -+ * created shortly after each other. This is the case, e.g., -+ * for KVM/QEMU and dump I/O threads. Basing on this -+ * assumption, the following filtering greatly reduces the -+ * probability that two non-cooperating processes, which just -+ * happen to do close I/O for some short time interval, have -+ * their queues merged by mistake. -+ */ -+ if (bfq_too_late_for_merging(bfqq)) { -+ bfq_log_bfqq(bfqd, bfqq, -+ "would have looked for coop, but too late"); -+ return NULL; -+ } -+ - if (bfqq->new_bfqq) - return bfqq->new_bfqq; - -@@ -3226,17 +3279,6 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, - */ - slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); - -- /* -- * Increase service_from_backlogged before next statement, -- * because the possible next invocation of -- * bfq_bfqq_charge_time would likely inflate -- * entity->service. In contrast, service_from_backlogged must -- * contain real service, to enable the soft real-time -- * heuristic to correctly compute the bandwidth consumed by -- * bfqq. 
-- */ -- bfqq->service_from_backlogged += entity->service; -- - /* - * As above explained, charge slow (typically seeky) and - * timed-out queues with the time and not the service -diff --git a/block/bfq.h b/block/bfq.h -index 47cd4d5a8c32..59539adc00a5 100644 ---- a/block/bfq.h -+++ b/block/bfq.h -@@ -329,6 +329,8 @@ struct bfq_queue { - unsigned long wr_start_at_switch_to_srt; - - unsigned long split_time; /* time of last split */ -+ -+ unsigned long first_IO_time; /* time of first I/O for this queue */ - }; - - /** - -From 157f39c43ab182280634cd4f6335d0187b3741a0 Mon Sep 17 00:00:00 2001 -From: Angelo Ruocco <angeloruocco90@gmail.com> -Date: Mon, 11 Dec 2017 14:19:54 +0100 -Subject: [PATCH 07/23] block, bfq-sq, bfq-mq: remove superfluous check in - queue-merging setup - -When two or more processes do I/O in a way that their requests are -sequential with respect to one another, BFQ merges the bfq_queues associated -with the processes. This way the overall I/O pattern becomes sequential, -and thus there is a boost in throughput. -These cooperating processes usually start or restart to do I/O shortly -after each other. So, in order to avoid merging non-cooperating processes, -BFQ ensures that none of these queues has been in weight raising for too -long. - -In this respect, from commit "block, bfq-sq, bfq-mq: let a queue be merged -only shortly after being created", BFQ checks whether any queue (and not -only weight-raised ones) has been doing I/O continuously for too long to be -merged. - -This new additional check makes the first one useless: a queue doing -I/O for long enough, if being weight-raised, is also a queue in -weight raising for too long to be merged. Accordingly, this commit -removes the first check. 
- -Signed-off-by: Angelo Ruocco <angeloruocco90@gmail.com> -Signed-off-by: Paolo Valente <paolo.valente@linaro.com> ---- - block/bfq-mq-iosched.c | 53 ++++---------------------------------------------- - block/bfq-sq-iosched.c | 53 ++++---------------------------------------------- - 2 files changed, 8 insertions(+), 98 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 8b17b25a3c30..f5db8613a70f 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -2140,20 +2140,6 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, - return true; - } - --/* -- * If this function returns true, then bfqq cannot be merged. The idea -- * is that true cooperation happens very early after processes start -- * to do I/O. Usually, late cooperations are just accidental false -- * positives. In case bfqq is weight-raised, such false positives -- * would evidently degrade latency guarantees for bfqq. -- */ --static bool wr_from_too_long(struct bfq_queue *bfqq) --{ -- return bfqq->wr_coeff > 1 && -- time_is_before_jiffies(bfqq->last_wr_start_finish + -- msecs_to_jiffies(100)); --} -- - /* - * Attempt to schedule a merge of bfqq with the currently in-service - * queue or with a close queue among the scheduled queues. Return -@@ -2167,11 +2153,6 @@ static bool wr_from_too_long(struct bfq_queue *bfqq) - * to maintain. Besides, in such a critical condition as an out of memory, - * the benefits of queue merging may be little relevant, or even negligible. - * -- * Weight-raised queues can be merged only if their weight-raising -- * period has just started. In fact cooperating processes are usually -- * started together. Thus, with this filter we avoid false positives -- * that would jeopardize low-latency guarantees. 
-- * - * WARNING: queue merging may impair fairness among non-weight raised - * queues, for at least two reasons: 1) the original weight of a - * merged queue may change during the merged state, 2) even being the -@@ -2205,15 +2186,7 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, - if (bfqq->new_bfqq) - return bfqq->new_bfqq; - -- if (io_struct && wr_from_too_long(bfqq) && -- likely(bfqq != &bfqd->oom_bfqq)) -- bfq_log_bfqq(bfqd, bfqq, -- "would have looked for coop, but bfq%d wr", -- bfqq->pid); -- -- if (!io_struct || -- wr_from_too_long(bfqq) || -- unlikely(bfqq == &bfqd->oom_bfqq)) -+ if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) - return NULL; - - /* If there is only one backlogged queue, don't search. */ -@@ -2223,17 +2196,8 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, - in_service_bfqq = bfqd->in_service_queue; - - if (in_service_bfqq && in_service_bfqq != bfqq && -- wr_from_too_long(in_service_bfqq) -- && likely(in_service_bfqq == &bfqd->oom_bfqq)) -- bfq_log_bfqq(bfqd, bfqq, -- "would have tried merge with in-service-queue, but wr"); -- -- if (!in_service_bfqq || in_service_bfqq == bfqq -- || wr_from_too_long(in_service_bfqq) || -- unlikely(in_service_bfqq == &bfqd->oom_bfqq)) -- goto check_scheduled; -- -- if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && -+ likely(in_service_bfqq != &bfqd->oom_bfqq) && -+ bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && - bfqq->entity.parent == in_service_bfqq->entity.parent && - bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) { - new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); -@@ -2245,21 +2209,12 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, - * queues. The only thing we need is that the bio/request is not - * NULL, as we need it to establish whether a cooperator exists. 
- */ --check_scheduled: - new_bfqq = bfq_find_close_cooperator(bfqd, bfqq, - bfq_io_struct_pos(io_struct, request)); - - BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent); - -- if (new_bfqq && wr_from_too_long(new_bfqq) && -- likely(new_bfqq != &bfqd->oom_bfqq) && -- bfq_may_be_close_cooperator(bfqq, new_bfqq)) -- bfq_log_bfqq(bfqd, bfqq, -- "would have merged with bfq%d, but wr", -- new_bfqq->pid); -- -- if (new_bfqq && !wr_from_too_long(new_bfqq) && -- likely(new_bfqq != &bfqd->oom_bfqq) && -+ if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) && - bfq_may_be_close_cooperator(bfqq, new_bfqq)) - return bfq_setup_merge(bfqq, new_bfqq); - -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index cd00a41ca35d..d8a358e5e284 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -2019,20 +2019,6 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, - return true; - } - --/* -- * If this function returns true, then bfqq cannot be merged. The idea -- * is that true cooperation happens very early after processes start -- * to do I/O. Usually, late cooperations are just accidental false -- * positives. In case bfqq is weight-raised, such false positives -- * would evidently degrade latency guarantees for bfqq. -- */ --static bool wr_from_too_long(struct bfq_queue *bfqq) --{ -- return bfqq->wr_coeff > 1 && -- time_is_before_jiffies(bfqq->last_wr_start_finish + -- msecs_to_jiffies(100)); --} -- - /* - * Attempt to schedule a merge of bfqq with the currently in-service - * queue or with a close queue among the scheduled queues. Return -@@ -2046,11 +2032,6 @@ static bool wr_from_too_long(struct bfq_queue *bfqq) - * to maintain. Besides, in such a critical condition as an out of memory, - * the benefits of queue merging may be little relevant, or even negligible. - * -- * Weight-raised queues can be merged only if their weight-raising -- * period has just started. 
In fact cooperating processes are usually -- * started together. Thus, with this filter we avoid false positives -- * that would jeopardize low-latency guarantees. -- * - * WARNING: queue merging may impair fairness among non-weight raised - * queues, for at least two reasons: 1) the original weight of a - * merged queue may change during the merged state, 2) even being the -@@ -2084,15 +2065,7 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, - if (bfqq->new_bfqq) - return bfqq->new_bfqq; - -- if (io_struct && wr_from_too_long(bfqq) && -- likely(bfqq != &bfqd->oom_bfqq)) -- bfq_log_bfqq(bfqd, bfqq, -- "would have looked for coop, but bfq%d wr", -- bfqq->pid); -- -- if (!io_struct || -- wr_from_too_long(bfqq) || -- unlikely(bfqq == &bfqd->oom_bfqq)) -+ if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) - return NULL; - - /* If there is only one backlogged queue, don't search. */ -@@ -2102,17 +2075,8 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, - in_service_bfqq = bfqd->in_service_queue; - - if (in_service_bfqq && in_service_bfqq != bfqq && -- bfqd->in_service_bic && wr_from_too_long(in_service_bfqq) -- && likely(in_service_bfqq == &bfqd->oom_bfqq)) -- bfq_log_bfqq(bfqd, bfqq, -- "would have tried merge with in-service-queue, but wr"); -- -- if (!in_service_bfqq || in_service_bfqq == bfqq || -- !bfqd->in_service_bic || wr_from_too_long(in_service_bfqq) || -- unlikely(in_service_bfqq == &bfqd->oom_bfqq)) -- goto check_scheduled; -- -- if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && -+ likely(in_service_bfqq != &bfqd->oom_bfqq) && -+ bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && - bfqq->entity.parent == in_service_bfqq->entity.parent && - bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) { - new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); -@@ -2124,21 +2088,12 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, - * queues. 
The only thing we need is that the bio/request is not - * NULL, as we need it to establish whether a cooperator exists. - */ --check_scheduled: - new_bfqq = bfq_find_close_cooperator(bfqd, bfqq, - bfq_io_struct_pos(io_struct, request)); - - BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent); - -- if (new_bfqq && wr_from_too_long(new_bfqq) && -- likely(new_bfqq != &bfqd->oom_bfqq) && -- bfq_may_be_close_cooperator(bfqq, new_bfqq)) -- bfq_log_bfqq(bfqd, bfqq, -- "would have merged with bfq%d, but wr", -- new_bfqq->pid); -- -- if (new_bfqq && !wr_from_too_long(new_bfqq) && -- likely(new_bfqq != &bfqd->oom_bfqq) && -+ if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) && - bfq_may_be_close_cooperator(bfqq, new_bfqq)) - return bfq_setup_merge(bfqq, new_bfqq); - - -From b82eb91d87f172aba7eb5eb98e8d5e2a621adf51 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Thu, 30 Nov 2017 17:48:28 +0100 -Subject: [PATCH 08/23] block, bfq-sq, bfq-mq: increase threshold to deem I/O - as random - -If two processes do I/O close to each other, i.e., are cooperating -processes in BFQ (and CFQ'S) nomenclature, then BFQ merges their -associated bfq_queues, so as to get sequential I/O from the union of -the I/O requests of the processes, and thus reach a higher -throughput. A merged queue is then split if its I/O stops being -sequential. In this respect, BFQ deems the I/O of a bfq_queue as -(mostly) sequential only if less than 4 I/O requests are random, out -of the last 32 requests inserted into the queue. - -Unfortunately, extensive testing (with the interleaved_io benchmark of -the S suite [1], and with real applications spawning cooperating -processes) has clearly shown that, with such a low threshold, only a -rather low I/O throughput may be reached when several cooperating -processes do I/O. 
In particular, the outcome of each test run was -bimodal: if queue merging occurred and was stable during the test, -then the throughput was close to the peak rate of the storage device, -otherwise the throughput was arbitrarily low (usually around 1/10 of -the peak rate with a rotational device). The probability to get the -unlucky outcomes grew with the number of cooperating processes: it was -already significant with 5 processes, and close to one with 7 or more -processes. - -The cause of the low throughput in the unlucky runs was that the -merged queues containing the I/O of these cooperating processes were -soon split, because they contained more random I/O requests than those -tolerated by the 4/32 threshold, but -- that I/O would have however allowed the storage device to reach - peak throughput or almost peak throughput; -- in contrast, the I/O of these processes, if served individually - (from separate queues) yielded a rather low throughput. - -So we repeated our tests with increasing values of the threshold, -until we found the minimum value (19) for which we obtained maximum -throughput, reliably, with at least up to 9 cooperating -processes. Then we checked that the use of that higher threshold value -did not cause any regression for any other benchmark in the suite [1]. -This commit raises the threshold to such a higher value. 
- -[1] https://github.com/Algodev-github/S - -Signed-off-by: Angelo Ruocco <angeloruocco90@gmail.com> -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 2 +- - block/bfq-sq-iosched.c | 2 +- - 2 files changed, 2 insertions(+), 2 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index f5db8613a70f..cb5f49ddecb6 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -145,7 +145,7 @@ static struct kmem_cache *bfq_pool; - #define BFQQ_SEEK_THR (sector_t)(8 * 100) - #define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32) - #define BFQQ_CLOSE_THR (sector_t)(8 * 1024) --#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8) -+#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 19) - - /* Min number of samples required to perform peak-rate update */ - #define BFQ_RATE_MIN_SAMPLES 32 -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index d8a358e5e284..e1c6dc651be1 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -139,7 +139,7 @@ static struct kmem_cache *bfq_pool; - #define BFQQ_SEEK_THR (sector_t)(8 * 100) - #define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32) - #define BFQQ_CLOSE_THR (sector_t)(8 * 1024) --#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8) -+#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 19) - - /* Min number of samples required to perform peak-rate update */ - #define BFQ_RATE_MIN_SAMPLES 32 - -From b739dda4e4b3a1cbbc905f86f9fbb0860b068ce7 Mon Sep 17 00:00:00 2001 -From: Chiara Bruschi <bruschi.chiara@outlook.it> -Date: Mon, 11 Dec 2017 18:55:26 +0100 -Subject: [PATCH 09/23] block, bfq-sq, bfq-mq: specify usage condition of - delta_us in bfq_log_bfqq call - -Inside the function bfq_completed_request the value of a variable -called delta_us is computed as current request completion time. 
-delta_us is used inside a call to the function bfq_log_bfqq as divisor -in a division operation to compute a rate value, but no check makes -sure that delta_us has non-zero value. A divisor with value 0 leads -to a division error that could result in a kernel oops (therefore -unstable/unreliable system state) and consequently cause kernel panic -if resources are unavailable after the system fault. - -This commit fixes this call to bfq_log_bfqq specifying the condition -that allows delta_us to be safely used as divisor. - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> -Signed-off-by: Chiara Bruschi <bruschi.chiara@outlook.it> ---- - block/bfq-mq-iosched.c | 5 ++++- - block/bfq-sq-iosched.c | 5 ++++- - 2 files changed, 8 insertions(+), 2 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index cb5f49ddecb6..6ce2c0789046 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -4904,9 +4904,12 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) - bfq_log_bfqq(bfqd, bfqq, - "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu", - delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size, -+ delta_us > 0 ? - (USEC_PER_SEC* - (u64)((bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us)) -- >>BFQ_RATE_SHIFT, -+ >>BFQ_RATE_SHIFT : -+ (USEC_PER_SEC* -+ (u64)(bfqd->last_rq_max_size<<BFQ_RATE_SHIFT))>>BFQ_RATE_SHIFT, - (USEC_PER_SEC*(u64)(1UL<<(BFQ_RATE_SHIFT-10)))>>BFQ_RATE_SHIFT); - - /* -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index e1c6dc651be1..eff4c4edf5a0 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -4565,9 +4565,12 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) - - bfq_log(bfqd, "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu", - delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size, -+ delta_us > 0 ? 
(USEC_PER_SEC* - (u64)((bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us)) -- >>BFQ_RATE_SHIFT, -+ >>BFQ_RATE_SHIFT : -+ (USEC_PER_SEC* -+ (u64)(bfqd->last_rq_max_size<<BFQ_RATE_SHIFT))>>BFQ_RATE_SHIFT, - (USEC_PER_SEC*(u64)(1UL<<(BFQ_RATE_SHIFT-10)))>>BFQ_RATE_SHIFT); - - /* - -From ae4310c13eca762644734d53074d8456c85e2dec Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Tue, 19 Dec 2017 12:07:12 +0100 -Subject: [PATCH 10/23] block, bfq-mq: limit tags for writes and async I/O - -Asynchronous I/O can easily starve synchronous I/O (both sync reads -and sync writes), by consuming all request tags. Similarly, storms of -synchronous writes, such as those that sync(2) may trigger, can starve -synchronous reads. In their turn, these two problems may also cause -BFQ to lose control on latency for interactive and soft real-time -applications. For example, on a PLEXTOR PX-256M5S SSD, LibreOffice -Writer takes 0.6 seconds to start if the device is idle, but it takes -more than 45 seconds (!) if there are sequential writes in the -background. - -This commit addresses this issue by limiting the maximum percentage of -tags that asynchronous I/O requests and synchronous write requests can -consume. In particular, this commit grants a higher threshold to -synchronous writes, to prevent the latter from being starved by -asynchronous I/O. - -According to the above test, LibreOffice Writer now starts in about -1.2 seconds on average, regardless of the background workload, and -apart from some rare outlier. To check this improvement, run, e.g., -sudo ./comm_startup_lat.sh bfq-mq 5 5 seq 10 "lowriter --terminate_after_init" -for the comm_startup_lat benchmark in the S suite [1]. 
- -[1] https://github.com/Algodev-github/S - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++ - block/bfq-mq.h | 12 ++++++++ - 2 files changed, 89 insertions(+) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 6ce2c0789046..f384f5566672 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -362,6 +362,82 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd, - } - } - -+/* -+ * See the comments on bfq_limit_depth for the purpose of -+ * the depths set in the function. -+ */ -+static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt) -+{ -+ bfqd->sb_shift = bt->sb.shift; -+ -+ /* -+ * In-word depths if no bfq_queue is being weight-raised: -+ * leaving 25% of tags only for sync reads. -+ * -+ * In next formulas, right-shift the value -+ * (1U<<bfqd->sb_shift), instead of computing directly -+ * (1U<<(bfqd->sb_shift - something)), to be robust against -+ * any possible value of bfqd->sb_shift, without having to -+ * limit 'something'. -+ */ -+ /* no more than 50% of tags for async I/O */ -+ bfqd->word_depths[0][0] = max((1U<<bfqd->sb_shift)>>1, 1U); -+ /* -+ * no more than 75% of tags for sync writes (25% extra tags -+ * w.r.t. async I/O, to prevent async I/O from starving sync -+ * writes) -+ */ -+ bfqd->word_depths[0][1] = max(((1U<<bfqd->sb_shift) * 3)>>2, 1U); -+ -+ /* -+ * In-word depths in case some bfq_queue is being weight- -+ * raised: leaving ~63% of tags for sync reads. This is the -+ * highest percentage for which, in our tests, application -+ * start-up times didn't suffer from any regression due to tag -+ * shortage. 
-+ */ -+ /* no more than ~18% of tags for async I/O */ -+ bfqd->word_depths[1][0] = max(((1U<<bfqd->sb_shift) * 3)>>4, 1U); -+ /* no more than ~37% of tags for sync writes (~20% extra tags) */ -+ bfqd->word_depths[1][1] = max(((1U<<bfqd->sb_shift) * 6)>>4, 1U); -+} -+ -+/* -+ * Async I/O can easily starve sync I/O (both sync reads and sync -+ * writes), by consuming all tags. Similarly, storms of sync writes, -+ * such as those that sync(2) may trigger, can starve sync reads. -+ * Limit depths of async I/O and sync writes so as to counter both -+ * problems. -+ */ -+static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) -+{ -+ struct blk_mq_tags *tags = blk_mq_tags_from_data(data); -+ struct bfq_data *bfqd = data->q->elevator->elevator_data; -+ struct sbitmap_queue *bt; -+ -+ if (op_is_sync(op) && !op_is_write(op)) -+ return; -+ -+ if (data->flags & BLK_MQ_REQ_RESERVED) { -+ if (unlikely(!tags->nr_reserved_tags)) { -+ WARN_ON_ONCE(1); -+ return; -+ } -+ bt = &tags->breserved_tags; -+ } else -+ bt = &tags->bitmap_tags; -+ -+ if (unlikely(bfqd->sb_shift != bt->sb.shift)) -+ bfq_update_depths(bfqd, bt); -+ -+ data->shallow_depth = -+ bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)]; -+ -+ bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u", -+ __func__, bfqd->wr_busy_queues, op_is_sync(op), -+ data->shallow_depth); -+} -+ - static struct bfq_queue * - bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, - sector_t sector, struct rb_node **ret_parent, -@@ -5812,6 +5888,7 @@ static struct elv_fs_entry bfq_attrs[] = { - - static struct elevator_type iosched_bfq_mq = { - .ops.mq = { -+ .limit_depth = bfq_limit_depth, - .prepare_request = bfq_prepare_request, - .finish_request = bfq_finish_request, - .exit_icq = bfq_exit_icq, -diff --git a/block/bfq-mq.h b/block/bfq-mq.h -index a5947b203ef2..458099ee0308 100644 ---- a/block/bfq-mq.h -+++ b/block/bfq-mq.h -@@ -619,6 +619,18 @@ struct bfq_data { - struct bfq_queue *bio_bfqq; - /* 
Extra flag used only for TESTING */ - bool bio_bfqq_set; -+ -+ /* -+ * Cached sbitmap shift, used to compute depth limits in -+ * bfq_update_depths. -+ */ -+ unsigned int sb_shift; -+ -+ /* -+ * Depth limits used in bfq_limit_depth (see comments on the -+ * function) -+ */ -+ unsigned int word_depths[2][2]; - }; - - enum bfqq_state_flags { - -From 402e5f6b59662d290ab2b3c10b0016207a63ad21 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Thu, 21 Dec 2017 15:51:39 +0100 -Subject: [PATCH 11/23] bfq-sq, bfq-mq: limit sectors served with interactive - weight raising - -To maximise responsiveness, BFQ raises the weight, and performs device -idling, for bfq_queues associated with processes deemed as -interactive. In particular, weight raising has a maximum duration, -equal to the time needed to start a large application. If a -weight-raised process goes on doing I/O beyond this maximum duration, -it loses weight-raising. - -This mechanism is evidently vulnerable to the following false -positives: I/O-bound applications that will go on doing I/O for much -longer than the duration of weight-raising. These applications have -basically no benefit from being weight-raised at the beginning of -their I/O. On the opposite end, while being weight-raised, these -applications -a) unjustly steal throughput to applications that may truly need -low latency; -b) make BFQ uselessly perform device idling; device idling results -in loss of device throughput with most flash-based storage, and may -increase latencies when used purposelessly. - -This commit adds a countermeasure to reduce both the above -problems. To introduce this countermeasure, we provide the following -extra piece of information (full details in the comments added by this -commit). During the start-up of the large application used as a -reference to set the duration of weight-raising, involved processes -transfer at most ~110K sectors each. 
Accordingly, a process initially -deemed as interactive has no right to be weight-raised any longer, -once transferred 110K sectors or more. - -Basing on this consideration, this commit early-ends weight-raising -for a bfq_queue if the latter happens to have received an amount of -service at least equal to 110K sectors (actually, a little bit more, -to keep a safety margin). I/O-bound applications that reach a high -throughput, such as file copy, get to this threshold much before the -allowed weight-raising period finishes. Thus this early ending of -weight-raising reduces the amount of time during which these -applications cause the problems described above. - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 84 ++++++++++++++++++++++++++++++++++++++++++++------ - block/bfq-mq.h | 5 +++ - block/bfq-sched.c | 3 ++ - block/bfq-sq-iosched.c | 84 ++++++++++++++++++++++++++++++++++++++++++++------ - block/bfq.h | 5 +++ - 5 files changed, 163 insertions(+), 18 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index f384f5566672..63fdd16dec3c 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -162,15 +162,17 @@ static struct kmem_cache *bfq_pool; - * interactive applications automatically, using the following formula: - * duration = (R / r) * T, where r is the peak rate of the device, and - * R and T are two reference parameters. -- * In particular, R is the peak rate of the reference device (see below), -- * and T is a reference time: given the systems that are likely to be -- * installed on the reference device according to its speed class, T is -- * about the maximum time needed, under BFQ and while reading two files in -- * parallel, to load typical large applications on these systems. -- * In practice, the slower/faster the device at hand is, the more/less it -- * takes to load applications with respect to the reference device. 
-- * Accordingly, the longer/shorter BFQ grants weight raising to interactive -- * applications. -+ * In particular, R is the peak rate of the reference device (see -+ * below), and T is a reference time: given the systems that are -+ * likely to be installed on the reference device according to its -+ * speed class, T is about the maximum time needed, under BFQ and -+ * while reading two files in parallel, to load typical large -+ * applications on these systems (see the comments on -+ * max_service_from_wr below, for more details on how T is obtained). -+ * In practice, the slower/faster the device at hand is, the more/less -+ * it takes to load applications with respect to the reference device. -+ * Accordingly, the longer/shorter BFQ grants weight raising to -+ * interactive applications. - * - * BFQ uses four different reference pairs (R, T), depending on: - * . whether the device is rotational or non-rotational; -@@ -207,6 +209,60 @@ static int T_slow[2]; - static int T_fast[2]; - static int device_speed_thresh[2]; - -+/* -+ * BFQ uses the above-detailed, time-based weight-raising mechanism to -+ * privilege interactive tasks. This mechanism is vulnerable to the -+ * following false positives: I/O-bound applications that will go on -+ * doing I/O for much longer than the duration of weight -+ * raising. These applications have basically no benefit from being -+ * weight-raised at the beginning of their I/O. On the opposite end, -+ * while being weight-raised, these applications -+ * a) unjustly steal throughput to applications that may actually need -+ * low latency; -+ * b) make BFQ uselessly perform device idling; device idling results -+ * in loss of device throughput with most flash-based storage, and may -+ * increase latencies when used purposelessly. -+ * -+ * BFQ tries to reduce these problems, by adopting the following -+ * countermeasure. 
To introduce this countermeasure, we need first to -+ * finish explaining how the duration of weight-raising for -+ * interactive tasks is computed. -+ * -+ * For a bfq_queue deemed as interactive, the duration of weight -+ * raising is dynamically adjusted, as a function of the estimated -+ * peak rate of the device, so as to be equal to the time needed to -+ * execute the 'largest' interactive task we benchmarked so far. By -+ * largest task, we mean the task for which each involved process has -+ * to do more I/O than for any of the other tasks we benchmarked. This -+ * reference interactive task is the start-up of LibreOffice Writer, -+ * and in this task each process/bfq_queue needs to have at most ~110K -+ * sectors transferred. -+ * -+ * This last piece of information enables BFQ to reduce the actual -+ * duration of weight-raising for at least one class of I/O-bound -+ * applications: those doing sequential or quasi-sequential I/O. An -+ * example is file copy. In fact, once started, the main I/O-bound -+ * processes of these applications usually consume the above 110K -+ * sectors in much less time than the processes of an application that -+ * is starting, because these I/O-bound processes will greedily devote -+ * almost all their CPU cycles only to their target, -+ * throughput-friendly I/O operations. This is even more true if BFQ -+ * happens to be underestimating the device peak rate, and thus -+ * overestimating the duration of weight raising. But, according to -+ * our measurements, once transferred 110K sectors, these processes -+ * have no right to be weight-raised any longer. -+ * -+ * Basing on the last consideration, BFQ ends weight-raising for a -+ * bfq_queue if the latter happens to have received an amount of -+ * service at least equal to the following constant. The constant is -+ * set to slightly more than 110K, to have a minimum safety margin. 
-+ * -+ * This early ending of weight-raising reduces the amount of time -+ * during which interactive false positives cause the two problems -+ * described at the beginning of these comments. -+ */ -+static const unsigned long max_service_from_wr = 120000; -+ - #define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ - { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) - -@@ -1361,6 +1417,7 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, - if (old_wr_coeff == 1 && wr_or_deserves_wr) { - /* start a weight-raising period */ - if (interactive) { -+ bfqq->service_from_wr = 0; - bfqq->wr_coeff = bfqd->bfq_wr_coeff; - bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); - } else { -@@ -3980,6 +4037,15 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) - "back to interactive wr"); - } - } -+ if (bfqq->wr_coeff > 1 && -+ bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time && -+ bfqq->service_from_wr > max_service_from_wr) { -+ /* see comments on max_service_from_wr */ -+ bfq_bfqq_end_wr(bfqq); -+ bfq_log_bfqq(bfqd, bfqq, -+ "[%s] too much service", -+ __func__); -+ } - } - /* - * To improve latency (for this or other queues), immediately -diff --git a/block/bfq-mq.h b/block/bfq-mq.h -index 458099ee0308..9a5ce1168ff5 100644 ---- a/block/bfq-mq.h -+++ b/block/bfq-mq.h -@@ -331,6 +331,11 @@ struct bfq_queue { - * last transition from idle to backlogged. - */ - unsigned long service_from_backlogged; -+ /* -+ * Cumulative service received from the @bfq_queue since its -+ * last transition to weight-raised state. 
-+ */ -+ unsigned long service_from_wr; - /* - * Value of wr start time when switching to soft rt - */ -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index 9d261dd428e4..4e6c5232e2fb 100644 ---- a/block/bfq-sched.c -+++ b/block/bfq-sched.c -@@ -942,6 +942,9 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) - if (!bfqq->service_from_backlogged) - bfqq->first_IO_time = jiffies; - -+ if (bfqq->wr_coeff > 1) -+ bfqq->service_from_wr += served; -+ - bfqq->service_from_backlogged += served; - for_each_entity(entity) { - st = bfq_entity_service_tree(entity); -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index eff4c4edf5a0..486493aafaf8 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -156,15 +156,17 @@ static struct kmem_cache *bfq_pool; - * interactive applications automatically, using the following formula: - * duration = (R / r) * T, where r is the peak rate of the device, and - * R and T are two reference parameters. -- * In particular, R is the peak rate of the reference device (see below), -- * and T is a reference time: given the systems that are likely to be -- * installed on the reference device according to its speed class, T is -- * about the maximum time needed, under BFQ and while reading two files in -- * parallel, to load typical large applications on these systems. -- * In practice, the slower/faster the device at hand is, the more/less it -- * takes to load applications with respect to the reference device. -- * Accordingly, the longer/shorter BFQ grants weight raising to interactive -- * applications. 
-+ * In particular, R is the peak rate of the reference device (see -+ * below), and T is a reference time: given the systems that are -+ * likely to be installed on the reference device according to its -+ * speed class, T is about the maximum time needed, under BFQ and -+ * while reading two files in parallel, to load typical large -+ * applications on these systems (see the comments on -+ * max_service_from_wr below, for more details on how T is obtained). -+ * In practice, the slower/faster the device at hand is, the more/less -+ * it takes to load applications with respect to the reference device. -+ * Accordingly, the longer/shorter BFQ grants weight raising to -+ * interactive applications. - * - * BFQ uses four different reference pairs (R, T), depending on: - * . whether the device is rotational or non-rotational; -@@ -201,6 +203,60 @@ static int T_slow[2]; - static int T_fast[2]; - static int device_speed_thresh[2]; - -+/* -+ * BFQ uses the above-detailed, time-based weight-raising mechanism to -+ * privilege interactive tasks. This mechanism is vulnerable to the -+ * following false positives: I/O-bound applications that will go on -+ * doing I/O for much longer than the duration of weight -+ * raising. These applications have basically no benefit from being -+ * weight-raised at the beginning of their I/O. On the opposite end, -+ * while being weight-raised, these applications -+ * a) unjustly steal throughput to applications that may actually need -+ * low latency; -+ * b) make BFQ uselessly perform device idling; device idling results -+ * in loss of device throughput with most flash-based storage, and may -+ * increase latencies when used purposelessly. -+ * -+ * BFQ tries to reduce these problems, by adopting the following -+ * countermeasure. To introduce this countermeasure, we need first to -+ * finish explaining how the duration of weight-raising for -+ * interactive tasks is computed. 
-+ * -+ * For a bfq_queue deemed as interactive, the duration of weight -+ * raising is dynamically adjusted, as a function of the estimated -+ * peak rate of the device, so as to be equal to the time needed to -+ * execute the 'largest' interactive task we benchmarked so far. By -+ * largest task, we mean the task for which each involved process has -+ * to do more I/O than for any of the other tasks we benchmarked. This -+ * reference interactive task is the start-up of LibreOffice Writer, -+ * and in this task each process/bfq_queue needs to have at most ~110K -+ * sectors transfered. -+ * -+ * This last piece of information enables BFQ to reduce the actual -+ * duration of weight-raising for at least one class of I/O-bound -+ * applications: those doing sequential or quasi-sequential I/O. An -+ * example is file copy. In fact, once started, the main I/O-bound -+ * processes of these applications usually consume the above 110K -+ * sectors in much less time than the processes of an application that -+ * is starting, because these I/O-bound processes will greedily devote -+ * almost all their CPU cycles only to their target, -+ * throughput-friendly I/O operations. This is even more true if BFQ -+ * happens to be underestimating the device peak rate, and thus -+ * overestimating the duration of weight raising. But, according to -+ * our measurements, once transferred 110K sectors, these processes -+ * have no right to be weight-raised any longer. -+ * -+ * Basing on the last consideration, BFQ ends weight-raising for a -+ * bfq_queue if the latter happens to have received an amount of -+ * service at least equal to the following constant. The constant is -+ * set to slightly more than 110K, to have a minimum safety margin. -+ * -+ * This early ending of weight-raising reduces the amount of time -+ * during which interactive false positives cause the two problems -+ * described at the beginning of these comments. 
-+ */ -+static const unsigned long max_service_from_wr = 120000; -+ - #define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ - { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) - -@@ -1246,6 +1302,7 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, - if (old_wr_coeff == 1 && wr_or_deserves_wr) { - /* start a weight-raising period */ - if (interactive) { -+ bfqq->service_from_wr = 0; - bfqq->wr_coeff = bfqd->bfq_wr_coeff; - bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); - } else { -@@ -3794,6 +3851,15 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) - "back to interactive wr"); - } - } -+ if (bfqq->wr_coeff > 1 && -+ bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time && -+ bfqq->service_from_wr > max_service_from_wr) { -+ /* see comments on max_service_from_wr */ -+ bfq_bfqq_end_wr(bfqq); -+ bfq_log_bfqq(bfqd, bfqq, -+ "[%s] too much service", -+ __func__); -+ } - } - /* - * To improve latency (for this or other queues), immediately -diff --git a/block/bfq.h b/block/bfq.h -index 59539adc00a5..0cd7a3f251a7 100644 ---- a/block/bfq.h -+++ b/block/bfq.h -@@ -323,6 +323,11 @@ struct bfq_queue { - * last transition from idle to backlogged. - */ - unsigned long service_from_backlogged; -+ /* -+ * Cumulative service received from the @bfq_queue since its -+ * last transition to weight-raised state. -+ */ -+ unsigned long service_from_wr; - /* - * Value of wr start time when switching to soft rt - */ - -From 59efebb94b2f9bac653faf62dadb45b83bd27fa7 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Thu, 4 Jan 2018 16:29:58 +0100 -Subject: [PATCH 12/23] bfq-sq, bfq-mq: put async queues for root bfq groups - too -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -For each pair [device for which bfq is selected as I/O scheduler, -group in blkio/io], bfq maintains a corresponding bfq group. 
Each such -bfq group contains a set of async queues, with each async queue -created on demand, i.e., when some I/O request arrives for it. On -creation, an async queue gets an extra reference, to make sure that -the queue is not freed as long as its bfq group exists. Accordingly, -to allow the queue to be freed after the group exited, this extra -reference must released on group exit. - -The above holds also for a bfq root group, i.e., for the bfq group -corresponding to the root blkio/io root for a given device. Yet, by -mistake, the references to the existing async queues of a root group -are not released when the latter exits. This causes a memory leak when -the instance of bfq for a given device exits. In a similar vein, -bfqg_stats_xfer_dead is not executed for a root group. - -This commit fixes bfq_pd_offline so that the latter executes the above -missing operations for a root group too. - -Reported-by: Holger Hoffstätte <holger@applied-asynchrony.com> -Reported-by: Guoqing Jiang <gqjiang@suse.com> -Signed-off-by: Davide Ferrari <davideferrari8@gmail.com> -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-cgroup-included.c | 8 +++++--- - 1 file changed, 5 insertions(+), 3 deletions(-) - -diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c -index 562b0ce581a7..45fefb2e2d57 100644 ---- a/block/bfq-cgroup-included.c -+++ b/block/bfq-cgroup-included.c -@@ -885,13 +885,13 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) - - entity = bfqg->my_entity; - -- if (!entity) /* root group */ -- return; -- - #ifdef BFQ_MQ - spin_lock_irqsave(&bfqd->lock, flags); - #endif - -+ if (!entity) /* root group */ -+ goto put_async_queues; -+ - /* - * Empty all service_trees belonging to this group before - * deactivating the group itself. 
-@@ -926,6 +926,8 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) - BUG_ON(bfqg->sched_data.in_service_entity); - - __bfq_deactivate_entity(entity, false); -+ -+put_async_queues: - bfq_put_async_queues(bfqd, bfqg); - - #ifdef BFQ_MQ - -From 2dfbaaaf95054e2da3ededc0deb1ba5a4f589e53 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Mon, 8 Jan 2018 19:38:45 +0100 -Subject: [PATCH 13/23] bfq-sq, bfq-mq: release oom-queue ref to root group on - exit - -On scheduler init, a reference to the root group, and a reference to -its corresponding blkg are taken for the oom queue. Yet these -references are not released on scheduler exit, which prevents these -objects from be freed. This commit adds the missing reference -releases. - -Reported-by: Davide Ferrari <davideferrari8@gmail.com> -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 3 +++ - block/bfq-sq-iosched.c | 3 +++ - 2 files changed, 6 insertions(+) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 63fdd16dec3c..b82c52fabf91 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -5507,6 +5507,9 @@ static void bfq_exit_queue(struct elevator_queue *e) - - BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); - -+ /* release oom-queue reference to root group */ -+ bfqg_and_blkg_put(bfqd->root_group); -+ - #ifdef BFQ_GROUP_IOSCHED_ENABLED - blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq); - #else -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index 486493aafaf8..851af055664d 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -5052,6 +5052,9 @@ static void bfq_exit_queue(struct elevator_queue *e) - - BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); - -+ /* release oom-queue reference to root group */ -+ bfqg_put(bfqd->root_group); -+ - #ifdef BFQ_GROUP_IOSCHED_ENABLED - blkcg_deactivate_policy(q, &blkcg_policy_bfq); - #else - -From 
13efe00c8292d78d223e1090a7f36426e360eb38 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Mon, 8 Jan 2018 19:40:38 +0100 -Subject: [PATCH 14/23] block, bfq-sq, bfq-mq: trace get and put of bfq groups - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-cgroup-included.c | 15 +++++++++++++++ - block/bfq-mq-iosched.c | 3 ++- - 2 files changed, 17 insertions(+), 1 deletion(-) - -diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c -index 45fefb2e2d57..f94743fb2e7d 100644 ---- a/block/bfq-cgroup-included.c -+++ b/block/bfq-cgroup-included.c -@@ -267,6 +267,8 @@ static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) - - static void bfqg_get(struct bfq_group *bfqg) - { -+ trace_printk("bfqg %p\n", bfqg); -+ - #ifdef BFQ_MQ - bfqg->ref++; - #else -@@ -280,6 +282,9 @@ static void bfqg_put(struct bfq_group *bfqg) - bfqg->ref--; - - BUG_ON(bfqg->ref < 0); -+ trace_printk("putting bfqg %p %s\n", bfqg, -+ bfqg->ref == 0 ? "and freeing it" : ""); -+ - if (bfqg->ref == 0) - kfree(bfqg); - #else -@@ -293,6 +298,7 @@ static void bfqg_and_blkg_get(struct bfq_group *bfqg) - /* see comments in bfq_bic_update_cgroup for why refcounting bfqg */ - bfqg_get(bfqg); - -+ trace_printk("getting blkg for bfqg %p\n", bfqg); - blkg_get(bfqg_to_blkg(bfqg)); - } - -@@ -300,6 +306,7 @@ static void bfqg_and_blkg_put(struct bfq_group *bfqg) - { - bfqg_put(bfqg); - -+ trace_printk("putting blkg for bfqg %p\n", bfqg); - blkg_put(bfqg_to_blkg(bfqg)); - } - #endif -@@ -382,6 +389,8 @@ static void bfq_init_entity(struct bfq_entity *entity, - * Make sure that bfqg and its associated blkg do not - * disappear before entity. 
- */ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "[%s] getting bfqg %p and blkg\n", __func__, bfqg); -+ - bfqg_and_blkg_get(bfqg); - #else - bfqg_get(bfqg); -@@ -475,6 +484,7 @@ static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) - kfree(bfqg); - return NULL; - } -+ trace_printk("bfqg %p\n", bfqg); - - #ifdef BFQ_MQ - /* see comments in bfq_bic_update_cgroup for why refcounting */ -@@ -513,6 +523,7 @@ static void bfq_pd_init(struct blkg_policy_data *pd) - static void bfq_pd_free(struct blkg_policy_data *pd) - { - struct bfq_group *bfqg = pd_to_bfqg(pd); -+ trace_printk("bfqg %p\n", bfqg); - - bfqg_stats_exit(&bfqg->stats); - #ifdef BFQ_MQ -@@ -650,6 +661,8 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); - } - #ifdef BFQ_MQ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "[%s] putting blkg and bfqg %p\n", __func__, bfqg); -+ - bfqg_and_blkg_put(bfqq_group(bfqq)); - #else - bfqg_put(bfqq_group(bfqq)); -@@ -658,6 +671,8 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, - entity->parent = bfqg->my_entity; - entity->sched_data = &bfqg->sched_data; - #ifdef BFQ_MQ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "[%s] getting blkg and bfqg %p\n", __func__, bfqg); -+ - /* pin down bfqg and its associated blkg */ - bfqg_and_blkg_get(bfqg); - #else -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index b82c52fabf91..d5b7a6b985d7 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -4385,10 +4385,11 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - if (bfqq->bfqd) - bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); - -- kmem_cache_free(bfq_pool, bfqq); - #ifdef BFQ_GROUP_IOSCHED_ENABLED -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "[%s] putting blkg and bfqg %p\n", __func__, bfqg); - bfqg_and_blkg_put(bfqg); - #endif -+ kmem_cache_free(bfq_pool, bfqq); - } - - static void bfq_put_cooperator(struct bfq_queue *bfqq) - -From 
816b77fba966171974eb5ee25d81bc4e19eaf1b4 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Wed, 10 Jan 2018 09:08:22 +0100 -Subject: [PATCH 15/23] bfq-sq, bfq-mq: compile group put for oom queue only if - BFQ_GROUP_IOSCHED is set - -Commit ("bfq-sq, bfq-mq: release oom-queue ref to root group on exit") -added a missing put of the root bfq group for the oom queue. That put -has to be, and can be, performed only if CONFIG_BFQ_GROUP_IOSCHED is -defined: the function doing the put is even not defined at all if -CONFIG_BFQ_GROUP_IOSCHED is not defined. But that commit makes that -put be invoked regardless of whether CONFIG_BFQ_GROUP_IOSCHED is -defined. This commit fixes this mistake, by making that invocation be -compiled only if CONFIG_BFQ_GROUP_IOSCHED is actually defined. - -Fixes ("block, bfq: release oom-queue ref to root group on exit") -Reported-by: Jan Alexander Steffens <jan.steffens@gmail.com> -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 2 +- - block/bfq-sq-iosched.c | 2 +- - 2 files changed, 2 insertions(+), 2 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index d5b7a6b985d7..2581fe0f6f2f 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -5508,10 +5508,10 @@ static void bfq_exit_queue(struct elevator_queue *e) - - BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); - -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - /* release oom-queue reference to root group */ - bfqg_and_blkg_put(bfqd->root_group); - --#ifdef BFQ_GROUP_IOSCHED_ENABLED - blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq); - #else - spin_lock_irq(&bfqd->lock); -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index 851af055664d..c4df156b1fb4 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -5052,10 +5052,10 @@ static void bfq_exit_queue(struct elevator_queue *e) - - BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); - -+#ifdef 
BFQ_GROUP_IOSCHED_ENABLED - /* release oom-queue reference to root group */ - bfqg_put(bfqd->root_group); - --#ifdef BFQ_GROUP_IOSCHED_ENABLED - blkcg_deactivate_policy(q, &blkcg_policy_bfq); - #else - bfq_put_async_queues(bfqd, bfqd->root_group); - -From 643a89c659172b2c9ae16adfe03af4e3e88e1326 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Sat, 13 Jan 2018 18:48:41 +0100 -Subject: [PATCH 16/23] block, bfq-sq, bfq-mq: remove trace_printks - -Commit ("block, bfq-sq, bfq-mq: trace get and put of bfq groups") -unwisely added some invocations of the function trace_printk, which -is inappropriate in production kernels. This commit removes those -invocations. - -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-cgroup-included.c | 10 ---------- - 1 file changed, 10 deletions(-) - -diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c -index f94743fb2e7d..a4f8a03edfc9 100644 ---- a/block/bfq-cgroup-included.c -+++ b/block/bfq-cgroup-included.c -@@ -267,8 +267,6 @@ static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) - - static void bfqg_get(struct bfq_group *bfqg) - { -- trace_printk("bfqg %p\n", bfqg); -- - #ifdef BFQ_MQ - bfqg->ref++; - #else -@@ -282,9 +280,6 @@ static void bfqg_put(struct bfq_group *bfqg) - bfqg->ref--; - - BUG_ON(bfqg->ref < 0); -- trace_printk("putting bfqg %p %s\n", bfqg, -- bfqg->ref == 0 ? 
"and freeing it" : ""); -- - if (bfqg->ref == 0) - kfree(bfqg); - #else -@@ -298,7 +293,6 @@ static void bfqg_and_blkg_get(struct bfq_group *bfqg) - /* see comments in bfq_bic_update_cgroup for why refcounting bfqg */ - bfqg_get(bfqg); - -- trace_printk("getting blkg for bfqg %p\n", bfqg); - blkg_get(bfqg_to_blkg(bfqg)); - } - -@@ -306,7 +300,6 @@ static void bfqg_and_blkg_put(struct bfq_group *bfqg) - { - bfqg_put(bfqg); - -- trace_printk("putting blkg for bfqg %p\n", bfqg); - blkg_put(bfqg_to_blkg(bfqg)); - } - #endif -@@ -484,8 +477,6 @@ static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) - kfree(bfqg); - return NULL; - } -- trace_printk("bfqg %p\n", bfqg); -- - #ifdef BFQ_MQ - /* see comments in bfq_bic_update_cgroup for why refcounting */ - bfqg_get(bfqg); -@@ -523,7 +514,6 @@ static void bfq_pd_init(struct blkg_policy_data *pd) - static void bfq_pd_free(struct blkg_policy_data *pd) - { - struct bfq_group *bfqg = pd_to_bfqg(pd); -- trace_printk("bfqg %p\n", bfqg); - - bfqg_stats_exit(&bfqg->stats); - #ifdef BFQ_MQ - -From ce050275e24fecec800f346c09d9494563e9fc8a Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Mon, 15 Jan 2018 15:07:05 +0100 -Subject: [PATCH 17/23] block, bfq-mq: add requeue-request hook - -Commit 'a6a252e64914 ("blk-mq-sched: decide how to handle flush rq via -RQF_FLUSH_SEQ")' makes all non-flush re-prepared requests for a device -be re-inserted into the active I/O scheduler for that device. As a -consequence, I/O schedulers may get the same request inserted again, -even several times, without a finish_request invoked on that request -before each re-insertion. - -This fact is the cause of the failure reported in [1]. For an I/O -scheduler, every re-insertion of the same re-prepared request is -equivalent to the insertion of a new request. For schedulers like -mq-deadline or kyber, this fact causes no harm. 
In contrast, it -confuses a stateful scheduler like BFQ, which keeps state for an I/O -request, until the finish_request hook is invoked on the request. In -particular, BFQ may get stuck, waiting forever for the number of -request dispatches, of the same request, to be balanced by an equal -number of request completions (while there will be one completion for -that request). In this state, BFQ may refuse to serve I/O requests -from other bfq_queues. The hang reported in [1] then follows. - -However, the above re-prepared requests undergo a requeue, thus the -requeue_request hook of the active elevator is invoked for these -requests, if set. This commit then addresses the above issue by -properly implementing the hook requeue_request in BFQ. - -[1] https://marc.info/?l=linux-block&m=151211117608676 - -Reported-by: Ivan Kozik <ivan@ludios.org> -Reported-by: Alban Browaeys <alban.browaeys@gmail.com> -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> -Signed-off-by: Serena Ziviani <ziviani.serena@gmail.com> ---- - block/bfq-mq-iosched.c | 90 ++++++++++++++++++++++++++++++++++++++++---------- - 1 file changed, 73 insertions(+), 17 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 2581fe0f6f2f..bb7ccc2f1165 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -4162,9 +4162,9 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - * TESTING: reset DISP_LIST flag, because: 1) - * this rq this request has passed through - * bfq_prepare_request, 2) then it will have -- * bfq_finish_request invoked on it, and 3) in -- * bfq_finish_request we use this flag to check -- * that bfq_finish_request is not invoked on -+ * bfq_finish_requeue_request invoked on it, and 3) in -+ * bfq_finish_requeue_request we use this flag to check -+ * that bfq_finish_requeue_request is not invoked on - * requests for which bfq_prepare_request has - * been invoked. 
- */ -@@ -4173,8 +4173,8 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - } - - /* -- * We exploit the bfq_finish_request hook to decrement -- * rq_in_driver, but bfq_finish_request will not be -+ * We exploit the bfq_finish_requeue_request hook to decrement -+ * rq_in_driver, but bfq_finish_requeue_request will not be - * invoked on this request. So, to avoid unbalance, - * just start this request, without incrementing - * rq_in_driver. As a negative consequence, -@@ -4183,10 +4183,10 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - * bfq_schedule_dispatch to be invoked uselessly. - * - * As for implementing an exact solution, the -- * bfq_finish_request hook, if defined, is probably -+ * bfq_finish_requeue_request hook, if defined, is probably - * invoked also on this request. So, by exploiting - * this hook, we could 1) increment rq_in_driver here, -- * and 2) decrement it in bfq_finish_request. Such a -+ * and 2) decrement it in bfq_finish_requeue_request. Such a - * solution would let the value of the counter be - * always accurate, but it would entail using an extra - * interface function. This cost seems higher than the -@@ -4878,6 +4878,8 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) - return idle_timer_disabled; - } - -+static void bfq_prepare_request(struct request *rq, struct bio *bio); -+ - static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - bool at_head) - { -@@ -4919,6 +4921,20 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - BUG_ON(!(rq->rq_flags & RQF_GOT)); - rq->rq_flags &= ~RQF_GOT; - -+ if (!bfqq) { -+ /* -+ * This should never happen. Most likely rq is -+ * a requeued regular request, being -+ * re-inserted without being first -+ * re-prepared. Do a prepare, to avoid -+ * failure. 
-+ */ -+ pr_warn("Regular request associated with no queue"); -+ WARN_ON(1); -+ bfq_prepare_request(rq, rq->bio); -+ bfqq = RQ_BFQQ(rq); -+ } -+ - #if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) - idle_timer_disabled = __bfq_insert_request(bfqd, rq); - /* -@@ -5110,7 +5126,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) - } - } - --static void bfq_finish_request_body(struct bfq_queue *bfqq) -+static void bfq_finish_requeue_request_body(struct bfq_queue *bfqq) - { - bfq_log_bfqq(bfqq->bfqd, bfqq, - "put_request_body: allocated %d", bfqq->allocated); -@@ -5120,7 +5136,13 @@ static void bfq_finish_request_body(struct bfq_queue *bfqq) - bfq_put_queue(bfqq); - } - --static void bfq_finish_request(struct request *rq) -+/* -+ * Handle either a requeue or a finish for rq. The things to do are -+ * the same in both cases: all references to rq are to be dropped. In -+ * particular, rq is considered completed from the point of view of -+ * the scheduler. -+ */ -+static void bfq_finish_requeue_request(struct request *rq) - { - struct bfq_queue *bfqq; - struct bfq_data *bfqd; -@@ -5128,11 +5150,27 @@ static void bfq_finish_request(struct request *rq) - - BUG_ON(!rq); - -- if (!rq->elv.icq) -+ bfqq = RQ_BFQQ(rq); -+ -+ /* -+ * Requeue and finish hooks are invoked in blk-mq without -+ * checking whether the involved request is actually still -+ * referenced in the scheduler. To handle this fact, the -+ * following two checks make this function exit in case of -+ * spurious invocations, for which there is nothing to do. -+ * -+ * First, check whether rq has nothing to do with an elevator. -+ */ -+ if (unlikely(!(rq->rq_flags & RQF_ELVPRIV))) - return; - -- bfqq = RQ_BFQQ(rq); -- BUG_ON(!bfqq); -+ /* -+ * rq either is not associated with any icq, or is an already -+ * requeued request that has not (yet) been re-inserted into -+ * a bfq_queue. 
-+ */ -+ if (!rq->elv.icq || !bfqq) -+ return; - - bic = RQ_BIC(rq); - BUG_ON(!bic); -@@ -5145,7 +5183,6 @@ static void bfq_finish_request(struct request *rq) - BUG(); - } - BUG_ON(rq->rq_flags & RQF_QUEUED); -- BUG_ON(!(rq->rq_flags & RQF_ELVPRIV)); - - bfq_log_bfqq(bfqd, bfqq, - "putting rq %p with %u sects left, STARTED %d", -@@ -5166,13 +5203,14 @@ static void bfq_finish_request(struct request *rq) - spin_lock_irqsave(&bfqd->lock, flags); - - bfq_completed_request(bfqq, bfqd); -- bfq_finish_request_body(bfqq); -+ bfq_finish_requeue_request_body(bfqq); - - spin_unlock_irqrestore(&bfqd->lock, flags); - } else { - /* - * Request rq may be still/already in the scheduler, -- * in which case we need to remove it. And we cannot -+ * in which case we need to remove it (this should -+ * never happen in case of requeue). And we cannot - * defer such a check and removal, to avoid - * inconsistencies in the time interval from the end - * of this function to the start of the deferred work. -@@ -5189,9 +5227,26 @@ static void bfq_finish_request(struct request *rq) - bfqg_stats_update_io_remove(bfqq_group(bfqq), - rq->cmd_flags); - } -- bfq_finish_request_body(bfqq); -+ bfq_finish_requeue_request_body(bfqq); - } - -+ /* -+ * Reset private fields. In case of a requeue, this allows -+ * this function to correctly do nothing if it is spuriously -+ * invoked again on this same request (see the check at the -+ * beginning of the function). Probably, a better general -+ * design would be to prevent blk-mq from invoking the requeue -+ * or finish hooks of an elevator, for a request that is not -+ * referred by that elevator. -+ * -+ * Resetting the following fields would break the -+ * request-insertion logic if rq is re-inserted into a bfq -+ * internal queue, without a re-preparation. 
Here we assume -+ * that re-insertions of requeued requests, without -+ * re-preparation, can happen only for pass_through or at_head -+ * requests (which are not re-inserted into bfq internal -+ * queues). -+ */ - rq->elv.priv[0] = NULL; - rq->elv.priv[1] = NULL; - } -@@ -5960,7 +6015,8 @@ static struct elevator_type iosched_bfq_mq = { - .ops.mq = { - .limit_depth = bfq_limit_depth, - .prepare_request = bfq_prepare_request, -- .finish_request = bfq_finish_request, -+ .requeue_request = bfq_finish_requeue_request, -+ .finish_request = bfq_finish_requeue_request, - .exit_icq = bfq_exit_icq, - .insert_requests = bfq_insert_requests, - .dispatch_request = bfq_dispatch_request, - -From 3e4f292191cc62b3844316b9741534c3f1b36f0a Mon Sep 17 00:00:00 2001 -From: Davide Paganelli <paga.david@gmail.com> -Date: Thu, 8 Feb 2018 12:19:24 +0100 -Subject: [PATCH 18/23] block, bfq-mq, bfq-sq: make log functions print names - of calling functions - -Add the macro __func__ as a parameter to the invocations of the functions -pr_crit, blk_add_trace_msg and blk_add_cgroup_trace_msg in bfq_log* -functions, in order to include automatically in the log messages -the names of the functions that call the log functions. -The programmer can then avoid doing it. 
- -Signed-off-by: Davide Paganelli <paga.david@gmail.com> -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-cgroup-included.c | 9 +-- - block/bfq-mq-iosched.c | 167 ++++++++++++++++++++++---------------------- - block/bfq-mq.h | 33 ++++----- - block/bfq-sched.c | 54 +++++++------- - block/bfq-sq-iosched.c | 134 +++++++++++++++++------------------ - block/bfq.h | 33 ++++----- - 6 files changed, 214 insertions(+), 216 deletions(-) - -diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c -index a4f8a03edfc9..613f154e9da5 100644 ---- a/block/bfq-cgroup-included.c -+++ b/block/bfq-cgroup-included.c -@@ -382,7 +382,8 @@ static void bfq_init_entity(struct bfq_entity *entity, - * Make sure that bfqg and its associated blkg do not - * disappear before entity. - */ -- bfq_log_bfqq(bfqq->bfqd, bfqq, "[%s] getting bfqg %p and blkg\n", __func__, bfqg); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "getting bfqg %p and blkg\n", -+ bfqg); - - bfqg_and_blkg_get(bfqg); - #else -@@ -651,7 +652,7 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); - } - #ifdef BFQ_MQ -- bfq_log_bfqq(bfqq->bfqd, bfqq, "[%s] putting blkg and bfqg %p\n", __func__, bfqg); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "putting blkg and bfqg %p\n", bfqg); - - bfqg_and_blkg_put(bfqq_group(bfqq)); - #else -@@ -661,7 +662,7 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, - entity->parent = bfqg->my_entity; - entity->sched_data = &bfqg->sched_data; - #ifdef BFQ_MQ -- bfq_log_bfqq(bfqq->bfqd, bfqq, "[%s] getting blkg and bfqg %p\n", __func__, bfqg); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "getting blkg and bfqg %p\n", bfqg); - - /* pin down bfqg and its associated blkg */ - bfqg_and_blkg_get(bfqg); -@@ -721,7 +722,7 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, - if (entity->sched_data != &bfqg->sched_data) { - bic_set_bfqq(bic, NULL, 0); - 
bfq_log_bfqq(bfqd, async_bfqq, -- "bic_change_group: %p %d", -+ "%p %d", - async_bfqq, - async_bfqq->ref); - bfq_put_queue(async_bfqq); -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index bb7ccc2f1165..edc93b6af186 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -310,7 +310,7 @@ static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, - static void bfq_schedule_dispatch(struct bfq_data *bfqd) - { - if (bfqd->queued != 0) { -- bfq_log(bfqd, "schedule dispatch"); -+ bfq_log(bfqd, ""); - blk_mq_run_hw_queues(bfqd->queue, true); - } - } -@@ -489,8 +489,8 @@ static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) - data->shallow_depth = - bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)]; - -- bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u", -- __func__, bfqd->wr_busy_queues, op_is_sync(op), -+ bfq_log(bfqd, "wr_busy %d sync %d depth %u", -+ bfqd->wr_busy_queues, op_is_sync(op), - data->shallow_depth); - } - -@@ -528,7 +528,7 @@ bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, - if (rb_link) - *rb_link = p; - -- bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d", -+ bfq_log(bfqd, "%llu: returning %d", - (unsigned long long) sector, - bfqq ? 
bfqq->pid : 0); - -@@ -749,7 +749,7 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq, - if (rq == last || ktime_get_ns() < rq->fifo_time) - return NULL; - -- bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "returned %p", rq); - BUG_ON(RB_EMPTY_NODE(&rq->rb_node)); - return rq; - } -@@ -842,7 +842,7 @@ static void bfq_updated_next_req(struct bfq_data *bfqd, - bfq_serv_to_charge(next_rq, bfqq)); - if (entity->budget != new_budget) { - entity->budget = new_budget; -- bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", -+ bfq_log_bfqq(bfqd, bfqq, "new budget %lu", - new_budget); - bfq_requeue_bfqq(bfqd, bfqq, false); - } -@@ -915,8 +915,7 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, - BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); - - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "[%s] bic %p wr_coeff %d start_finish %lu max_time %lu", -- __func__, -+ "bic %p wr_coeff %d start_finish %lu max_time %lu", - bic, bfqq->wr_coeff, bfqq->last_wr_start_finish, - bfqq->wr_cur_max_time); - -@@ -929,11 +928,11 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, - bfq_wr_duration(bfqd))) { - switch_back_to_interactive_wr(bfqq, bfqd); - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "resume state: switching back to interactive"); -+ "switching back to interactive"); - } else { - bfqq->wr_coeff = 1; - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "resume state: switching off wr (%lu + %lu < %lu)", -+ "switching off wr (%lu + %lu < %lu)", - bfqq->last_wr_start_finish, bfqq->wr_cur_max_time, - jiffies); - } -@@ -985,7 +984,7 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) - /* Increment burst size to take into account also bfqq */ - bfqd->burst_size++; - -- bfq_log_bfqq(bfqd, bfqq, "add_to_burst %d", bfqd->burst_size); -+ bfq_log_bfqq(bfqd, bfqq, "%d", bfqd->burst_size); - - BUG_ON(bfqd->burst_size > bfqd->bfq_large_burst_thresh); - -@@ -998,7 +997,7 @@ static 
void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) - * other to consider this burst as large. - */ - bfqd->large_burst = true; -- bfq_log_bfqq(bfqd, bfqq, "add_to_burst: large burst started"); -+ bfq_log_bfqq(bfqd, bfqq, "large burst started"); - - /* - * We can now mark all queues in the burst list as -@@ -1170,7 +1169,7 @@ static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) - bfqd->large_burst = false; - bfq_reset_burst_list(bfqd, bfqq); - bfq_log_bfqq(bfqd, bfqq, -- "handle_burst: late activation or different group"); -+ "late activation or different group"); - goto end; - } - -@@ -1180,7 +1179,7 @@ static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) - * bfqq as belonging to this large burst immediately. - */ - if (bfqd->large_burst) { -- bfq_log_bfqq(bfqd, bfqq, "handle_burst: marked in burst"); -+ bfq_log_bfqq(bfqd, bfqq, "marked in burst"); - bfq_mark_bfqq_in_large_burst(bfqq); - goto end; - } -@@ -1686,7 +1685,7 @@ static void bfq_add_request(struct request *rq) - unsigned int old_wr_coeff = bfqq->wr_coeff; - bool interactive = false; - -- bfq_log_bfqq(bfqd, bfqq, "add_request: size %u %s", -+ bfq_log_bfqq(bfqd, bfqq, "size %u %s", - blk_rq_sectors(rq), rq_is_sync(rq) ? 
"S" : "A"); - - if (bfqq->wr_coeff > 1) /* queue is being weight-raised */ -@@ -1952,7 +1951,7 @@ static int bfq_request_merge(struct request_queue *q, struct request **req, - __rq = bfq_find_rq_fmerge(bfqd, bio, q); - if (__rq && elv_bio_merge_ok(__rq, bio)) { - *req = __rq; -- bfq_log(bfqd, "request_merge: req %p", __rq); -+ bfq_log(bfqd, "req %p", __rq); - - return ELEVATOR_FRONT_MERGE; - } -@@ -1989,7 +1988,7 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, - bfqq->next_rq = next_rq; - - bfq_log_bfqq(bfqd, bfqq, -- "request_merged: req %p prev %p next_rq %p bfqq %p", -+ "req %p prev %p next_rq %p bfqq %p", - req, prev, next_rq, bfqq); - - /* -@@ -2018,7 +2017,7 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, - goto end; - - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "requests_merged: rq %p next %p bfqq %p next_bfqq %p", -+ "rq %p next %p bfqq %p next_bfqq %p", - rq, next, bfqq, next_bfqq); - - spin_lock_irq(&bfqq->bfqd->lock); -@@ -2069,10 +2068,10 @@ static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) - */ - bfqq->entity.prio_changed = 1; - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "end_wr: wrais ending at %lu, rais_max_time %u", -+ "wrais ending at %lu, rais_max_time %u", - bfqq->last_wr_start_finish, - jiffies_to_msecs(bfqq->wr_cur_max_time)); -- bfq_log_bfqq(bfqq->bfqd, bfqq, "end_wr: wr_busy %d", -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "wr_busy %d", - bfqq->bfqd->wr_busy_queues); - } - -@@ -2245,8 +2244,8 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, - { - if (bfq_too_late_for_merging(new_bfqq)) { - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "[%s] too late for bfq%d to be merged", -- __func__, new_bfqq->pid); -+ "too late for bfq%d to be merged", -+ new_bfqq->pid); - return false; - } - -@@ -2395,8 +2394,7 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) - } - BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "[%s] bic %p wr_coeff %d start_finish %lu 
max_time %lu", -- __func__, -+ "bic %p wr_coeff %d start_finish %lu max_time %lu", - bic, bfqq->wr_coeff, bfqq->last_wr_start_finish, - bfqq->wr_cur_max_time); - } -@@ -2453,7 +2451,7 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, - - } - -- bfq_log_bfqq(bfqd, new_bfqq, "merge_bfqqs: wr_busy %d", -+ bfq_log_bfqq(bfqd, new_bfqq, "wr_busy %d", - bfqd->wr_busy_queues); - - /* -@@ -2554,7 +2552,7 @@ static void bfq_set_budget_timeout(struct bfq_data *bfqd, - bfqq->budget_timeout = jiffies + - bfqd->bfq_timeout * timeout_coeff; - -- bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", -+ bfq_log_bfqq(bfqd, bfqq, "%u", - jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff)); - } - -@@ -2620,10 +2618,10 @@ static void __bfq_set_in_service_queue(struct bfq_data *bfqd, - - bfq_set_budget_timeout(bfqd, bfqq); - bfq_log_bfqq(bfqd, bfqq, -- "set_in_service_queue, cur-budget = %d", -+ "cur-budget = %d", - bfqq->entity.budget); - } else -- bfq_log(bfqd, "set_in_service_queue: NULL"); -+ bfq_log(bfqd, "NULL"); - - bfqd->in_service_queue = bfqq; - } -@@ -2746,7 +2744,7 @@ static void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq - bfqd->peak_rate_samples = 0; /* full re-init on next disp. 
*/ - - bfq_log(bfqd, -- "reset_rate_computation at end, sample %u/%u tot_sects %llu", -+ "at end, sample %u/%u tot_sects %llu", - bfqd->peak_rate_samples, bfqd->sequential_samples, - bfqd->tot_sectors_dispatched); - } -@@ -2766,7 +2764,7 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) - if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES || - bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL) { - bfq_log(bfqd, -- "update_rate_reset: only resetting, delta_first %lluus samples %d", -+ "only resetting, delta_first %lluus samples %d", - bfqd->delta_from_first>>10, bfqd->peak_rate_samples); - goto reset_computation; - } -@@ -2790,7 +2788,7 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) - div_u64(bfqd->delta_from_first, NSEC_PER_USEC)); - - bfq_log(bfqd, --"update_rate_reset: tot_sects %llu delta_first %lluus rate %llu sects/s (%d)", -+"tot_sects %llu delta_first %lluus rate %llu sects/s (%d)", - bfqd->tot_sectors_dispatched, bfqd->delta_from_first>>10, - ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), - rate > 20<<BFQ_RATE_SHIFT); -@@ -2805,14 +2803,14 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) - rate <= bfqd->peak_rate) || - rate > 20<<BFQ_RATE_SHIFT) { - bfq_log(bfqd, -- "update_rate_reset: goto reset, samples %u/%u rate/peak %llu/%llu", -+ "goto reset, samples %u/%u rate/peak %llu/%llu", - bfqd->peak_rate_samples, bfqd->sequential_samples, - ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), - ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); - goto reset_computation; - } else { - bfq_log(bfqd, -- "update_rate_reset: do update, samples %u/%u rate/peak %llu/%llu", -+ "do update, samples %u/%u rate/peak %llu/%llu", - bfqd->peak_rate_samples, bfqd->sequential_samples, - ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), - ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); -@@ -2868,7 +2866,7 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) - rate /= 
divisor; /* smoothing constant alpha = 1/divisor */ - - bfq_log(bfqd, -- "update_rate_reset: divisor %d tmp_peak_rate %llu tmp_rate %u", -+ "divisor %d tmp_peak_rate %llu tmp_rate %u", - divisor, - ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT), - (u32)((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT)); -@@ -2922,7 +2920,7 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) - - if (bfqd->peak_rate_samples == 0) { /* first dispatch */ - bfq_log(bfqd, -- "update_peak_rate: goto reset, samples %d", -+ "goto reset, samples %d", - bfqd->peak_rate_samples) ; - bfq_reset_rate_computation(bfqd, rq); - goto update_last_values; /* will add one sample */ -@@ -2943,7 +2941,7 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) - if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC && - bfqd->rq_in_driver == 0) { - bfq_log(bfqd, --"update_peak_rate: jumping to updating&resetting delta_last %lluus samples %d", -+"jumping to updating&resetting delta_last %lluus samples %d", - (now_ns - bfqd->last_dispatch)>>10, - bfqd->peak_rate_samples) ; - goto update_rate_and_reset; -@@ -2969,7 +2967,7 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) - bfqd->delta_from_first = now_ns - bfqd->first_dispatch; - - bfq_log(bfqd, -- "update_peak_rate: added samples %u/%u tot_sects %llu delta_first %lluus", -+ "added samples %u/%u tot_sects %llu delta_first %lluus", - bfqd->peak_rate_samples, bfqd->sequential_samples, - bfqd->tot_sectors_dispatched, - bfqd->delta_from_first>>10); -@@ -2985,12 +2983,12 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) - bfqd->last_dispatch = now_ns; - - bfq_log(bfqd, -- "update_peak_rate: delta_first %lluus last_pos %llu peak_rate %llu", -+ "delta_first %lluus last_pos %llu peak_rate %llu", - (now_ns - bfqd->first_dispatch)>>10, - (unsigned long long) bfqd->last_position, - ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); - bfq_log(bfqd, -- 
"update_peak_rate: samples at end %d", bfqd->peak_rate_samples); -+ "samples at end %d", bfqd->peak_rate_samples); - } - - /* -@@ -3088,11 +3086,11 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, - */ - budget = 2 * min_budget; - -- bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d", -+ bfq_log_bfqq(bfqd, bfqq, "last budg %d, budg left %d", - bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); -- bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d", -+ bfq_log_bfqq(bfqd, bfqq, "last max_budg %d, min budg %d", - budget, bfq_min_budget(bfqd)); -- bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", -+ bfq_log_bfqq(bfqd, bfqq, "sync %d, seeky %d", - bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); - - if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) { -@@ -3294,7 +3292,7 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, - else /* charge at least one seek */ - *delta_ms = bfq_slice_idle / NSEC_PER_MSEC; - -- bfq_log(bfqd, "bfq_bfqq_is_slow: too short %u", delta_usecs); -+ bfq_log(bfqd, "too short %u", delta_usecs); - - return slow; - } -@@ -3317,11 +3315,11 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, - * peak rate. 
- */ - slow = bfqq->entity.service < bfqd->bfq_max_budget / 2; -- bfq_log(bfqd, "bfq_bfqq_is_slow: relative rate %d/%d", -+ bfq_log(bfqd, "relative rate %d/%d", - bfqq->entity.service, bfqd->bfq_max_budget); - } - -- bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow); -+ bfq_log_bfqq(bfqd, bfqq, "slow %d", slow); - - return slow; - } -@@ -3423,7 +3421,7 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, - struct bfq_queue *bfqq) - { - bfq_log_bfqq(bfqd, bfqq, --"softrt_next_start: service_blkg %lu soft_rate %u sects/sec interval %u", -+"service_blkg %lu soft_rate %u sects/sec interval %u", - bfqq->service_from_backlogged, - bfqd->bfq_wr_max_softrt_rate, - jiffies_to_msecs(HZ * bfqq->service_from_backlogged / -@@ -3602,7 +3600,7 @@ static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) - static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) - { - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "may_budget_timeout: wait_request %d left %d timeout %d", -+ "wait_request %d left %d timeout %d", - bfq_bfqq_wait_request(bfqq), - bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, - bfq_bfqq_budget_timeout(bfqq)); -@@ -3863,11 +3861,11 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) - * either boosts the throughput (without issues), or is - * necessary to preserve service guarantees. 
- */ -- bfq_log_bfqq(bfqd, bfqq, "may_idle: sync %d idling_boosts_thr %d", -+ bfq_log_bfqq(bfqd, bfqq, "sync %d idling_boosts_thr %d", - bfq_bfqq_sync(bfqq), idling_boosts_thr); - - bfq_log_bfqq(bfqd, bfqq, -- "may_idle: wr_busy %d boosts %d IO-bound %d guar %d", -+ "wr_busy %d boosts %d IO-bound %d guar %d", - bfqd->wr_busy_queues, - idling_boosts_thr_without_issues, - bfq_bfqq_IO_bound(bfqq), -@@ -3907,7 +3905,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - if (!bfqq) - goto new_queue; - -- bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); -+ bfq_log_bfqq(bfqd, bfqq, "already in-service queue"); - - if (bfq_may_expire_for_budg_timeout(bfqq) && - !bfq_bfqq_wait_request(bfqq) && -@@ -3983,14 +3981,14 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - new_queue: - bfqq = bfq_set_in_service_queue(bfqd); - if (bfqq) { -- bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue"); -+ bfq_log_bfqq(bfqd, bfqq, "checking new queue"); - goto check_queue; - } - keep_queue: - if (bfqq) -- bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue"); -+ bfq_log_bfqq(bfqd, bfqq, "returned this queue"); - else -- bfq_log(bfqd, "select_queue: no queue returned"); -+ bfq_log(bfqd, "no queue returned"); - - return bfqq; - } -@@ -4043,8 +4041,7 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) - /* see comments on max_service_from_wr */ - bfq_bfqq_end_wr(bfqq); - bfq_log_bfqq(bfqd, bfqq, -- "[%s] too much service", -- __func__); -+ "too much service"); - } - } - /* -@@ -4122,7 +4119,7 @@ static bool bfq_has_work(struct blk_mq_hw_ctx *hctx) - { - struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; - -- bfq_log(bfqd, "has_work, dispatch_non_empty %d busy_queues %d", -+ bfq_log(bfqd, "dispatch_non_empty %d busy_queues %d", - !list_empty_careful(&bfqd->dispatch), bfqd->busy_queues > 0); - - /* -@@ -4146,7 +4143,7 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx 
*hctx) - rq->rq_flags &= ~RQF_DISP_LIST; - - bfq_log(bfqd, -- "dispatch requests: picked %p from dispatch list", rq); -+ "picked %p from dispatch list", rq); - bfqq = RQ_BFQQ(rq); - - if (bfqq) { -@@ -4196,7 +4193,7 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - goto start_rq; - } - -- bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); -+ bfq_log(bfqd, "%d busy queues", bfqd->busy_queues); - - if (bfqd->busy_queues == 0) - goto exit; -@@ -4236,13 +4233,13 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - rq->rq_flags |= RQF_STARTED; - if (bfqq) - bfq_log_bfqq(bfqd, bfqq, -- "dispatched %s request %p, rq_in_driver %d", -+ "%s request %p, rq_in_driver %d", - bfq_bfqq_sync(bfqq) ? "sync" : "async", - rq, - bfqd->rq_in_driver); - else - bfq_log(bfqd, -- "dispatched request %p from dispatch list, rq_in_driver %d", -+ "request %p from dispatch list, rq_in_driver %d", - rq, bfqd->rq_in_driver); - } else - bfq_log(bfqd, -@@ -4339,7 +4336,7 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - BUG_ON(bfqq->ref <= 0); - - if (bfqq->bfqd) -- bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "%p %d", bfqq, bfqq->ref); - - bfqq->ref--; - if (bfqq->ref) -@@ -4383,10 +4380,10 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - } - - if (bfqq->bfqd) -- bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "%p freed", bfqq); - - #ifdef BFQ_GROUP_IOSCHED_ENABLED -- bfq_log_bfqq(bfqq->bfqd, bfqq, "[%s] putting blkg and bfqg %p\n", __func__, bfqg); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "putting blkg and bfqg %p\n", bfqg); - bfqg_and_blkg_put(bfqg); - #endif - kmem_cache_free(bfq_pool, bfqq); -@@ -4418,7 +4415,7 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) - bfq_schedule_dispatch(bfqd); - } - -- bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref); -+ bfq_log_bfqq(bfqd, 
bfqq, "%p, %d", bfqq, bfqq->ref); - - bfq_put_cooperator(bfqq); - -@@ -4502,7 +4499,7 @@ static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, - bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); - bfqq->entity.prio_changed = 1; - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "set_next_ioprio_data: bic_class %d prio %d class %d", -+ "bic_class %d prio %d class %d", - ioprio_class, bfqq->new_ioprio, bfqq->new_ioprio_class); - } - -@@ -4529,7 +4526,7 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) - bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); - bic_set_bfqq(bic, bfqq, false); - bfq_log_bfqq(bfqd, bfqq, -- "check_ioprio_change: bfqq %p %d", -+ "bfqq %p %d", - bfqq, bfqq->ref); - } - -@@ -4667,14 +4664,14 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, - * guarantee that this queue is not freed - * until its group goes away. - */ -- bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", -+ bfq_log_bfqq(bfqd, bfqq, "bfqq not in async: %p, %d", - bfqq, bfqq->ref); - *async_bfqq = bfqq; - } - - out: - bfqq->ref++; /* get a process reference to this queue */ -- bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref); -+ bfq_log_bfqq(bfqd, bfqq, "at end: %p, %d", bfqq, bfqq->ref); - rcu_read_unlock(); - return bfqq; - } -@@ -4733,7 +4730,7 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd, - bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle)) - has_short_ttime = false; - -- bfq_log_bfqq(bfqd, bfqq, "update_has_short_ttime: has_short_ttime %d", -+ bfq_log_bfqq(bfqd, bfqq, "has_short_ttime %d", - has_short_ttime); - - if (has_short_ttime) -@@ -4759,7 +4756,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bfq_update_io_seektime(bfqd, bfqq, rq); - - bfq_log_bfqq(bfqd, bfqq, -- "rq_enqueued: has_short_ttime=%d (seeky %d)", -+ "has_short_ttime=%d (seeky %d)", - bfq_bfqq_has_short_ttime(bfqq), BFQQ_SEEKY(bfqq)); - - bfqq->last_request_pos = 
blk_rq_pos(rq) + blk_rq_sectors(rq); -@@ -4818,7 +4815,7 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) - - assert_spin_locked(&bfqd->lock); - -- bfq_log_bfqq(bfqd, bfqq, "__insert_req: rq %p bfqq %p", rq, bfqq); -+ bfq_log_bfqq(bfqd, bfqq, "rq %p bfqq %p", rq, bfqq); - - /* - * An unplug may trigger a requeue of a request from the device -@@ -4837,9 +4834,9 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) - new_bfqq->allocated++; - bfqq->allocated--; - bfq_log_bfqq(bfqd, bfqq, -- "insert_request: new allocated %d", bfqq->allocated); -+ "new allocated %d", bfqq->allocated); - bfq_log_bfqq(bfqd, new_bfqq, -- "insert_request: new_bfqq new allocated %d", -+ "new_bfqq new allocated %d", - bfqq->allocated); - - new_bfqq->ref++; -@@ -4911,11 +4908,11 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - rq->rq_flags |= RQF_DISP_LIST; - if (bfqq) - bfq_log_bfqq(bfqd, bfqq, -- "insert_request %p in disp: at_head %d", -+ "%p in disp: at_head %d", - rq, at_head); - else - bfq_log(bfqd, -- "insert_request %p in disp: at_head %d", -+ "%p in disp: at_head %d", - rq, at_head); - } else { - BUG_ON(!(rq->rq_flags & RQF_GOT)); -@@ -5033,7 +5030,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) - bfqq->dispatched--; - - bfq_log_bfqq(bfqd, bfqq, -- "completed_requests: new disp %d, new rq_in_driver %d", -+ "new disp %d, new rq_in_driver %d", - bfqq->dispatched, bfqd->rq_in_driver); - - if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { -@@ -5061,7 +5058,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) - delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC); - - bfq_log_bfqq(bfqd, bfqq, -- "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu", -+ "delta %uus/%luus max_size %u rate %llu/%llu", - delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size, - delta_us > 0 ? 
- (USEC_PER_SEC* -@@ -5129,7 +5126,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) - static void bfq_finish_requeue_request_body(struct bfq_queue *bfqq) - { - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "put_request_body: allocated %d", bfqq->allocated); -+ "allocated %d", bfqq->allocated); - BUG_ON(!bfqq->allocated); - bfqq->allocated--; - -@@ -5406,10 +5403,10 @@ static void bfq_prepare_request(struct request *rq, struct bio *bio) - - bfqq->allocated++; - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "get_request: new allocated %d", bfqq->allocated); -+ "new allocated %d", bfqq->allocated); - - bfqq->ref++; -- bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d", rq, bfqq, bfqq->ref); -+ bfq_log_bfqq(bfqd, bfqq, "%p: bfqq %p, %d", rq, bfqq, bfqq->ref); - - rq->elv.priv[0] = bic; - rq->elv.priv[1] = bfqq; -@@ -5493,7 +5490,7 @@ static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) - idle_slice_timer); - struct bfq_queue *bfqq = bfqd->in_service_queue; - -- bfq_log(bfqd, "slice_timer expired"); -+ bfq_log(bfqd, "expired"); - - /* - * Theoretical race here: the in-service queue can be NULL or -@@ -5515,10 +5512,10 @@ static void __bfq_put_async_bfqq(struct bfq_data *bfqd, - struct bfq_group *root_group = bfqd->root_group; - struct bfq_queue *bfqq = *bfqq_ptr; - -- bfq_log(bfqd, "put_async_bfqq: %p", bfqq); -+ bfq_log(bfqd, "%p", bfqq); - if (bfqq) { - bfq_bfqq_move(bfqd, bfqq, root_group); -- bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", -+ bfq_log_bfqq(bfqd, bfqq, "putting %p, %d", - bfqq, bfqq->ref); - bfq_put_queue(bfqq); - *bfqq_ptr = NULL; -@@ -5547,7 +5544,7 @@ static void bfq_exit_queue(struct elevator_queue *e) - struct bfq_data *bfqd = e->elevator_data; - struct bfq_queue *bfqq, *n; - -- bfq_log(bfqd, "exit_queue: starting ..."); -+ bfq_log(bfqd, "starting ..."); - - hrtimer_cancel(&bfqd->idle_slice_timer); - -@@ -5575,7 +5572,7 @@ static void bfq_exit_queue(struct elevator_queue *e) - 
spin_unlock_irq(&bfqd->lock); - #endif - -- bfq_log(bfqd, "exit_queue: finished ..."); -+ bfq_log(bfqd, "finished ..."); - kfree(bfqd); - } - -diff --git a/block/bfq-mq.h b/block/bfq-mq.h -index 9a5ce1168ff5..e2ae11bf8f76 100644 ---- a/block/bfq-mq.h -+++ b/block/bfq-mq.h -@@ -712,34 +712,34 @@ static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); - static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - - #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ -- pr_crit("%s bfq%d%c %s " fmt "\n", \ -+ pr_crit("%s bfq%d%c %s [%s] " fmt "\n", \ - checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ - (bfqq)->pid, \ - bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -- bfqq_group(bfqq)->blkg_path, ##args); \ -+ bfqq_group(bfqq)->blkg_path, __func__, ##args); \ - } while (0) - - #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ -- pr_crit("%s %s " fmt "\n", \ -+ pr_crit("%s %s [%s] " fmt "\n", \ - checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -- bfqg->blkg_path, ##args); \ -+ bfqg->blkg_path, __func__, ##args); \ - } while (0) - - #else /* BFQ_GROUP_IOSCHED_ENABLED */ - - #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ -- pr_crit("%s bfq%d%c " fmt "\n", \ -+ pr_crit("%s bfq%d%c [%s] " fmt "\n", \ - checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ - (bfqq)->pid, bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -- ##args) -+ __func__, ##args) - #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) - - #endif /* BFQ_GROUP_IOSCHED_ENABLED */ - - #define bfq_log(bfqd, fmt, args...) \ -- pr_crit("%s bfq " fmt "\n", \ -+ pr_crit("%s bfq [%s] " fmt "\n", \ - checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -- ##args) -+ __func__, ##args) - - #else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ - -@@ -762,28 +762,29 @@ static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); - static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - - #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) 
do { \ -- blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, \ -+ blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s [%s] " fmt, \ - (bfqq)->pid, \ - bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -- bfqq_group(bfqq)->blkg_path, ##args); \ -+ bfqq_group(bfqq)->blkg_path, __func__, ##args); \ - } while (0) - - #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ -- blk_add_trace_msg((bfqd)->queue, "%s " fmt, bfqg->blkg_path, ##args);\ -+ blk_add_trace_msg((bfqd)->queue, "%s [%s] " fmt, bfqg->blkg_path, \ -+ __func__, ##args);\ - } while (0) - - #else /* BFQ_GROUP_IOSCHED_ENABLED */ - - #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ -- blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \ -+ blk_add_trace_msg((bfqd)->queue, "bfq%d%c [%s] " fmt, (bfqq)->pid, \ - bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -- ##args) -+ __func__, ##args) - #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) - - #endif /* BFQ_GROUP_IOSCHED_ENABLED */ - - #define bfq_log(bfqd, fmt, args...) \ -- blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) -+ blk_add_trace_msg((bfqd)->queue, "bfq [%s] " fmt, __func__, ##args) - - #endif /* CONFIG_BLK_DEV_IO_TRACE */ - #endif /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ -@@ -938,7 +939,7 @@ bfq_entity_service_tree(struct bfq_entity *entity) - - if (bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "entity_service_tree %p %d", -+ "%p %d", - sched_data->service_tree + idx, idx); - #ifdef BFQ_GROUP_IOSCHED_ENABLED - else { -@@ -946,7 +947,7 @@ bfq_entity_service_tree(struct bfq_entity *entity) - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -- "entity_service_tree %p %d", -+ "%p %d", - sched_data->service_tree + idx, idx); - } - #endif -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index 4e6c5232e2fb..ead34c30a7c2 100644 ---- a/block/bfq-sched.c -+++ b/block/bfq-sched.c -@@ -119,7 +119,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, - - if (bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, 
-- "update_next_in_service: chose without lookup"); -+ "chose without lookup"); - #ifdef BFQ_GROUP_IOSCHED_ENABLED - else { - struct bfq_group *bfqg = -@@ -127,7 +127,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, - struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data*)bfqg->bfqd, bfqg, -- "update_next_in_service: chose without lookup"); -+ "chose without lookup"); - } - #endif - } -@@ -148,7 +148,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, - bfqq = bfq_entity_to_bfqq(next_in_service); - if (bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "update_next_in_service: chosen this queue"); -+ "chosen this queue"); - #ifdef BFQ_GROUP_IOSCHED_ENABLED - else { - struct bfq_group *bfqg = -@@ -156,7 +156,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, - struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -- "update_next_in_service: chosen this entity"); -+ "chosen this entity"); - } - #endif - return parent_sched_may_change; -@@ -331,10 +331,10 @@ static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service) - - if (bfqq) { - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "calc_finish: serv %lu, w %d", -+ "serv %lu, w %d", - service, entity->weight); - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "calc_finish: start %llu, finish %llu, delta %llu", -+ "start %llu, finish %llu, delta %llu", - start, finish, delta); - #ifdef BFQ_GROUP_IOSCHED_ENABLED - } else { -@@ -342,10 +342,10 @@ static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service) - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -- "calc_finish group: serv %lu, w %d", -+ "group: serv %lu, w %d", - service, entity->weight); - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -- "calc_finish group: start %llu, finish %llu, delta %llu", -+ "group: start %llu, finish %llu, delta %llu", - start, finish, delta); - #endif - } -@@ -484,7 +484,7 @@ static 
void bfq_update_active_node(struct rb_node *node) - - if (bfqq) { - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "update_active_node: new min_start %llu", -+ "new min_start %llu", - ((entity->min_start>>10)*1000)>>12); - #ifdef BFQ_GROUP_IOSCHED_ENABLED - } else { -@@ -492,7 +492,7 @@ static void bfq_update_active_node(struct rb_node *node) - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -- "update_active_node: new min_start %llu", -+ "new min_start %llu", - ((entity->min_start>>10)*1000)>>12); - #endif - } -@@ -620,7 +620,7 @@ static void bfq_get_entity(struct bfq_entity *entity) - - if (bfqq) { - bfqq->ref++; -- bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "%p %d", - bfqq, bfqq->ref); - } - } -@@ -748,7 +748,7 @@ static void bfq_forget_entity(struct bfq_service_tree *st, - entity->on_st = false; - st->wsum -= entity->weight; - if (bfqq && !is_in_service) { -- bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity (before): %p %d", -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "(before): %p %d", - bfqq, bfqq->ref); - bfq_put_queue(bfqq); - } -@@ -1008,7 +1008,7 @@ static void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq, - tot_serv_to_charge = entity->service; - - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "charge_time: %lu/%u ms, %d/%d/%d sectors", -+ "%lu/%u ms, %d/%d/%d sectors", - time_ms, timeout_ms, entity->service, - tot_serv_to_charge, entity->budget); - -@@ -1080,7 +1080,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, - - if (bfqq) { - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "update_fin_time_enqueue: new queue finish %llu", -+ "new queue finish %llu", - ((entity->finish>>10)*1000)>>12); - #ifdef BFQ_GROUP_IOSCHED_ENABLED - } else { -@@ -1088,7 +1088,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -- "update_fin_time_enqueue: new group 
finish %llu", -+ "new group finish %llu", - ((entity->finish>>10)*1000)>>12); - #endif - } -@@ -1098,7 +1098,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, - - if (bfqq) { - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "update_fin_time_enqueue: queue %seligible in st %p", -+ "queue %seligible in st %p", - entity->start <= st->vtime ? "" : "non ", st); - #ifdef BFQ_GROUP_IOSCHED_ENABLED - } else { -@@ -1106,7 +1106,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -- "update_fin_time_enqueue: group %seligible in st %p", -+ "group %seligible in st %p", - entity->start <= st->vtime ? "" : "non ", st); - #endif - } -@@ -1550,7 +1550,7 @@ static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st) - - if (bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "calc_vtime_jump: new value %llu", -+ "new value %llu", - ((root_entity->min_start>>10)*1000)>>12); - #ifdef BFQ_GROUP_IOSCHED_ENABLED - else { -@@ -1559,7 +1559,7 @@ static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st) - entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -- "calc_vtime_jump: new value %llu", -+ "new value %llu", - ((root_entity->min_start>>10)*1000)>>12); - } - #endif -@@ -1677,7 +1677,7 @@ __bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service) - bfqq = bfq_entity_to_bfqq(entity); - if (bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "__lookup_next: start %llu vtime %llu st %p", -+ "start %llu vtime %llu st %p", - ((entity->start>>10)*1000)>>12, - ((new_vtime>>10)*1000)>>12, st); - #ifdef BFQ_GROUP_IOSCHED_ENABLED -@@ -1686,7 +1686,7 @@ __bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service) - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -- "__lookup_next: start %llu vtime %llu (%llu) st %p", -+ "start %llu vtime %llu (%llu) st %p", - ((entity->start>>10)*1000)>>12, - 
((st->vtime>>10)*1000)>>12, - ((new_vtime>>10)*1000)>>12, st); -@@ -1821,14 +1821,14 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg(bfqd, bfqg, -- "get_next_queue: lookup in this group"); -+ "lookup in this group"); - if (!sd->next_in_service) -- pr_crit("get_next_queue: lookup in this group"); -+ pr_crit("lookup in this group"); - } else { - bfq_log_bfqg(bfqd, bfqd->root_group, -- "get_next_queue: lookup in root group"); -+ "lookup in root group"); - if (!sd->next_in_service) -- pr_crit("get_next_queue: lookup in root group"); -+ pr_crit("lookup in root group"); - } - #endif - -@@ -1903,7 +1903,7 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) - bfqq = bfq_entity_to_bfqq(entity); - if (bfqq) - bfq_log_bfqq(bfqd, bfqq, -- "get_next_queue: this queue, finish %llu", -+ "this queue, finish %llu", - (((entity->finish>>10)*1000)>>10)>>2); - #ifdef BFQ_GROUP_IOSCHED_ENABLED - else { -@@ -1911,7 +1911,7 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg(bfqd, bfqg, -- "get_next_queue: this entity, finish %llu", -+ "this entity, finish %llu", - (((entity->finish>>10)*1000)>>10)>>2); - } - #endif -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index c4df156b1fb4..e49e8ac882b3 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -281,7 +281,7 @@ static void bfq_schedule_dispatch(struct bfq_data *bfqd); - static void bfq_schedule_dispatch(struct bfq_data *bfqd) - { - if (bfqd->queued != 0) { -- bfq_log(bfqd, "schedule dispatch"); -+ bfq_log(bfqd, ""); - kblockd_schedule_work(&bfqd->unplug_work); - } - } -@@ -414,7 +414,7 @@ bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, - if (rb_link) - *rb_link = p; - -- bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d", -+ bfq_log(bfqd, "%llu: returning %d", - (unsigned long long) sector, 
- bfqq ? bfqq->pid : 0); - -@@ -635,7 +635,7 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq, - if (rq == last || ktime_get_ns() < rq->fifo_time) - return NULL; - -- bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "returned %p", rq); - BUG_ON(RB_EMPTY_NODE(&rq->rb_node)); - return rq; - } -@@ -728,7 +728,7 @@ static void bfq_updated_next_req(struct bfq_data *bfqd, - bfq_serv_to_charge(next_rq, bfqq)); - if (entity->budget != new_budget) { - entity->budget = new_budget; -- bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", -+ bfq_log_bfqq(bfqd, bfqq, "new budget %lu", - new_budget); - bfq_requeue_bfqq(bfqd, bfqq, false); - } -@@ -800,8 +800,7 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, - BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); - - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "[%s] bic %p wr_coeff %d start_finish %lu max_time %lu", -- __func__, -+ "bic %p wr_coeff %d start_finish %lu max_time %lu", - bic, bfqq->wr_coeff, bfqq->last_wr_start_finish, - bfqq->wr_cur_max_time); - -@@ -814,11 +813,11 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, - bfq_wr_duration(bfqd))) { - switch_back_to_interactive_wr(bfqq, bfqd); - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "resume state: switching back to interactive"); -+ "switching back to interactive"); - } else { - bfqq->wr_coeff = 1; - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "resume state: switching off wr (%lu + %lu < %lu)", -+ "switching off wr (%lu + %lu < %lu)", - bfqq->last_wr_start_finish, bfqq->wr_cur_max_time, - jiffies); - } -@@ -870,7 +869,7 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) - /* Increment burst size to take into account also bfqq */ - bfqd->burst_size++; - -- bfq_log_bfqq(bfqd, bfqq, "add_to_burst %d", bfqd->burst_size); -+ bfq_log_bfqq(bfqd, bfqq, "%d", bfqd->burst_size); - - BUG_ON(bfqd->burst_size > bfqd->bfq_large_burst_thresh); - -@@ -883,7 +882,7 @@ 
static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) - * other to consider this burst as large. - */ - bfqd->large_burst = true; -- bfq_log_bfqq(bfqd, bfqq, "add_to_burst: large burst started"); -+ bfq_log_bfqq(bfqd, bfqq, "large burst started"); - - /* - * We can now mark all queues in the burst list as -@@ -1055,7 +1054,7 @@ static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) - bfqd->large_burst = false; - bfq_reset_burst_list(bfqd, bfqq); - bfq_log_bfqq(bfqd, bfqq, -- "handle_burst: late activation or different group"); -+ "late activation or different group"); - goto end; - } - -@@ -1065,7 +1064,7 @@ static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) - * bfqq as belonging to this large burst immediately. - */ - if (bfqd->large_burst) { -- bfq_log_bfqq(bfqd, bfqq, "handle_burst: marked in burst"); -+ bfq_log_bfqq(bfqd, bfqq, "marked in burst"); - bfq_mark_bfqq_in_large_burst(bfqq); - goto end; - } -@@ -1572,7 +1571,7 @@ static void bfq_add_request(struct request *rq) - unsigned int old_wr_coeff = bfqq->wr_coeff; - bool interactive = false; - -- bfq_log_bfqq(bfqd, bfqq, "add_request: size %u %s", -+ bfq_log_bfqq(bfqd, bfqq, "size %u %s", - blk_rq_sectors(rq), rq_is_sync(rq) ? 
"S" : "A"); - - if (bfqq->wr_coeff > 1) /* queue is being weight-raised */ -@@ -1870,10 +1869,10 @@ static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) - */ - bfqq->entity.prio_changed = 1; - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "end_wr: wrais ending at %lu, rais_max_time %u", -+ "wrais ending at %lu, rais_max_time %u", - bfqq->last_wr_start_finish, - jiffies_to_msecs(bfqq->wr_cur_max_time)); -- bfq_log_bfqq(bfqq->bfqd, bfqq, "end_wr: wr_busy %d", -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "wr_busy %d", - bfqq->bfqd->wr_busy_queues); - } - -@@ -2048,8 +2047,8 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, - { - if (bfq_too_late_for_merging(new_bfqq)) { - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "[%s] too late for bfq%d to be merged", -- __func__, new_bfqq->pid); -+ "too late for bfq%d to be merged", -+ new_bfqq->pid); - return false; - } - -@@ -2258,7 +2257,7 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, - - } - -- bfq_log_bfqq(bfqd, new_bfqq, "merge_bfqqs: wr_busy %d", -+ bfq_log_bfqq(bfqd, new_bfqq, "wr_busy %d", - bfqd->wr_busy_queues); - - /* -@@ -2359,7 +2358,7 @@ static void bfq_set_budget_timeout(struct bfq_data *bfqd, - bfqq->budget_timeout = jiffies + - bfqd->bfq_timeout * timeout_coeff; - -- bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", -+ bfq_log_bfqq(bfqd, bfqq, "%u", - jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff)); - } - -@@ -2427,10 +2426,10 @@ static void __bfq_set_in_service_queue(struct bfq_data *bfqd, - - bfq_set_budget_timeout(bfqd, bfqq); - bfq_log_bfqq(bfqd, bfqq, -- "set_in_service_queue, cur-budget = %d", -+ "cur-budget = %d", - bfqq->entity.budget); - } else -- bfq_log(bfqd, "set_in_service_queue: NULL"); -+ bfq_log(bfqd, "NULL"); - - bfqd->in_service_queue = bfqq; - } -@@ -2559,7 +2558,7 @@ static void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq - bfqd->peak_rate_samples = 0; /* full re-init on next disp. 
*/ - - bfq_log(bfqd, -- "reset_rate_computation at end, sample %u/%u tot_sects %llu", -+ "at end, sample %u/%u tot_sects %llu", - bfqd->peak_rate_samples, bfqd->sequential_samples, - bfqd->tot_sectors_dispatched); - } -@@ -2579,7 +2578,7 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) - if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES || - bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL) { - bfq_log(bfqd, -- "update_rate_reset: only resetting, delta_first %lluus samples %d", -+ "only resetting, delta_first %lluus samples %d", - bfqd->delta_from_first>>10, bfqd->peak_rate_samples); - goto reset_computation; - } -@@ -2603,7 +2602,7 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) - div_u64(bfqd->delta_from_first, NSEC_PER_USEC)); - - bfq_log(bfqd, --"update_rate_reset: tot_sects %llu delta_first %lluus rate %llu sects/s (%d)", -+"tot_sects %llu delta_first %lluus rate %llu sects/s (%d)", - bfqd->tot_sectors_dispatched, bfqd->delta_from_first>>10, - ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), - rate > 20<<BFQ_RATE_SHIFT); -@@ -2618,14 +2617,14 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) - rate <= bfqd->peak_rate) || - rate > 20<<BFQ_RATE_SHIFT) { - bfq_log(bfqd, -- "update_rate_reset: goto reset, samples %u/%u rate/peak %llu/%llu", -+ "goto reset, samples %u/%u rate/peak %llu/%llu", - bfqd->peak_rate_samples, bfqd->sequential_samples, - ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), - ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); - goto reset_computation; - } else { - bfq_log(bfqd, -- "update_rate_reset: do update, samples %u/%u rate/peak %llu/%llu", -+ "do update, samples %u/%u rate/peak %llu/%llu", - bfqd->peak_rate_samples, bfqd->sequential_samples, - ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), - ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); -@@ -2681,7 +2680,7 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) - rate /= 
divisor; /* smoothing constant alpha = 1/divisor */ - - bfq_log(bfqd, -- "update_rate_reset: divisor %d tmp_peak_rate %llu tmp_rate %u", -+ "divisor %d tmp_peak_rate %llu tmp_rate %u", - divisor, - ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT), - (u32)((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT)); -@@ -2735,7 +2734,7 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) - - if (bfqd->peak_rate_samples == 0) { /* first dispatch */ - bfq_log(bfqd, -- "update_peak_rate: goto reset, samples %d", -+ "goto reset, samples %d", - bfqd->peak_rate_samples) ; - bfq_reset_rate_computation(bfqd, rq); - goto update_last_values; /* will add one sample */ -@@ -2756,7 +2755,7 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) - if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC && - bfqd->rq_in_driver == 0) { - bfq_log(bfqd, --"update_peak_rate: jumping to updating&resetting delta_last %lluus samples %d", -+"jumping to updating&resetting delta_last %lluus samples %d", - (now_ns - bfqd->last_dispatch)>>10, - bfqd->peak_rate_samples) ; - goto update_rate_and_reset; -@@ -2782,7 +2781,7 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) - bfqd->delta_from_first = now_ns - bfqd->first_dispatch; - - bfq_log(bfqd, -- "update_peak_rate: added samples %u/%u tot_sects %llu delta_first %lluus", -+ "added samples %u/%u tot_sects %llu delta_first %lluus", - bfqd->peak_rate_samples, bfqd->sequential_samples, - bfqd->tot_sectors_dispatched, - bfqd->delta_from_first>>10); -@@ -2798,12 +2797,12 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) - bfqd->last_dispatch = now_ns; - - bfq_log(bfqd, -- "update_peak_rate: delta_first %lluus last_pos %llu peak_rate %llu", -+ "delta_first %lluus last_pos %llu peak_rate %llu", - (now_ns - bfqd->first_dispatch)>>10, - (unsigned long long) bfqd->last_position, - ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); - bfq_log(bfqd, -- 
"update_peak_rate: samples at end %d", bfqd->peak_rate_samples); -+ "samples at end %d", bfqd->peak_rate_samples); - } - - /* -@@ -2900,11 +2899,11 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, - */ - budget = 2 * min_budget; - -- bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d", -+ bfq_log_bfqq(bfqd, bfqq, "last budg %d, budg left %d", - bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); -- bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d", -+ bfq_log_bfqq(bfqd, bfqq, "last max_budg %d, min budg %d", - budget, bfq_min_budget(bfqd)); -- bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", -+ bfq_log_bfqq(bfqd, bfqq, "sync %d, seeky %d", - bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); - - if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) { -@@ -3106,7 +3105,7 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, - else /* charge at least one seek */ - *delta_ms = bfq_slice_idle / NSEC_PER_MSEC; - -- bfq_log(bfqd, "bfq_bfqq_is_slow: too short %u", delta_usecs); -+ bfq_log(bfqd, "too short %u", delta_usecs); - - return slow; - } -@@ -3129,11 +3128,11 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, - * peak rate. 
- */ - slow = bfqq->entity.service < bfqd->bfq_max_budget / 2; -- bfq_log(bfqd, "bfq_bfqq_is_slow: relative rate %d/%d", -+ bfq_log(bfqd, "relative rate %d/%d", - bfqq->entity.service, bfqd->bfq_max_budget); - } - -- bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow); -+ bfq_log_bfqq(bfqd, bfqq, "slow %d", slow); - - return slow; - } -@@ -3235,7 +3234,7 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, - struct bfq_queue *bfqq) - { - bfq_log_bfqq(bfqd, bfqq, --"softrt_next_start: service_blkg %lu soft_rate %u sects/sec interval %u", -+"service_blkg %lu soft_rate %u sects/sec interval %u", - bfqq->service_from_backlogged, - bfqd->bfq_wr_max_softrt_rate, - jiffies_to_msecs(HZ * bfqq->service_from_backlogged / -@@ -3414,7 +3413,7 @@ static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) - static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) - { - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "may_budget_timeout: wait_request %d left %d timeout %d", -+ "wait_request %d left %d timeout %d", - bfq_bfqq_wait_request(bfqq), - bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, - bfq_bfqq_budget_timeout(bfqq)); -@@ -3675,11 +3674,11 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) - * either boosts the throughput (without issues), or is - * necessary to preserve service guarantees. 
- */ -- bfq_log_bfqq(bfqd, bfqq, "may_idle: sync %d idling_boosts_thr %d", -+ bfq_log_bfqq(bfqd, bfqq, "sync %d idling_boosts_thr %d", - bfq_bfqq_sync(bfqq), idling_boosts_thr); - - bfq_log_bfqq(bfqd, bfqq, -- "may_idle: wr_busy %d boosts %d IO-bound %d guar %d", -+ "wr_busy %d boosts %d IO-bound %d guar %d", - bfqd->wr_busy_queues, - idling_boosts_thr_without_issues, - bfq_bfqq_IO_bound(bfqq), -@@ -3719,7 +3718,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - if (!bfqq) - goto new_queue; - -- bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); -+ bfq_log_bfqq(bfqd, bfqq, "already in-service queue"); - - if (bfq_may_expire_for_budg_timeout(bfqq) && - !hrtimer_active(&bfqd->idle_slice_timer) && -@@ -3797,14 +3796,14 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - new_queue: - bfqq = bfq_set_in_service_queue(bfqd); - if (bfqq) { -- bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue"); -+ bfq_log_bfqq(bfqd, bfqq, "checking new queue"); - goto check_queue; - } - keep_queue: - if (bfqq) -- bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue"); -+ bfq_log_bfqq(bfqd, bfqq, "returned this queue"); - else -- bfq_log(bfqd, "select_queue: no queue returned"); -+ bfq_log(bfqd, "no queue returned"); - - return bfqq; - } -@@ -3857,8 +3856,7 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) - /* see comments on max_service_from_wr */ - bfq_bfqq_end_wr(bfqq); - bfq_log_bfqq(bfqd, bfqq, -- "[%s] too much service", -- __func__); -+ "too much service"); - } - } - /* -@@ -3987,7 +3985,7 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) - struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_queue *bfqq; - -- bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); -+ bfq_log(bfqd, "%d busy queues", bfqd->busy_queues); - - if (bfqd->busy_queues == 0) - return 0; -@@ -4021,7 +4019,7 @@ static int bfq_dispatch_requests(struct 
request_queue *q, int force) - if (!bfq_dispatch_request(bfqd, bfqq)) - return 0; - -- bfq_log_bfqq(bfqd, bfqq, "dispatched %s request", -+ bfq_log_bfqq(bfqd, bfqq, "%s request", - bfq_bfqq_sync(bfqq) ? "sync" : "async"); - - BUG_ON(bfqq->next_rq == NULL && -@@ -4044,7 +4042,7 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - - BUG_ON(bfqq->ref <= 0); - -- bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "%p %d", bfqq, bfqq->ref); - bfqq->ref--; - if (bfqq->ref) - return; -@@ -4086,7 +4084,7 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - bfqq->bfqd->burst_size--; - } - -- bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "%p freed", bfqq); - - kmem_cache_free(bfq_pool, bfqq); - #ifdef BFQ_GROUP_IOSCHED_ENABLED -@@ -4120,7 +4118,7 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) - bfq_schedule_dispatch(bfqd); - } - -- bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref); -+ bfq_log_bfqq(bfqd, bfqq, "%p, %d", bfqq, bfqq->ref); - - bfq_put_cooperator(bfqq); - -@@ -4200,7 +4198,7 @@ static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, - bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); - bfqq->entity.prio_changed = 1; - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "set_next_ioprio_data: bic_class %d prio %d class %d", -+ "bic_class %d prio %d class %d", - ioprio_class, bfqq->new_ioprio, bfqq->new_ioprio_class); - } - -@@ -4227,7 +4225,7 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) - bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); - bic_set_bfqq(bic, bfqq, false); - bfq_log_bfqq(bfqd, bfqq, -- "check_ioprio_change: bfqq %p %d", -+ "bfqq %p %d", - bfqq, bfqq->ref); - } - -@@ -4362,14 +4360,14 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, - * guarantee that this queue is not freed - * until its group goes away. 
- */ -- bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", -+ bfq_log_bfqq(bfqd, bfqq, "bfqq not in async: %p, %d", - bfqq, bfqq->ref); - *async_bfqq = bfqq; - } - - out: - bfqq->ref++; /* get a process reference to this queue */ -- bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref); -+ bfq_log_bfqq(bfqd, bfqq, "at end: %p, %d", bfqq, bfqq->ref); - rcu_read_unlock(); - return bfqq; - } -@@ -4428,7 +4426,7 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd, - bic->ttime.ttime_mean > bfqd->bfq_slice_idle)) - has_short_ttime = false; - -- bfq_log_bfqq(bfqd, bfqq, "update_has_short_ttime: has_short_ttime %d", -+ bfq_log_bfqq(bfqd, bfqq, "has_short_ttime %d", - has_short_ttime); - - if (has_short_ttime) -@@ -4454,7 +4452,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bfq_update_io_seektime(bfqd, bfqq, rq); - - bfq_log_bfqq(bfqd, bfqq, -- "rq_enqueued: has_short_ttime=%d (seeky %d)", -+ "has_short_ttime=%d (seeky %d)", - bfq_bfqq_has_short_ttime(bfqq), BFQQ_SEEKY(bfqq)); - - bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); -@@ -4629,7 +4627,7 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) - */ - delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC); - -- bfq_log(bfqd, "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu", -+ bfq_log(bfqd, "delta %uus/%luus max_size %u rate %llu/%llu", - delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size, - delta_us > 0 ? 
- (USEC_PER_SEC* -@@ -4750,7 +4748,7 @@ static void bfq_put_request(struct request *rq) - rq->elv.priv[0] = NULL; - rq->elv.priv[1] = NULL; - -- bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "%p, %d", - bfqq, bfqq->ref); - bfq_put_queue(bfqq); - } -@@ -4816,7 +4814,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, - bic_set_bfqq(bic, bfqq, is_sync); - if (split && is_sync) { - bfq_log_bfqq(bfqd, bfqq, -- "set_request: was_in_list %d " -+ "was_in_list %d " - "was_in_large_burst %d " - "large burst in progress %d", - bic->was_in_burst_list, -@@ -4826,12 +4824,12 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, - if ((bic->was_in_burst_list && bfqd->large_burst) || - bic->saved_in_large_burst) { - bfq_log_bfqq(bfqd, bfqq, -- "set_request: marking in " -+ "marking in " - "large burst"); - bfq_mark_bfqq_in_large_burst(bfqq); - } else { - bfq_log_bfqq(bfqd, bfqq, -- "set_request: clearing in " -+ "clearing in " - "large burst"); - bfq_clear_bfqq_in_large_burst(bfqq); - if (bic->was_in_burst_list) -@@ -4888,7 +4886,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, - - bfqq->allocated[rw]++; - bfqq->ref++; -- bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, bfqq->ref); -+ bfq_log_bfqq(bfqd, bfqq, "bfqq %p, %d", bfqq, bfqq->ref); - - rq->elv.priv[0] = bic; - rq->elv.priv[1] = bfqq; -@@ -4962,7 +4960,7 @@ static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) - * case we just expire a queue too early. 
- */ - if (bfqq) { -- bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); -+ bfq_log_bfqq(bfqd, bfqq, "expired"); - bfq_clear_bfqq_wait_request(bfqq); - - if (bfq_bfqq_budget_timeout(bfqq)) -@@ -5005,10 +5003,10 @@ static void __bfq_put_async_bfqq(struct bfq_data *bfqd, - struct bfq_group *root_group = bfqd->root_group; - struct bfq_queue *bfqq = *bfqq_ptr; - -- bfq_log(bfqd, "put_async_bfqq: %p", bfqq); -+ bfq_log(bfqd, "%p", bfqq); - if (bfqq) { - bfq_bfqq_move(bfqd, bfqq, root_group); -- bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", -+ bfq_log_bfqq(bfqd, bfqq, "putting %p, %d", - bfqq, bfqq->ref); - bfq_put_queue(bfqq); - *bfqq_ptr = NULL; -diff --git a/block/bfq.h b/block/bfq.h -index 0cd7a3f251a7..4d2fe7f77af1 100644 ---- a/block/bfq.h -+++ b/block/bfq.h -@@ -698,37 +698,37 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - \ - assert_spin_locked((bfqd)->queue->queue_lock); \ - blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ -- pr_crit("%s bfq%d%c %s " fmt "\n", \ -+ pr_crit("%s bfq%d%c %s [%s] " fmt "\n", \ - checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ - (bfqq)->pid, \ - bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -- __pbuf, ##args); \ -+ __pbuf, __func__, ##args); \ - } while (0) - - #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ - char __pbuf[128]; \ - \ - blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ -- pr_crit("%s %s " fmt "\n", \ -+ pr_crit("%s %s [%s] " fmt "\n", \ - checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -- __pbuf, ##args); \ -+ __pbuf, __func__, ##args); \ - } while (0) - - #else /* BFQ_GROUP_IOSCHED_ENABLED */ - - #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ -- pr_crit("%s bfq%d%c " fmt "\n", \ -+ pr_crit("%s bfq%d%c [%s] " fmt "\n", \ - checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ - (bfqq)->pid, bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -- ##args) -+ __func__, ##args) - #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) 
do {} while (0) - - #endif /* BFQ_GROUP_IOSCHED_ENABLED */ - - #define bfq_log(bfqd, fmt, args...) \ -- pr_crit("%s bfq " fmt "\n", \ -+ pr_crit("%s bfq [%s] " fmt "\n", \ - checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -- ##args) -+ __func__, ##args) - - #else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ - -@@ -755,31 +755,32 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - \ - assert_spin_locked((bfqd)->queue->queue_lock); \ - blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ -- blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, \ -+ blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s [%s] " fmt, \ - (bfqq)->pid, \ - bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -- __pbuf, ##args); \ -+ __pbuf, __func__, ##args); \ - } while (0) - - #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ - char __pbuf[128]; \ - \ - blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ -- blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \ -+ blk_add_trace_msg((bfqd)->queue, "%s [%s] " fmt, __pbuf, \ -+ __func__, ##args); \ - } while (0) - - #else /* BFQ_GROUP_IOSCHED_ENABLED */ - - #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ -- blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \ -+ blk_add_trace_msg((bfqd)->queue, "bfq%d%c [%s] " fmt, (bfqq)->pid, \ - bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -- ##args) -+ __func__, ##args) - #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) - - #endif /* BFQ_GROUP_IOSCHED_ENABLED */ - - #define bfq_log(bfqd, fmt, args...) 
\ -- blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) -+ blk_add_trace_msg((bfqd)->queue, "bfq [%s] " fmt, __func__, ##args) - - #endif /* CONFIG_BLK_DEV_IO_TRACE */ - #endif /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ -@@ -928,7 +929,7 @@ bfq_entity_service_tree(struct bfq_entity *entity) - - if (bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "entity_service_tree %p %d", -+ "%p %d", - sched_data->service_tree + idx, idx); - #ifdef BFQ_GROUP_IOSCHED_ENABLED - else { -@@ -936,7 +937,7 @@ bfq_entity_service_tree(struct bfq_entity *entity) - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -- "entity_service_tree %p %d", -+ "%p %d", - sched_data->service_tree + idx, idx); - } - #endif - -From 673a457e8a54d1c4b66e61b1a50956ba0b8c6a60 Mon Sep 17 00:00:00 2001 -From: Davide Paganelli <paga.david@gmail.com> -Date: Thu, 8 Feb 2018 11:49:58 +0100 -Subject: [PATCH 19/23] block, bfq-mq, bfq-sq: make bfq_bfqq_expire print - expiration reason - -Improve readability of the log messages related to the expiration -reasons of the function bfq_bfqq_expire. -Change the printing of the number that represents the reason for -expiration with an actual textual description of the reason. 
- -Signed-off-by: Davide Paganelli <paga.david@gmail.com> -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 10 ++++++++-- - block/bfq-sq-iosched.c | 10 ++++++++-- - 2 files changed, 16 insertions(+), 4 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index edc93b6af186..9268dd47a4e5 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -133,6 +133,12 @@ static const int bfq_timeout = (HZ / 8); - */ - static const unsigned long bfq_merge_time_limit = HZ/10; - -+#define MAX_LENGTH_REASON_NAME 25 -+ -+static const char reason_name[][MAX_LENGTH_REASON_NAME] = {"TOO_IDLE", -+"BUDGET_TIMEOUT", "BUDGET_EXHAUSTED", "NO_MORE_REQUESTS", -+"PREEMPTED"}; -+ - static struct kmem_cache *bfq_pool; - - /* Below this threshold (in ns), we consider thinktime immediate. */ -@@ -3553,8 +3559,8 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, - } - - bfq_log_bfqq(bfqd, bfqq, -- "expire (%d, slow %d, num_disp %d, short_ttime %d, weight %d)", -- reason, slow, bfqq->dispatched, -+ "expire (%s, slow %d, num_disp %d, short_ttime %d, weight %d)", -+ reason_name[reason], slow, bfqq->dispatched, - bfq_bfqq_has_short_ttime(bfqq), entity->weight); - - /* -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index e49e8ac882b3..f95deaab49a1 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -127,6 +127,12 @@ static const int bfq_timeout = (HZ / 8); - */ - static const unsigned long bfq_merge_time_limit = HZ/10; - -+#define MAX_LENGTH_REASON_NAME 25 -+ -+static const char reason_name[][MAX_LENGTH_REASON_NAME] = {"TOO_IDLE", -+"BUDGET_TIMEOUT", "BUDGET_EXHAUSTED", "NO_MORE_REQUESTS", -+"PREEMPTED"}; -+ - static struct kmem_cache *bfq_pool; - - /* Below this threshold (in ns), we consider thinktime immediate. 
*/ -@@ -3366,8 +3372,8 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, - } - - bfq_log_bfqq(bfqd, bfqq, -- "expire (%d, slow %d, num_disp %d, short_ttime %d, weight %d)", -- reason, slow, bfqq->dispatched, -+ "expire (%s, slow %d, num_disp %d, short_ttime %d, weight %d)", -+ reason_name[reason], slow, bfqq->dispatched, - bfq_bfqq_has_short_ttime(bfqq), entity->weight); - - /* - -From 62e80623fbb58367c3f667dab22fea0804001f3b Mon Sep 17 00:00:00 2001 -From: Melzani Alessandro <melzani.alessandro@gmail.com> -Date: Mon, 26 Feb 2018 22:21:59 +0100 -Subject: [PATCH 20/23] bfq-mq: port of "block, bfq: remove batches of - confusing ifdefs" - -Commit a33801e8b473 ("block, bfq: move debug blkio stats behind -CONFIG_DEBUG_BLK_CGROUP") introduced two batches of confusing ifdefs: -one reported in [1], plus a similar one in another function. This -commit removes both batches, in the way suggested in [1]. - -[1] https://www.spinics.net/lists/linux-block/msg20043.html - -Fixes: a33801e8b473 ("block, bfq: move debug blkio stats behind CONFIG_DEBUG_BLK_CGROUP") - -Signed-off-by: Alessandro Melzani <melzani.alessandro@gmail.com> ---- - block/bfq-mq-iosched.c | 128 ++++++++++++++++++++++++++++--------------------- - 1 file changed, 73 insertions(+), 55 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 9268dd47a4e5..5a211620f316 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -4256,35 +4256,17 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - return rq; - } - --static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) --{ -- struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; -- struct request *rq; --#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) -- struct bfq_queue *in_serv_queue, *bfqq; -- bool waiting_rq, idle_timer_disabled; --#endif - -- spin_lock_irq(&bfqd->lock); -- --#if defined(BFQ_GROUP_IOSCHED_ENABLED) && 
defined(CONFIG_DEBUG_BLK_CGROUP) -- in_serv_queue = bfqd->in_service_queue; -- waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); -- -- rq = __bfq_dispatch_request(hctx); -- -- idle_timer_disabled = -- waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); -- --#else -- rq = __bfq_dispatch_request(hctx); --#endif -- spin_unlock_irq(&bfqd->lock); -+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) -+static void bfq_update_dispatch_stats(struct request_queue *q, -+ struct request *rq, -+ struct bfq_queue *in_serv_queue, -+ bool idle_timer_disabled) -+{ -+ struct bfq_queue *bfqq = rq ? RQ_BFQQ(rq) : NULL; - --#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) -- bfqq = rq ? RQ_BFQQ(rq) : NULL; - if (!idle_timer_disabled && !bfqq) -- return rq; -+ return; - - /* - * rq and bfqq are guaranteed to exist until this function -@@ -4299,7 +4281,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - * In addition, the following queue lock guarantees that - * bfqq_group(bfqq) exists as well. 
- */ -- spin_lock_irq(hctx->queue->queue_lock); -+ spin_lock_irq(q->queue_lock); - if (idle_timer_disabled) - /* - * Since the idle timer has been disabled, -@@ -4318,8 +4300,35 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - bfqg_stats_set_start_empty_time(bfqg); - bfqg_stats_update_io_remove(bfqg, rq->cmd_flags); - } -- spin_unlock_irq(hctx->queue->queue_lock); -+ spin_unlock_irq(q->queue_lock); -+} -+#else -+static inline void bfq_update_dispatch_stats(struct request_queue *q, -+ struct request *rq, -+ struct bfq_queue *in_serv_queue, -+ bool idle_timer_disabled) {} - #endif -+static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) -+{ -+ struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; -+ struct request *rq; -+ struct bfq_queue *in_serv_queue; -+ bool waiting_rq, idle_timer_disabled; -+ -+ spin_lock_irq(&bfqd->lock); -+ -+ in_serv_queue = bfqd->in_service_queue; -+ waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); -+ -+ rq = __bfq_dispatch_request(hctx); -+ -+ idle_timer_disabled = -+ waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); -+ -+ spin_unlock_irq(&bfqd->lock); -+ -+ bfq_update_dispatch_stats(hctx->queue, rq, in_serv_queue, -+ idle_timer_disabled); - - return rq; - } -@@ -4881,6 +4890,38 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) - return idle_timer_disabled; - } - -+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) -+static void bfq_update_insert_stats(struct request_queue *q, -+ struct bfq_queue *bfqq, -+ bool idle_timer_disabled, -+ unsigned int cmd_flags) -+{ -+ if (!bfqq) -+ return; -+ -+ /* -+ * bfqq still exists, because it can disappear only after -+ * either it is merged with another queue, or the process it -+ * is associated with exits. But both actions must be taken by -+ * the same process currently executing this flow of -+ * instructions. 
-+ * -+ * In addition, the following queue lock guarantees that -+ * bfqq_group(bfqq) exists as well. -+ */ -+ spin_lock_irq(q->queue_lock); -+ bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags); -+ if (idle_timer_disabled) -+ bfqg_stats_update_idle_time(bfqq_group(bfqq)); -+ spin_unlock_irq(q->queue_lock); -+} -+#else -+static inline void bfq_update_insert_stats(struct request_queue *q, -+ struct bfq_queue *bfqq, -+ bool idle_timer_disabled, -+ unsigned int cmd_flags) {} -+#endif -+ - static void bfq_prepare_request(struct request *rq, struct bio *bio); - - static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, -@@ -4889,10 +4930,8 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - struct request_queue *q = hctx->queue; - struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_queue *bfqq = RQ_BFQQ(rq); --#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) - bool idle_timer_disabled = false; - unsigned int cmd_flags; --#endif - - spin_lock_irq(&bfqd->lock); - if (blk_mq_sched_try_insert_merge(q, rq)) { -@@ -4938,7 +4977,6 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - bfqq = RQ_BFQQ(rq); - } - --#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) - idle_timer_disabled = __bfq_insert_request(bfqd, rq); - /* - * Update bfqq, because, if a queue merge has occurred -@@ -4946,9 +4984,6 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - * redirected into a new queue. 
- */ - bfqq = RQ_BFQQ(rq); --#else -- __bfq_insert_request(bfqd, rq); --#endif - - if (rq_mergeable(rq)) { - elv_rqhash_add(q, rq); -@@ -4956,34 +4991,17 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - q->last_merge = rq; - } - } --#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) -+ - /* - * Cache cmd_flags before releasing scheduler lock, because rq - * may disappear afterwards (for example, because of a request - * merge). - */ - cmd_flags = rq->cmd_flags; --#endif -+ - spin_unlock_irq(&bfqd->lock); --#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) -- if (!bfqq) -- return; -- /* -- * bfqq still exists, because it can disappear only after -- * either it is merged with another queue, or the process it -- * is associated with exits. But both actions must be taken by -- * the same process currently executing this flow of -- * instruction. -- * -- * In addition, the following queue lock guarantees that -- * bfqq_group(bfqq) exists as well. -- */ -- spin_lock_irq(q->queue_lock); -- bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags); -- if (idle_timer_disabled) -- bfqg_stats_update_idle_time(bfqq_group(bfqq)); -- spin_unlock_irq(q->queue_lock); --#endif -+ bfq_update_insert_stats(q, bfqq, idle_timer_disabled, -+ cmd_flags); - } - - static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, - -From 0d0d05632872b226f4fae5e56af8736a4c24bf57 Mon Sep 17 00:00:00 2001 -From: Melzani Alessandro <melzani.alessandro@gmail.com> -Date: Mon, 26 Feb 2018 22:43:30 +0100 -Subject: [PATCH 21/23] bfq-sq, bfq-mq: port of "bfq: Use icq_to_bic() - consistently" - -Some code uses icq_to_bic() to convert an io_cq pointer to a -bfq_io_cq pointer while other code uses a direct cast. Convert -the code that uses a direct cast such that it uses icq_to_bic(). 
- -Signed-off-by: Alessandro Melzani <melzani.alessandro@gmail.com> ---- - block/bfq-mq-iosched.c | 2 +- - block/bfq-sq-iosched.c | 2 +- - 2 files changed, 2 insertions(+), 2 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 5a211620f316..7b1269558c47 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -272,7 +272,7 @@ static const unsigned long max_service_from_wr = 120000; - #define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ - { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) - --#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0]) -+#define RQ_BIC(rq) icq_to_bic((rq)->elv.priv[0]) - #define RQ_BFQQ(rq) ((rq)->elv.priv[1]) - - /** -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index f95deaab49a1..c4aff8d55fc4 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -266,7 +266,7 @@ static const unsigned long max_service_from_wr = 120000; - #define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ - { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) - --#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0]) -+#define RQ_BIC(rq) icq_to_bic((rq)->elv.priv[0]) - #define RQ_BFQQ(rq) ((rq)->elv.priv[1]) - - static void bfq_schedule_dispatch(struct bfq_data *bfqd); - -From 4cb5de6add7d6ad0d25d73cb95dc871305db1522 Mon Sep 17 00:00:00 2001 -From: Melzani Alessandro <melzani.alessandro@gmail.com> -Date: Mon, 26 Feb 2018 22:59:30 +0100 -Subject: [PATCH 22/23] bfq-sq, bfq-mq: port of "block, bfq: fix error handle - in bfq_init" - -if elv_register fail, bfq_pool should be free. 
- -Signed-off-by: Alessandro Melzani <melzani.alessandro@gmail.com> ---- - block/bfq-mq-iosched.c | 4 +++- - block/bfq-sq-iosched.c | 4 +++- - 2 files changed, 6 insertions(+), 2 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 7b1269558c47..964e88c2ce59 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -6129,7 +6129,7 @@ static int __init bfq_init(void) - - ret = elv_register(&iosched_bfq_mq); - if (ret) -- goto err_pol_unreg; -+ goto slab_kill; - - #ifdef BFQ_GROUP_IOSCHED_ENABLED - strcat(msg, " (with cgroups support)"); -@@ -6138,6 +6138,8 @@ static int __init bfq_init(void) - - return 0; - -+slab_kill: -+ bfq_slab_kill(); - err_pol_unreg: - #ifdef BFQ_GROUP_IOSCHED_ENABLED - blkcg_policy_unregister(&blkcg_policy_bfq); -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index c4aff8d55fc4..7f0cf1f01ffc 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -5590,7 +5590,7 @@ static int __init bfq_init(void) - - ret = elv_register(&iosched_bfq); - if (ret) -- goto err_pol_unreg; -+ goto slab_kill; - - #ifdef BFQ_GROUP_IOSCHED_ENABLED - strcat(msg, " (with cgroups support)"); -@@ -5599,6 +5599,8 @@ static int __init bfq_init(void) - - return 0; - -+slab_kill: -+ bfq_slab_kill(); - err_pol_unreg: - #ifdef BFQ_GROUP_IOSCHED_ENABLED - blkcg_policy_unregister(&blkcg_policy_bfq); - -From 1f77c173aaa87ffb22c9f062a6449245d14311e4 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Wed, 4 Apr 2018 11:28:16 +0200 -Subject: [PATCH 23/23] block, bfq-sq, bfq-mq: lower-bound the estimated peak - rate to 1 - -If a storage device handled by BFQ happens to be slower than 7.5 KB/s -for a certain amount of time (in the order of a second), then the -estimated peak rate of the device, maintained in BFQ, becomes equal to -0. 
The reason is the limited precision with which the rate is -represented (details on the range of representable values in the -comments introduced by this commit). This leads to a division-by-zero -error where the estimated peak rate is used as divisor. Such a type of -failure has been reported in [1]. - -This commit addresses this issue by: -1. Lower-bounding the estimated peak rate to 1 -2. Adding and improving comments on the range of rates representable - -[1] https://www.spinics.net/lists/kernel/msg2739205.html - -Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru> -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 25 ++++++++++++++++++++++++- - block/bfq-mq.h | 7 ++++++- - block/bfq-sq-iosched.c | 25 ++++++++++++++++++++++++- - block/bfq.h | 7 ++++++- - 4 files changed, 60 insertions(+), 4 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 964e88c2ce59..03efd90c5d20 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -160,7 +160,20 @@ static struct kmem_cache *bfq_pool; - /* Target observation time interval for a peak-rate update (ns) */ - #define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC - --/* Shift used for peak rate fixed precision calculations. */ -+/* -+ * Shift used for peak-rate fixed precision calculations. 
-+ * With -+ * - the current shift: 16 positions -+ * - the current type used to store rate: u32 -+ * - the current unit of measure for rate: [sectors/usec], or, more precisely, -+ * [(sectors/usec) / 2^BFQ_RATE_SHIFT] to take into account the shift, -+ * the range of rates that can be stored is -+ * [1 / 2^BFQ_RATE_SHIFT, 2^(32 - BFQ_RATE_SHIFT)] sectors/usec = -+ * [1 / 2^16, 2^16] sectors/usec = [15e-6, 65536] sectors/usec = -+ * [15, 65G] sectors/sec -+ * Which, assuming a sector size of 512B, corresponds to a range of -+ * [7.5K, 33T] B/sec -+ */ - #define BFQ_RATE_SHIFT 16 - - /* -@@ -2881,6 +2894,16 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) - BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT); - - bfqd->peak_rate += rate; -+ -+ /* -+ * For a very slow device, bfqd->peak_rate can reach 0 (see -+ * the minimum representable values reported in the comments -+ * on BFQ_RATE_SHIFT). Push to 1 if this happens, to avoid -+ * divisions by zero where bfqd->peak_rate is used as a -+ * divisor. -+ */ -+ bfqd->peak_rate = max_t(u32, 1, bfqd->peak_rate); -+ - update_thr_responsiveness_params(bfqd); - BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT); - -diff --git a/block/bfq-mq.h b/block/bfq-mq.h -index e2ae11bf8f76..4a54e5076863 100644 ---- a/block/bfq-mq.h -+++ b/block/bfq-mq.h -@@ -490,7 +490,12 @@ struct bfq_data { - u32 last_rq_max_size; - /* time elapsed from first dispatch in current observ. interval (us) */ - u64 delta_from_first; -- /* current estimate of device peak rate */ -+ /* -+ * Current estimate of the device peak rate, measured in -+ * [(sectors/usec) / 2^BFQ_RATE_SHIFT]. The left-shift by -+ * BFQ_RATE_SHIFT is performed to increase precision in -+ * fixed-point calculations. 
-+ */ - u32 peak_rate; - - /* maximum budget allotted to a bfq_queue before rescheduling */ -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index 7f0cf1f01ffc..e96213865fc2 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -154,7 +154,20 @@ static struct kmem_cache *bfq_pool; - /* Target observation time interval for a peak-rate update (ns) */ - #define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC - --/* Shift used for peak rate fixed precision calculations. */ -+/* -+ * Shift used for peak-rate fixed precision calculations. -+ * With -+ * - the current shift: 16 positions -+ * - the current type used to store rate: u32 -+ * - the current unit of measure for rate: [sectors/usec], or, more precisely, -+ * [(sectors/usec) / 2^BFQ_RATE_SHIFT] to take into account the shift, -+ * the range of rates that can be stored is -+ * [1 / 2^BFQ_RATE_SHIFT, 2^(32 - BFQ_RATE_SHIFT)] sectors/usec = -+ * [1 / 2^16, 2^16] sectors/usec = [15e-6, 65536] sectors/usec = -+ * [15, 65G] sectors/sec -+ * Which, assuming a sector size of 512B, corresponds to a range of -+ * [7.5K, 33T] B/sec -+ */ - #define BFQ_RATE_SHIFT 16 - - /* -@@ -2695,6 +2708,16 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) - BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT); - - bfqd->peak_rate += rate; -+ -+ /* -+ * For a very slow device, bfqd->peak_rate can reach 0 (see -+ * the minimum representable values reported in the comments -+ * on BFQ_RATE_SHIFT). Push to 1 if this happens, to avoid -+ * divisions by zero where bfqd->peak_rate is used as a -+ * divisor. -+ */ -+ bfqd->peak_rate = max_t(u32, 1, bfqd->peak_rate); -+ - update_thr_responsiveness_params(bfqd); - BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT); - -diff --git a/block/bfq.h b/block/bfq.h -index 4d2fe7f77af1..a25e76c906d9 100644 ---- a/block/bfq.h -+++ b/block/bfq.h -@@ -498,7 +498,12 @@ struct bfq_data { - u32 last_rq_max_size; - /* time elapsed from first dispatch in current observ. 
interval (us) */ - u64 delta_from_first; -- /* current estimate of device peak rate */ -+ /* -+ * Current estimate of the device peak rate, measured in -+ * [(sectors/usec) / 2^BFQ_RATE_SHIFT]. The left-shift by -+ * BFQ_RATE_SHIFT is performed to increase precision in -+ * fixed-point calculations. -+ */ - u32 peak_rate; - - /* maximum budget allotted to a bfq_queue before rescheduling */ diff --git a/sys-kernel/linux-image-redcore-lts/files/4.14-redcore-lts-amd64.config b/sys-kernel/linux-sources-redcore-lts/files/4.14-amd64.config index 23e35863..307b0bd9 100644 --- a/sys-kernel/linux-image-redcore-lts/files/4.14-redcore-lts-amd64.config +++ b/sys-kernel/linux-sources-redcore-lts/files/4.14-amd64.config @@ -1,6 +1,6 @@ # # Automatically generated file; DO NOT EDIT. -# Linux/x86 4.14.90-redcore-lts Kernel Configuration +# Linux/x86 4.14.95-redcore-lts-r1 Kernel Configuration # CONFIG_64BIT=y CONFIG_X86_64=y @@ -432,15 +432,10 @@ CONFIG_IOSCHED_NOOP=y CONFIG_IOSCHED_DEADLINE=y CONFIG_IOSCHED_CFQ=y CONFIG_CFQ_GROUP_IOSCHED=y -CONFIG_IOSCHED_BFQ_SQ=y -CONFIG_BFQ_SQ_GROUP_IOSCHED=y # CONFIG_DEFAULT_DEADLINE is not set -# CONFIG_DEFAULT_CFQ is not set -CONFIG_DEFAULT_BFQ_SQ=y +CONFIG_DEFAULT_CFQ=y # CONFIG_DEFAULT_NOOP is not set -CONFIG_DEFAULT_IOSCHED="bfq-sq" -CONFIG_MQ_IOSCHED_BFQ=y -CONFIG_MQ_BFQ_GROUP_IOSCHED=y +CONFIG_DEFAULT_IOSCHED="cfq" CONFIG_MQ_IOSCHED_DEADLINE=y # CONFIG_MQ_IOSCHED_KYBER is not set CONFIG_IOSCHED_BFQ=y diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.19-redcore-lts-amd64.config b/sys-kernel/linux-sources-redcore-lts/files/4.19-amd64.config index c5bedf65..f0565a81 100644 --- a/sys-kernel/linux-sources-redcore-lts/files/4.19-redcore-lts-amd64.config +++ b/sys-kernel/linux-sources-redcore-lts/files/4.19-amd64.config @@ -1,6 +1,6 @@ # # Automatically generated file; DO NOT EDIT. 
-# Linux/x86 4.19.20-redcore-lts Kernel Configuration +# Linux/x86 4.19.20-redcore-lts-r1 Kernel Configuration # # @@ -937,15 +937,10 @@ CONFIG_IOSCHED_NOOP=y CONFIG_IOSCHED_DEADLINE=y CONFIG_IOSCHED_CFQ=y CONFIG_CFQ_GROUP_IOSCHED=y -CONFIG_IOSCHED_BFQ_SQ=y -CONFIG_BFQ_SQ_GROUP_IOSCHED=y # CONFIG_DEFAULT_DEADLINE is not set -# CONFIG_DEFAULT_CFQ is not set -CONFIG_DEFAULT_BFQ_SQ=y +CONFIG_DEFAULT_CFQ=y # CONFIG_DEFAULT_NOOP is not set -CONFIG_DEFAULT_IOSCHED="bfq-sq" -CONFIG_MQ_IOSCHED_BFQ=y -CONFIG_MQ_BFQ_GROUP_IOSCHED=y +CONFIG_DEFAULT_IOSCHED="cfq" CONFIG_MQ_IOSCHED_DEADLINE=y # CONFIG_MQ_IOSCHED_KYBER is not set CONFIG_IOSCHED_BFQ=y @@ -7564,7 +7559,7 @@ CONFIG_GREYBUS_SDIO=m CONFIG_GREYBUS_SPI=m CONFIG_GREYBUS_UART=m CONFIG_GREYBUS_USB=m -# CONFIG_DRM_VBOXVIDEO is not set +CONFIG_DRM_VBOXVIDEO=m CONFIG_PI433=m CONFIG_MTK_MMC=m # CONFIG_MTK_AEE_KDUMP is not set diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.19-bfq-sq-mq-v9r1-2K190204-rc1.patch b/sys-kernel/linux-sources-redcore-lts/files/4.19-bfq-sq-mq-v9r1-2K190204-rc1.patch deleted file mode 100644 index 039c8fcd..00000000 --- a/sys-kernel/linux-sources-redcore-lts/files/4.19-bfq-sq-mq-v9r1-2K190204-rc1.patch +++ /dev/null @@ -1,18511 +0,0 @@ -diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt -index 8d8d8f06cab2..41d0200944f1 100644 ---- a/Documentation/block/bfq-iosched.txt -+++ b/Documentation/block/bfq-iosched.txt -@@ -1,3 +1,6 @@ -+[ THIS TREE CONTAINS ALSO THE DEV VERSION OF BFQ. -+ DETAILS AT THE END OF THIS DOCUMENT. ] -+ - BFQ (Budget Fair Queueing) - ========================== - -@@ -11,6 +14,15 @@ controllers), BFQ's main features are: - groups (switching back to time distribution when needed to keep - throughput high). 
- -+If bfq-mq patches have been applied, then the following three -+instances of BFQ are available (otherwise only the first instance): -+- bfq: mainline version of BFQ, for blk-mq -+- bfq-mq: development version of BFQ for blk-mq; this version contains -+ also all latest features and fixes not yet landed in mainline, plus many -+ safety checks -+- bfq-sq: BFQ for legacy blk; also this version contains latest features -+ and fixes, as well as safety checks -+ - In its default configuration, BFQ privileges latency over - throughput. So, when needed for achieving a lower latency, BFQ builds - schedules that may lead to a lower throughput. If your main or only -@@ -22,27 +34,42 @@ latency and throughput, or on how to maximize throughput. - - BFQ has a non-null overhead, which limits the maximum IOPS that a CPU - can process for a device scheduled with BFQ. To give an idea of the --limits on slow or average CPUs, here are, first, the limits of BFQ for --three different CPUs, on, respectively, an average laptop, an old --desktop, and a cheap embedded system, in case full hierarchical --support is enabled (i.e., CONFIG_BFQ_GROUP_IOSCHED is set), but -+limits on slow or average CPUs, here are, first, the limits of bfq-mq -+and bfq for three different CPUs, on, respectively, an average laptop, -+an old desktop, and a cheap embedded system, in case full hierarchical -+support is enabled (i.e., CONFIG_MQ_BFQ_GROUP_IOSCHED is set for -+bfq-mq, or CONFIG_BFQ_GROUP_IOSCHED is set for bfq), but - CONFIG_DEBUG_BLK_CGROUP is not set (Section 4-2): - - Intel i7-4850HQ: 400 KIOPS - - AMD A8-3850: 250 KIOPS - - ARM CortexTM-A53 Octa-core: 80 KIOPS - --If CONFIG_DEBUG_BLK_CGROUP is set (and of course full hierarchical --support is enabled), then the sustainable throughput with BFQ --decreases, because all blkio.bfq* statistics are created and updated --(Section 4-2). 
For BFQ, this leads to the following maximum --sustainable throughputs, on the same systems as above: -+As for bfq-sq, it cannot reach the above IOPS, because of the -+inherent, lower parallelism of legacy blk and of the components within -+it (including bfq-sq itself). In particular, results with -+CONFIG_DEBUG_BLK_CGROUP unset are rather fluctuating. The limits -+reported below for the case CONFIG_DEBUG_BLK_CGROUP set will however -+provide a lower bound to the limits of bfq-sq. -+ -+Turning back to bfq-mq and bfq, If CONFIG_DEBUG_BLK_CGROUP is set (and -+of course full hierarchical support is enabled), then the sustainable -+throughput with bfq-mq and bfq decreases, because all blkio.bfq* -+statistics are created and updated (Section 4-2). For bfq-mq and bfq, -+this leads to the following maximum sustainable throughputs, on the -+same systems as above: - - Intel i7-4850HQ: 310 KIOPS - - AMD A8-3850: 200 KIOPS - - ARM CortexTM-A53 Octa-core: 56 KIOPS - --BFQ works for multi-queue devices too. -+Finally, if CONFIG_DEBUG_BLK_CGROUP is set (and full hierarchical -+support is enabled), then bfq-sq exhibits the following limits: -+- Intel i7-4850HQ: 250 KIOPS -+- AMD A8-3850: 170 KIOPS -+- ARM CortexTM-A53 Octa-core: 45 KIOPS - --The table of contents follow. Impatients can just jump to Section 3. -+BFQ works for multi-queue devices too (bfq and bfq-mq instances). -+ -+The table of contents follows. Impatients can just jump to Section 3. - - CONTENTS - -@@ -509,25 +536,27 @@ To get proportional sharing of bandwidth with BFQ for a given device, - BFQ must of course be the active scheduler for that device. - - Within each group directory, the names of the files associated with --BFQ-specific cgroup parameters and stats begin with the "bfq." --prefix. So, with cgroups-v1 or cgroups-v2, the full prefix for --BFQ-specific files is "blkio.bfq." or "io.bfq." 
For example, the group --parameter to set the weight of a group with BFQ is blkio.bfq.weight -+BFQ-specific cgroup parameters and stats begin with the "bfq.", -+"bfq-sq." or "bfq-mq." prefix, depending on which instance of bfq you -+want to use. So, with cgroups-v1 or cgroups-v2, the full prefix for -+BFQ-specific files is "blkio.bfqX." or "io.bfqX.", where X can be "" -+(i.e., null string), "-sq" or "-mq". For example, the group parameter -+to set the weight of a group with the mainline BFQ is blkio.bfq.weight - or io.bfq.weight. - - As for cgroups-v1 (blkio controller), the exact set of stat files --created, and kept up-to-date by bfq, depends on whether --CONFIG_DEBUG_BLK_CGROUP is set. If it is set, then bfq creates all -+created, and kept up-to-date by bfq*, depends on whether -+CONFIG_DEBUG_BLK_CGROUP is set. If it is set, then bfq* creates all - the stat files documented in - Documentation/cgroup-v1/blkio-controller.txt. If, instead, --CONFIG_DEBUG_BLK_CGROUP is not set, then bfq creates only the files --blkio.bfq.io_service_bytes --blkio.bfq.io_service_bytes_recursive --blkio.bfq.io_serviced --blkio.bfq.io_serviced_recursive -+CONFIG_DEBUG_BLK_CGROUP is not set, then bfq* creates only the files -+blkio.bfq*.io_service_bytes -+blkio.bfq*.io_service_bytes_recursive -+blkio.bfq*.io_serviced -+blkio.bfq*.io_serviced_recursive - - The value of CONFIG_DEBUG_BLK_CGROUP greatly influences the maximum --throughput sustainable with bfq, because updating the blkio.bfq.* -+throughput sustainable with bfq*, because updating the blkio.bfq* - stats is rather costly, especially for some of the stats enabled by - CONFIG_DEBUG_BLK_CGROUP. - -@@ -536,7 +565,7 @@ Parameters to set - - For each group, there is only the following parameter to set. - --weight (namely blkio.bfq.weight or io.bfq-weight): the weight of the -+weight (namely blkio.bfqX.weight or io.bfqX.weight): the weight of the - group inside its parent. Available values: 1..10000 (default 100). 
The - linear mapping between ioprio and weights, described at the beginning - of the tunable section, is still valid, but all weights higher than -@@ -559,3 +588,55 @@ applications. Unset this tunable if you need/want to control weights. - Slightly extended version: - http://algogroup.unimore.it/people/paolo/disk_sched/bfq-v1-suite- - results.pdf -+ -+---------------------------------------------------------------------- -+ -+DETAILS ON THE DEV VERSIONS IN THIS TREE -+ -+The dev version of BFQ is available for both the legacy and the -+multi-queue block layers, as two additional I/O schedulers, named, -+respectively, bfq-sq-iosched and bfq-mq-iosched (the latter is -+available if also the changes introducing bfq-mq-iosched have been -+applied). In particular, this tree contains the dev version of bfq for -+Linux mainline 4.19.0, and has been obtained from the dev version for -+Linux 4.18.0. Rebasing from 4.18 to 4.19 involved two manual -+interventions. -+ -+First, some conflicts had to be resolved, as follows: -+ -+--------------------------------------------------------------- -+ -+diff --cc Makefile -+index 7727c1bf6fa5,69fa5c0310d8..c7cbdf0ad594 -+--- a/Makefile -++++ b/Makefile -+@@@ -1,9 -1,9 +1,9 @@@ -+ # SPDX-License-Identifier: GPL-2.0 -+ VERSION = 4 -+- PATCHLEVEL = 18 -++ PATCHLEVEL = 19 -+ SUBLEVEL = 0 -+ -EXTRAVERSION = -+ +EXTRAVERSION = -bfq-mq -+- NAME = Merciless Moray -++ NAME = "People's Front" -+ -+ # *DOCUMENTATION* -+ # To see a list of typical targets execute "make help" -+diff --cc include/linux/blkdev.h -+index 897c63322bd7,6980014357d4..8c4568ea6884 -+--- a/include/linux/blkdev.h -++++ b/include/linux/blkdev.h -+@@@ -56,7 -54,7 +54,7 @@@ struct blk_stat_callback -+ * Maximum number of blkcg policies allowed to be registered concurrently. -+ * Defined here to simplify include dependency. 
-+ */ -+--#define BLKCG_MAX_POLS 5 -+++#define BLKCG_MAX_POLS 7 -+ -+ typedef void (rq_end_io_fn)(struct request *, blk_status_t); -+ -+--------------------------------------------------------------- -+ -+Second, the following port commit had to be made: -+port commit "block: use ktime_get_ns() instead of sched_clock() for cfq and bfq" -diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig -index e32fc1f274d8..94cb28eb20ba 100644 ---- a/arch/x86/configs/x86_64_defconfig -+++ b/arch/x86/configs/x86_64_defconfig -@@ -12,6 +12,11 @@ CONFIG_NO_HZ=y - CONFIG_HIGH_RES_TIMERS=y - CONFIG_LOG_BUF_SHIFT=18 - CONFIG_CGROUPS=y -+CONFIG_BLK_CGROUP=y -+CONFIG_IOSCHED_BFQ_SQ=y -+CONFIG_BFQ_SQ_GROUP_IOSCHED=y -+CONFIG_MQ_IOSCHED_BFQ=y -+CONFIG_MQ_BFQ_GROUP_IOSCHED=y - CONFIG_CGROUP_FREEZER=y - CONFIG_CPUSETS=y - CONFIG_CGROUP_CPUACCT=y -diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched -index a4a8914bf7a4..299a6861fb90 100644 ---- a/block/Kconfig.iosched -+++ b/block/Kconfig.iosched -@@ -40,6 +40,26 @@ config CFQ_GROUP_IOSCHED - ---help--- - Enable group IO scheduling in CFQ. - -+config IOSCHED_BFQ_SQ -+ tristate "BFQ-SQ I/O scheduler" -+ default n -+ ---help--- -+ The BFQ-SQ I/O scheduler (for legacy blk: SQ stands for -+ SingleQueue) distributes bandwidth among all processes -+ according to their weights, regardless of the device -+ parameters and with any workload. It also guarantees a low -+ latency to interactive and soft real-time applications. -+ Details in Documentation/block/bfq-iosched.txt -+ -+config BFQ_SQ_GROUP_IOSCHED -+ bool "BFQ-SQ hierarchical scheduling support" -+ depends on IOSCHED_BFQ_SQ && BLK_CGROUP -+ default n -+ ---help--- -+ -+ Enable hierarchical scheduling in BFQ-SQ, using the blkio -+ (cgroups-v1) or io (cgroups-v2) controller. 
-+ - choice - - prompt "Default I/O scheduler" -@@ -54,6 +74,16 @@ choice - config DEFAULT_CFQ - bool "CFQ" if IOSCHED_CFQ=y - -+ config DEFAULT_BFQ_SQ -+ bool "BFQ-SQ" if IOSCHED_BFQ_SQ=y -+ help -+ Selects BFQ-SQ as the default I/O scheduler which will be -+ used by default for all block devices. -+ The BFQ-SQ I/O scheduler aims at distributing the bandwidth -+ as desired, independently of the disk parameters and with -+ any workload. It also tries to guarantee low latency to -+ interactive and soft real-time applications. -+ - config DEFAULT_NOOP - bool "No-op" - -@@ -63,8 +93,28 @@ config DEFAULT_IOSCHED - string - default "deadline" if DEFAULT_DEADLINE - default "cfq" if DEFAULT_CFQ -+ default "bfq-sq" if DEFAULT_BFQ_SQ - default "noop" if DEFAULT_NOOP - -+config MQ_IOSCHED_BFQ -+ tristate "BFQ-MQ I/O Scheduler" -+ default y -+ ---help--- -+ BFQ I/O scheduler for BLK-MQ. BFQ-MQ distributes bandwidth -+ among all processes according to their weights, regardless of -+ the device parameters and with any workload. It also -+ guarantees a low latency to interactive and soft real-time -+ applications. Details in Documentation/block/bfq-iosched.txt -+ -+config MQ_BFQ_GROUP_IOSCHED -+ bool "BFQ-MQ hierarchical scheduling support" -+ depends on MQ_IOSCHED_BFQ && BLK_CGROUP -+ default n -+ ---help--- -+ -+ Enable hierarchical scheduling in BFQ-MQ, using the blkio -+ (cgroups-v1) or io (cgroups-v2) controller. 
-+ - config MQ_IOSCHED_DEADLINE - tristate "MQ deadline I/O scheduler" - default y -diff --git a/block/Makefile b/block/Makefile -index 572b33f32c07..1dd6ffdc2fee 100644 ---- a/block/Makefile -+++ b/block/Makefile -@@ -25,6 +25,8 @@ obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o - obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o - bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o - obj-$(CONFIG_IOSCHED_BFQ) += bfq.o -+obj-$(CONFIG_IOSCHED_BFQ_SQ) += bfq-sq-iosched.o -+obj-$(CONFIG_MQ_IOSCHED_BFQ) += bfq-mq-iosched.o - - obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o - obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o -diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c -new file mode 100644 -index 000000000000..15459e50cd6a ---- /dev/null -+++ b/block/bfq-cgroup-included.c -@@ -0,0 +1,1359 @@ -+/* -+ * BFQ: CGROUPS support. -+ * -+ * Based on ideas and code from CFQ: -+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> -+ * -+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> -+ * Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2016 Paolo Valente <paolo.valente@linaro.org> -+ * -+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ -+ * file. 
-+ */ -+ -+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) -+ -+/* bfqg stats flags */ -+enum bfqg_stats_flags { -+ BFQG_stats_waiting = 0, -+ BFQG_stats_idling, -+ BFQG_stats_empty, -+}; -+ -+#define BFQG_FLAG_FNS(name) \ -+static void bfqg_stats_mark_##name(struct bfqg_stats *stats) \ -+{ \ -+ stats->flags |= (1 << BFQG_stats_##name); \ -+} \ -+static void bfqg_stats_clear_##name(struct bfqg_stats *stats) \ -+{ \ -+ stats->flags &= ~(1 << BFQG_stats_##name); \ -+} \ -+static int bfqg_stats_##name(struct bfqg_stats *stats) \ -+{ \ -+ return (stats->flags & (1 << BFQG_stats_##name)) != 0; \ -+} \ -+ -+BFQG_FLAG_FNS(waiting) -+BFQG_FLAG_FNS(idling) -+BFQG_FLAG_FNS(empty) -+#undef BFQG_FLAG_FNS -+ -+#ifdef BFQ_MQ -+/* This should be called with the scheduler lock held. */ -+#else -+/* This should be called with the queue_lock held. */ -+#endif -+static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) -+{ -+ u64 now; -+ -+ if (!bfqg_stats_waiting(stats)) -+ return; -+ -+ now = ktime_get_ns(); -+ if (now > stats->start_group_wait_time) -+ blkg_stat_add(&stats->group_wait_time, -+ now - stats->start_group_wait_time); -+ bfqg_stats_clear_waiting(stats); -+} -+ -+#ifdef BFQ_MQ -+/* This should be called with the scheduler lock held. */ -+#else -+/* This should be called with the queue_lock held. */ -+#endif -+static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, -+ struct bfq_group *curr_bfqg) -+{ -+ struct bfqg_stats *stats = &bfqg->stats; -+ -+ if (bfqg_stats_waiting(stats)) -+ return; -+ if (bfqg == curr_bfqg) -+ return; -+ stats->start_group_wait_time = ktime_get_ns(); -+ bfqg_stats_mark_waiting(stats); -+} -+ -+#ifdef BFQ_MQ -+/* This should be called with the scheduler lock held. */ -+#else -+/* This should be called with the queue_lock held. 
*/ -+#endif -+static void bfqg_stats_end_empty_time(struct bfqg_stats *stats) -+{ -+ u64 now; -+ -+ if (!bfqg_stats_empty(stats)) -+ return; -+ -+ now = ktime_get_ns(); -+ if (now > stats->start_empty_time) -+ blkg_stat_add(&stats->empty_time, -+ now - stats->start_empty_time); -+ bfqg_stats_clear_empty(stats); -+} -+ -+static void bfqg_stats_update_dequeue(struct bfq_group *bfqg) -+{ -+ blkg_stat_add(&bfqg->stats.dequeue, 1); -+} -+ -+static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) -+{ -+ struct bfqg_stats *stats = &bfqg->stats; -+ -+ if (blkg_rwstat_total(&stats->queued)) -+ return; -+ -+ /* -+ * group is already marked empty. This can happen if bfqq got new -+ * request in parent group and moved to this group while being added -+ * to service tree. Just ignore the event and move on. -+ */ -+ if (bfqg_stats_empty(stats)) -+ return; -+ -+ stats->start_empty_time = ktime_get_ns(); -+ bfqg_stats_mark_empty(stats); -+} -+ -+static void bfqg_stats_update_idle_time(struct bfq_group *bfqg) -+{ -+ struct bfqg_stats *stats = &bfqg->stats; -+ -+ if (bfqg_stats_idling(stats)) { -+ u64 now = ktime_get_ns(); -+ -+ if (now > stats->start_idle_time) -+ blkg_stat_add(&stats->idle_time, -+ now - stats->start_idle_time); -+ bfqg_stats_clear_idling(stats); -+ } -+} -+ -+static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) -+{ -+ struct bfqg_stats *stats = &bfqg->stats; -+ -+ stats->start_idle_time = ktime_get_ns(); -+ bfqg_stats_mark_idling(stats); -+} -+ -+static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) -+{ -+ struct bfqg_stats *stats = &bfqg->stats; -+ -+ blkg_stat_add(&stats->avg_queue_size_sum, -+ blkg_rwstat_total(&stats->queued)); -+ blkg_stat_add(&stats->avg_queue_size_samples, 1); -+ bfqg_stats_update_group_wait_time(stats); -+} -+ -+static void bfqg_stats_update_io_add(struct bfq_group *bfqg, -+ struct bfq_queue *bfqq, -+ unsigned int op) -+{ -+ blkg_rwstat_add(&bfqg->stats.queued, op, 1); -+ 
bfqg_stats_end_empty_time(&bfqg->stats); -+ if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue)) -+ bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); -+} -+ -+static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) -+{ -+ blkg_rwstat_add(&bfqg->stats.queued, op, -1); -+} -+ -+static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) -+{ -+ blkg_rwstat_add(&bfqg->stats.merged, op, 1); -+} -+ -+static void bfqg_stats_update_completion(struct bfq_group *bfqg, -+ u64 start_time_ns, -+ u64 io_start_time_ns, -+ unsigned int op) -+{ -+ struct bfqg_stats *stats = &bfqg->stats; -+ u64 now = ktime_get_ns(); -+ -+ if (now > io_start_time_ns) -+ blkg_rwstat_add(&stats->service_time, op, -+ now - io_start_time_ns); -+ if (io_start_time_ns > start_time_ns) -+ blkg_rwstat_add(&stats->wait_time, op, -+ io_start_time_ns - start_time_ns); -+} -+ -+#else /* BFQ_GROUP_IOSCHED_ENABLED && CONFIG_DEBUG_BLK_CGROUP */ -+ -+static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg, -+ struct bfq_queue *bfqq, unsigned int op) { } -+static inline void -+bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { } -+static inline void -+bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { } -+static inline void bfqg_stats_update_completion(struct bfq_group *bfqg, -+ u64 start_time_ns, -+ u64 io_start_time_ns, -+ unsigned int op) { } -+static inline void -+bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, -+ struct bfq_group *curr_bfqg) { } -+static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { } -+static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } -+static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { } -+static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { } -+static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } -+static inline void 
bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { } -+ -+#endif /* BFQ_GROUP_IOSCHED_ENABLED && CONFIG_DEBUG_BLK_CGROUP */ -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+static struct blkcg_policy blkcg_policy_bfq; -+ -+/* -+ * blk-cgroup policy-related handlers -+ * The following functions help in converting between blk-cgroup -+ * internal structures and BFQ-specific structures. -+ */ -+ -+static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd) -+{ -+ return pd ? container_of(pd, struct bfq_group, pd) : NULL; -+} -+ -+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg) -+{ -+ return pd_to_blkg(&bfqg->pd); -+} -+ -+static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg) -+{ -+ struct blkg_policy_data *pd = blkg_to_pd(blkg, &blkcg_policy_bfq); -+ -+ return pd_to_bfqg(pd); -+} -+ -+/* -+ * bfq_group handlers -+ * The following functions help in navigating the bfq_group hierarchy -+ * by allowing to find the parent of a bfq_group or the bfq_group -+ * associated to a bfq_queue. -+ */ -+ -+static struct bfq_group *bfqg_parent(struct bfq_group *bfqg) -+{ -+ struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent; -+ -+ return pblkg ? blkg_to_bfqg(pblkg) : NULL; -+} -+ -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *group_entity = bfqq->entity.parent; -+ -+ return group_entity ? container_of(group_entity, struct bfq_group, -+ entity) : -+ bfqq->bfqd->root_group; -+} -+ -+/* -+ * The following two functions handle get and put of a bfq_group by -+ * wrapping the related blk-cgroup hooks. 
-+ */ -+ -+static void bfqg_get(struct bfq_group *bfqg) -+{ -+#ifdef BFQ_MQ -+ bfqg->ref++; -+#else -+ blkg_get(bfqg_to_blkg(bfqg)); -+#endif -+} -+ -+static void bfqg_put(struct bfq_group *bfqg) -+{ -+#ifdef BFQ_MQ -+ bfqg->ref--; -+ -+ BUG_ON(bfqg->ref < 0); -+ if (bfqg->ref == 0) -+ kfree(bfqg); -+#else -+ blkg_put(bfqg_to_blkg(bfqg)); -+#endif -+} -+ -+#ifdef BFQ_MQ -+static void bfqg_and_blkg_get(struct bfq_group *bfqg) -+{ -+ /* see comments in bfq_bic_update_cgroup for why refcounting bfqg */ -+ bfqg_get(bfqg); -+ -+ blkg_get(bfqg_to_blkg(bfqg)); -+} -+ -+static void bfqg_and_blkg_put(struct bfq_group *bfqg) -+{ -+ blkg_put(bfqg_to_blkg(bfqg)); -+ -+ bfqg_put(bfqg); -+} -+#endif -+ -+/* @stats = 0 */ -+static void bfqg_stats_reset(struct bfqg_stats *stats) -+{ -+#ifdef CONFIG_DEBUG_BLK_CGROUP -+ /* queued stats shouldn't be cleared */ -+ blkg_rwstat_reset(&stats->merged); -+ blkg_rwstat_reset(&stats->service_time); -+ blkg_rwstat_reset(&stats->wait_time); -+ blkg_stat_reset(&stats->time); -+ blkg_stat_reset(&stats->avg_queue_size_sum); -+ blkg_stat_reset(&stats->avg_queue_size_samples); -+ blkg_stat_reset(&stats->dequeue); -+ blkg_stat_reset(&stats->group_wait_time); -+ blkg_stat_reset(&stats->idle_time); -+ blkg_stat_reset(&stats->empty_time); -+#endif -+} -+ -+/* @to += @from */ -+static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from) -+{ -+ if (!to || !from) -+ return; -+ -+#ifdef CONFIG_DEBUG_BLK_CGROUP -+ /* queued stats shouldn't be cleared */ -+ blkg_rwstat_add_aux(&to->merged, &from->merged); -+ blkg_rwstat_add_aux(&to->service_time, &from->service_time); -+ blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); -+ blkg_stat_add_aux(&from->time, &from->time); -+ blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); -+ blkg_stat_add_aux(&to->avg_queue_size_samples, -+ &from->avg_queue_size_samples); -+ blkg_stat_add_aux(&to->dequeue, &from->dequeue); -+ blkg_stat_add_aux(&to->group_wait_time, 
&from->group_wait_time); -+ blkg_stat_add_aux(&to->idle_time, &from->idle_time); -+ blkg_stat_add_aux(&to->empty_time, &from->empty_time); -+#endif -+} -+ -+/* -+ * Transfer @bfqg's stats to its parent's dead_stats so that the ancestors' -+ * recursive stats can still account for the amount used by this bfqg after -+ * it's gone. -+ */ -+static void bfqg_stats_xfer_dead(struct bfq_group *bfqg) -+{ -+ struct bfq_group *parent; -+ -+ if (!bfqg) /* root_group */ -+ return; -+ -+ parent = bfqg_parent(bfqg); -+ -+ lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock); -+ -+ if (unlikely(!parent)) -+ return; -+ -+ bfqg_stats_add_aux(&parent->stats, &bfqg->stats); -+ bfqg_stats_reset(&bfqg->stats); -+} -+ -+static void bfq_init_entity(struct bfq_entity *entity, -+ struct bfq_group *bfqg) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ entity->weight = entity->new_weight; -+ entity->orig_weight = entity->new_weight; -+ if (bfqq) { -+ bfqq->ioprio = bfqq->new_ioprio; -+ bfqq->ioprio_class = bfqq->new_ioprio_class; -+#ifdef BFQ_MQ -+ /* -+ * Make sure that bfqg and its associated blkg do not -+ * disappear before entity. 
-+ */ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "getting bfqg %p and blkg\n", -+ bfqg); -+ -+ bfqg_and_blkg_get(bfqg); -+#else -+ bfqg_get(bfqg); -+#endif -+ } -+ entity->parent = bfqg->my_entity; /* NULL for root group */ -+ entity->sched_data = &bfqg->sched_data; -+} -+ -+static void bfqg_stats_exit(struct bfqg_stats *stats) -+{ -+#ifdef CONFIG_DEBUG_BLK_CGROUP -+ blkg_rwstat_exit(&stats->merged); -+ blkg_rwstat_exit(&stats->service_time); -+ blkg_rwstat_exit(&stats->wait_time); -+ blkg_rwstat_exit(&stats->queued); -+ blkg_stat_exit(&stats->time); -+ blkg_stat_exit(&stats->avg_queue_size_sum); -+ blkg_stat_exit(&stats->avg_queue_size_samples); -+ blkg_stat_exit(&stats->dequeue); -+ blkg_stat_exit(&stats->group_wait_time); -+ blkg_stat_exit(&stats->idle_time); -+ blkg_stat_exit(&stats->empty_time); -+#endif -+} -+ -+static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) -+{ -+#ifdef CONFIG_DEBUG_BLK_CGROUP -+ if (blkg_rwstat_init(&stats->merged, gfp) || -+ blkg_rwstat_init(&stats->service_time, gfp) || -+ blkg_rwstat_init(&stats->wait_time, gfp) || -+ blkg_rwstat_init(&stats->queued, gfp) || -+ blkg_stat_init(&stats->time, gfp) || -+ blkg_stat_init(&stats->avg_queue_size_sum, gfp) || -+ blkg_stat_init(&stats->avg_queue_size_samples, gfp) || -+ blkg_stat_init(&stats->dequeue, gfp) || -+ blkg_stat_init(&stats->group_wait_time, gfp) || -+ blkg_stat_init(&stats->idle_time, gfp) || -+ blkg_stat_init(&stats->empty_time, gfp)) { -+ bfqg_stats_exit(stats); -+ return -ENOMEM; -+ } -+#endif -+ -+ return 0; -+} -+ -+static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd) -+{ -+ return cpd ? 
container_of(cpd, struct bfq_group_data, pd) : NULL; -+} -+ -+static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg) -+{ -+ return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq)); -+} -+ -+static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp) -+{ -+ struct bfq_group_data *bgd; -+ -+ bgd = kzalloc(sizeof(*bgd), gfp); -+ if (!bgd) -+ return NULL; -+ return &bgd->pd; -+} -+ -+static void bfq_cpd_init(struct blkcg_policy_data *cpd) -+{ -+ struct bfq_group_data *d = cpd_to_bfqgd(cpd); -+ -+ d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ? -+ CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL; -+} -+ -+static void bfq_cpd_free(struct blkcg_policy_data *cpd) -+{ -+ kfree(cpd_to_bfqgd(cpd)); -+} -+ -+static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) -+{ -+ struct bfq_group *bfqg; -+ -+ bfqg = kzalloc_node(sizeof(*bfqg), gfp, node); -+ if (!bfqg) -+ return NULL; -+ -+ if (bfqg_stats_init(&bfqg->stats, gfp)) { -+ kfree(bfqg); -+ return NULL; -+ } -+#ifdef BFQ_MQ -+ /* see comments in bfq_bic_update_cgroup for why refcounting */ -+ bfqg_get(bfqg); -+#endif -+ return &bfqg->pd; -+} -+ -+static void bfq_pd_init(struct blkg_policy_data *pd) -+{ -+ struct blkcg_gq *blkg; -+ struct bfq_group *bfqg; -+ struct bfq_data *bfqd; -+ struct bfq_entity *entity; -+ struct bfq_group_data *d; -+ -+ blkg = pd_to_blkg(pd); -+ BUG_ON(!blkg); -+ bfqg = blkg_to_bfqg(blkg); -+ bfqd = blkg->q->elevator->elevator_data; -+ BUG_ON(bfqg == bfqd->root_group); -+ entity = &bfqg->entity; -+ d = blkcg_to_bfqgd(blkg->blkcg); -+ -+ entity->orig_weight = entity->weight = entity->new_weight = d->weight; -+ entity->my_sched_data = &bfqg->sched_data; -+ bfqg->my_entity = entity; /* -+ * the root_group's will be set to NULL -+ * in bfq_init_queue() -+ */ -+ bfqg->bfqd = bfqd; -+ bfqg->active_entities = 0; -+ bfqg->rq_pos_tree = RB_ROOT; -+} -+ -+static void bfq_pd_free(struct blkg_policy_data *pd) -+{ -+ struct bfq_group *bfqg = pd_to_bfqg(pd); -+ -+ bfqg_stats_exit(&bfqg->stats); 
-+#ifdef BFQ_MQ -+ bfqg_put(bfqg); -+#else -+ kfree(bfqg); -+#endif -+} -+ -+static void bfq_pd_reset_stats(struct blkg_policy_data *pd) -+{ -+ struct bfq_group *bfqg = pd_to_bfqg(pd); -+ -+ bfqg_stats_reset(&bfqg->stats); -+} -+ -+static void bfq_group_set_parent(struct bfq_group *bfqg, -+ struct bfq_group *parent) -+{ -+ struct bfq_entity *entity; -+ -+ BUG_ON(!parent); -+ BUG_ON(!bfqg); -+ BUG_ON(bfqg == parent); -+ -+ entity = &bfqg->entity; -+ entity->parent = parent->my_entity; -+ entity->sched_data = &parent->sched_data; -+} -+ -+static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd, -+ struct blkcg *blkcg) -+{ -+ struct blkcg_gq *blkg; -+ -+ blkg = blkg_lookup(blkcg, bfqd->queue); -+ if (likely(blkg)) -+ return blkg_to_bfqg(blkg); -+ return NULL; -+} -+ -+static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, -+ struct blkcg *blkcg) -+{ -+ struct bfq_group *bfqg, *parent; -+ struct bfq_entity *entity; -+ -+ bfqg = bfq_lookup_bfqg(bfqd, blkcg); -+ -+ if (unlikely(!bfqg)) -+ return NULL; -+ -+ /* -+ * Update chain of bfq_groups as we might be handling a leaf group -+ * which, along with some of its relatives, has not been hooked yet -+ * to the private hierarchy of BFQ. -+ */ -+ entity = &bfqg->entity; -+ for_each_entity(entity) { -+ bfqg = container_of(entity, struct bfq_group, entity); -+ BUG_ON(!bfqg); -+ if (bfqg != bfqd->root_group) { -+ parent = bfqg_parent(bfqg); -+ if (!parent) -+ parent = bfqd->root_group; -+ BUG_ON(!parent); -+ bfq_group_set_parent(bfqg, parent); -+ } -+ } -+ -+ return bfqg; -+} -+ -+static void bfq_pos_tree_add_move(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq); -+ -+static void bfq_bfqq_expire(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ bool compensate, -+ enum bfqq_expiration reason); -+ -+/** -+ * bfq_bfqq_move - migrate @bfqq to @bfqg. -+ * @bfqd: queue descriptor. -+ * @bfqq: the queue to move. -+ * @bfqg: the group to move to. 
-+ * -+ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating -+ * it on the new one. Avoid putting the entity on the old group idle tree. -+ * -+#ifdef BFQ_MQ -+ * Must be called under the scheduler lock, to make sure that the blkg -+ * owning @bfqg does not disappear (see comments in -+ * bfq_bic_update_cgroup on guaranteeing the consistency of blkg -+ * objects). -+#else -+ * Must be called under the queue lock; the cgroup owning @bfqg must -+ * not disappear (by now this just means that we are called under -+ * rcu_read_lock()). -+#endif -+ */ -+static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct bfq_group *bfqg) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ BUG_ON(!bfq_bfqq_busy(bfqq) && !RB_EMPTY_ROOT(&bfqq->sort_list)); -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list) && !entity->on_st); -+ BUG_ON(bfq_bfqq_busy(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) -+ && entity->on_st && -+ bfqq != bfqd->in_service_queue); -+ BUG_ON(!bfq_bfqq_busy(bfqq) && bfqq == bfqd->in_service_queue); -+ -+ /* If bfqq is empty, then bfq_bfqq_expire also invokes -+ * bfq_del_bfqq_busy, thereby removing bfqq and its entity -+ * from data structures related to current group. Otherwise we -+ * need to remove bfqq explicitly with bfq_deactivate_bfqq, as -+ * we do below. 
-+ */ -+ if (bfqq == bfqd->in_service_queue) -+ bfq_bfqq_expire(bfqd, bfqd->in_service_queue, -+ false, BFQ_BFQQ_PREEMPTED); -+ -+ BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq) -+ && &bfq_entity_service_tree(entity)->idle != -+ entity->tree); -+ -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq)); -+ -+ if (bfq_bfqq_busy(bfqq)) -+ bfq_deactivate_bfqq(bfqd, bfqq, false, false); -+ else if (entity->on_st) { -+ BUG_ON(&bfq_entity_service_tree(entity)->idle != -+ entity->tree); -+ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); -+ } -+#ifdef BFQ_MQ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "putting blkg and bfqg %p\n", bfqg); -+ -+ bfqg_and_blkg_put(bfqq_group(bfqq)); -+#else -+ bfqg_put(bfqq_group(bfqq)); -+#endif -+ -+ entity->parent = bfqg->my_entity; -+ entity->sched_data = &bfqg->sched_data; -+#ifdef BFQ_MQ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "getting blkg and bfqg %p\n", bfqg); -+ -+ /* pin down bfqg and its associated blkg */ -+ bfqg_and_blkg_get(bfqg); -+#else -+ bfqg_get(bfqg); -+#endif -+ -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq)); -+ if (bfq_bfqq_busy(bfqq)) { -+ bfq_pos_tree_add_move(bfqd, bfqq); -+ bfq_activate_bfqq(bfqd, bfqq); -+ } -+ -+ if (!bfqd->in_service_queue && !bfqd->rq_in_driver) -+ bfq_schedule_dispatch(bfqd); -+ BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq) -+ && &bfq_entity_service_tree(entity)->idle != -+ entity->tree); -+} -+ -+/** -+ * __bfq_bic_change_cgroup - move @bic to @cgroup. -+ * @bfqd: the queue descriptor. -+ * @bic: the bic to move. -+ * @blkcg: the blk-cgroup to move to. -+ * -+#ifdef BFQ_MQ -+ * Move bic to blkcg, assuming that bfqd->lock is held; which makes -+ * sure that the reference to cgroup is valid across the call (see -+ * comments in bfq_bic_update_cgroup on this issue) -+#else -+ * Move bic to blkcg, assuming that bfqd->queue is locked; the caller -+ * has to make sure that the reference to cgroup is valid across the call. 
-+#endif -+ * -+ * NOTE: an alternative approach might have been to store the current -+ * cgroup in bfqq and getting a reference to it, reducing the lookup -+ * time here, at the price of slightly more complex code. -+ */ -+static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, -+ struct bfq_io_cq *bic, -+ struct blkcg *blkcg) -+{ -+ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0); -+ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1); -+ struct bfq_group *bfqg; -+ struct bfq_entity *entity; -+ -+ bfqg = bfq_find_set_group(bfqd, blkcg); -+ -+ if (unlikely(!bfqg)) -+ bfqg = bfqd->root_group; -+ -+ if (async_bfqq) { -+ entity = &async_bfqq->entity; -+ -+ if (entity->sched_data != &bfqg->sched_data) { -+ bic_set_bfqq(bic, NULL, 0); -+ bfq_log_bfqq(bfqd, async_bfqq, -+ "%p %d", -+ async_bfqq, -+ async_bfqq->ref); -+ bfq_put_queue(async_bfqq); -+ } -+ } -+ -+ if (sync_bfqq) { -+ entity = &sync_bfqq->entity; -+ if (entity->sched_data != &bfqg->sched_data) -+ bfq_bfqq_move(bfqd, sync_bfqq, bfqg); -+ } -+ -+ return bfqg; -+} -+ -+static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) -+{ -+ struct bfq_data *bfqd = bic_to_bfqd(bic); -+ struct bfq_group *bfqg = NULL; -+ uint64_t serial_nr; -+ -+ rcu_read_lock(); -+ serial_nr = bio_blkcg(bio)->css.serial_nr; -+ -+ /* -+ * Check whether blkcg has changed. The condition may trigger -+ * spuriously on a newly created cic but there's no harm. -+ */ -+ if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr)) -+ goto out; -+ -+ bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio)); -+#ifdef BFQ_MQ -+ /* -+ * Update blkg_path for bfq_log_* functions. We cache this -+ * path, and update it here, for the following -+ * reasons. Operations on blkg objects in blk-cgroup are -+ * protected with the request_queue lock, and not with the -+ * lock that protects the instances of this scheduler -+ * (bfqd->lock). This exposes BFQ to the following sort of -+ * race. 
-+ * -+ * The blkg_lookup performed in bfq_get_queue, protected -+ * through rcu, may happen to return the address of a copy of -+ * the original blkg. If this is the case, then the -+ * bfqg_and_blkg_get performed in bfq_get_queue, to pin down -+ * the blkg, is useless: it does not prevent blk-cgroup code -+ * from destroying both the original blkg and all objects -+ * directly or indirectly referred by the copy of the -+ * blkg. -+ * -+ * On the bright side, destroy operations on a blkg invoke, as -+ * a first step, hooks of the scheduler associated with the -+ * blkg. And these hooks are executed with bfqd->lock held for -+ * BFQ. As a consequence, for any blkg associated with the -+ * request queue this instance of the scheduler is attached -+ * to, we are guaranteed that such a blkg is not destroyed, and -+ * that all the pointers it contains are consistent, while we -+ * are holding bfqd->lock. A blkg_lookup performed with -+ * bfqd->lock held then returns a fully consistent blkg, which -+ * remains consistent until this lock is held. -+ * -+ * Thanks to the last fact, and to the fact that: (1) bfqg has -+ * been obtained through a blkg_lookup in the above -+ * assignment, and (2) bfqd->lock is being held, here we can -+ * safely use the policy data for the involved blkg (i.e., the -+ * field bfqg->pd) to get to the blkg associated with bfqg, -+ * and then we can safely use any field of blkg. After we -+ * release bfqd->lock, even just getting blkg through this -+ * bfqg may cause dangling references to be traversed, as -+ * bfqg->pd may not exist any more. -+ * -+ * In view of the above facts, here we cache, in the bfqg, any -+ * blkg data we may need for this bic, and for its associated -+ * bfq_queue. As of now, we need to cache only the path of the -+ * blkg, which is used in the bfq_log_* functions. -+ * -+ * Finally, note that bfqg itself needs to be protected from -+ * destruction on the blkg_free of the original blkg (which -+ * invokes bfq_pd_free). 
We use an additional private -+ * refcounter for bfqg, to let it disappear only after no -+ * bfq_queue refers to it any longer. -+ */ -+ blkg_path(bfqg_to_blkg(bfqg), bfqg->blkg_path, sizeof(bfqg->blkg_path)); -+#endif -+ bic->blkcg_serial_nr = serial_nr; -+out: -+ rcu_read_unlock(); -+} -+ -+/** -+ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st. -+ * @st: the service tree being flushed. -+ */ -+static void bfq_flush_idle_tree(struct bfq_service_tree *st) -+{ -+ struct bfq_entity *entity = st->first_idle; -+ -+ for (; entity ; entity = st->first_idle) -+ __bfq_deactivate_entity(entity, false); -+} -+ -+/** -+ * bfq_reparent_leaf_entity - move leaf entity to the root_group. -+ * @bfqd: the device data structure with the root group. -+ * @entity: the entity to move. -+ */ -+static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, -+ struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ BUG_ON(!bfqq); -+ bfq_bfqq_move(bfqd, bfqq, bfqd->root_group); -+} -+ -+/** -+ * bfq_reparent_active_entities - move to the root group all active -+ * entities. -+ * @bfqd: the device data structure with the root group. -+ * @bfqg: the group to move from. -+ * @st: the service tree with the entities. -+ */ -+static void bfq_reparent_active_entities(struct bfq_data *bfqd, -+ struct bfq_group *bfqg, -+ struct bfq_service_tree *st) -+{ -+ struct rb_root *active = &st->active; -+ struct bfq_entity *entity = NULL; -+ -+ if (!RB_EMPTY_ROOT(&st->active)) -+ entity = bfq_entity_of(rb_first(active)); -+ -+ for (; entity ; entity = bfq_entity_of(rb_first(active))) -+ bfq_reparent_leaf_entity(bfqd, entity); -+ -+ if (bfqg->sched_data.in_service_entity) -+ bfq_reparent_leaf_entity(bfqd, -+ bfqg->sched_data.in_service_entity); -+} -+ -+/** -+ * bfq_pd_offline - deactivate the entity associated with @pd, -+ * and reparent its children entities. -+ * @pd: descriptor of the policy going offline. 
-+ * -+ * blkio already grabs the queue_lock for us, so no need to use -+ * RCU-based magic -+ */ -+static void bfq_pd_offline(struct blkg_policy_data *pd) -+{ -+ struct bfq_service_tree *st; -+ struct bfq_group *bfqg; -+ struct bfq_data *bfqd; -+ struct bfq_entity *entity; -+#ifdef BFQ_MQ -+ unsigned long flags; -+#endif -+ int i; -+ -+ BUG_ON(!pd); -+ bfqg = pd_to_bfqg(pd); -+ BUG_ON(!bfqg); -+ bfqd = bfqg->bfqd; -+ BUG_ON(bfqd && !bfqd->root_group); -+ -+ entity = bfqg->my_entity; -+ -+#ifdef BFQ_MQ -+ spin_lock_irqsave(&bfqd->lock, flags); -+#endif -+ -+ if (!entity) /* root group */ -+ goto put_async_queues; -+ -+ /* -+ * Empty all service_trees belonging to this group before -+ * deactivating the group itself. -+ */ -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) { -+ BUG_ON(!bfqg->sched_data.service_tree); -+ st = bfqg->sched_data.service_tree + i; -+ /* -+ * The idle tree may still contain bfq_queues belonging -+ * to exited task because they never migrated to a different -+ * cgroup from the one being destroyed now. -+ */ -+ bfq_flush_idle_tree(st); -+ -+ /* -+ * It may happen that some queues are still active -+ * (busy) upon group destruction (if the corresponding -+ * processes have been forced to terminate). We move -+ * all the leaf entities corresponding to these queues -+ * to the root_group. -+ * Also, it may happen that the group has an entity -+ * in service, which is disconnected from the active -+ * tree: it must be moved, too. -+ * There is no need to put the sync queues, as the -+ * scheduler has taken no reference. 
-+ */ -+ bfq_reparent_active_entities(bfqd, bfqg, st); -+ BUG_ON(!RB_EMPTY_ROOT(&st->active)); -+ BUG_ON(!RB_EMPTY_ROOT(&st->idle)); -+ } -+ BUG_ON(bfqg->sched_data.next_in_service); -+ BUG_ON(bfqg->sched_data.in_service_entity); -+ -+ __bfq_deactivate_entity(entity, false); -+ -+put_async_queues: -+ bfq_put_async_queues(bfqd, bfqg); -+ -+#ifdef BFQ_MQ -+ spin_unlock_irqrestore(&bfqd->lock, flags); -+#endif -+ /* -+ * @blkg is going offline and will be ignored by -+ * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so -+ * that they don't get lost. If IOs complete after this point, the -+ * stats for them will be lost. Oh well... -+ */ -+ bfqg_stats_xfer_dead(bfqg); -+} -+ -+static void bfq_end_wr_async(struct bfq_data *bfqd) -+{ -+ struct blkcg_gq *blkg; -+ -+ list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) { -+ struct bfq_group *bfqg = blkg_to_bfqg(blkg); -+ BUG_ON(!bfqg); -+ -+ bfq_end_wr_async_queues(bfqd, bfqg); -+ } -+ bfq_end_wr_async_queues(bfqd, bfqd->root_group); -+} -+ -+static int bfq_io_show_weight(struct seq_file *sf, void *v) -+{ -+ struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); -+ struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); -+ unsigned int val = 0; -+ -+ if (bfqgd) -+ val = bfqgd->weight; -+ -+ seq_printf(sf, "%u\n", val); -+ -+ return 0; -+} -+ -+static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css, -+ struct cftype *cftype, -+ u64 val) -+{ -+ struct blkcg *blkcg = css_to_blkcg(css); -+ struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); -+ struct blkcg_gq *blkg; -+ int ret = -ERANGE; -+ -+ if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT) -+ return ret; -+ -+ ret = 0; -+ spin_lock_irq(&blkcg->lock); -+ bfqgd->weight = (unsigned short)val; -+ hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { -+ struct bfq_group *bfqg = blkg_to_bfqg(blkg); -+ -+ if (!bfqg) -+ continue; -+ /* -+ * Setting the prio_changed flag of the entity -+ * to 1 with new_weight == weight would re-set -+ * the value of 
the weight to its ioprio mapping. -+ * Set the flag only if necessary. -+ */ -+ if ((unsigned short)val != bfqg->entity.new_weight) { -+ bfqg->entity.new_weight = (unsigned short)val; -+ /* -+ * Make sure that the above new value has been -+ * stored in bfqg->entity.new_weight before -+ * setting the prio_changed flag. In fact, -+ * this flag may be read asynchronously (in -+ * critical sections protected by a different -+ * lock than that held here), and finding this -+ * flag set may cause the execution of the code -+ * for updating parameters whose value may -+ * depend also on bfqg->entity.new_weight (in -+ * __bfq_entity_update_weight_prio). -+ * This barrier makes sure that the new value -+ * of bfqg->entity.new_weight is correctly -+ * seen in that code. -+ */ -+ smp_wmb(); -+ bfqg->entity.prio_changed = 1; -+ } -+ } -+ spin_unlock_irq(&blkcg->lock); -+ -+ return ret; -+} -+ -+static ssize_t bfq_io_set_weight(struct kernfs_open_file *of, -+ char *buf, size_t nbytes, -+ loff_t off) -+{ -+ u64 weight; -+ /* First unsigned long found in the file is used */ -+ int ret = kstrtoull(strim(buf), 0, &weight); -+ -+ if (ret) -+ return ret; -+ -+ ret = bfq_io_set_weight_legacy(of_css(of), NULL, weight); -+ return ret ?: nbytes; -+} -+ -+#ifdef CONFIG_DEBUG_BLK_CGROUP -+static int bfqg_print_stat(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, -+ &blkcg_policy_bfq, seq_cft(sf)->private, false); -+ return 0; -+} -+ -+static int bfqg_print_rwstat(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, -+ &blkcg_policy_bfq, seq_cft(sf)->private, true); -+ return 0; -+} -+ -+static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, -+ struct blkg_policy_data *pd, int off) -+{ -+ u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd), -+ &blkcg_policy_bfq, off); -+ return __blkg_prfill_u64(sf, pd, sum); -+} -+ -+static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, -+ 
struct blkg_policy_data *pd, int off) -+{ -+ struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd), -+ &blkcg_policy_bfq, -+ off); -+ return __blkg_prfill_rwstat(sf, pd, &sum); -+} -+ -+static int bfqg_print_stat_recursive(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), -+ bfqg_prfill_stat_recursive, &blkcg_policy_bfq, -+ seq_cft(sf)->private, false); -+ return 0; -+} -+ -+static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), -+ bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq, -+ seq_cft(sf)->private, true); -+ return 0; -+} -+ -+static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd, -+ int off) -+{ -+ u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes); -+ -+ return __blkg_prfill_u64(sf, pd, sum >> 9); -+} -+ -+static int bfqg_print_stat_sectors(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), -+ bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false); -+ return 0; -+} -+ -+static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf, -+ struct blkg_policy_data *pd, int off) -+{ -+ struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL, -+ offsetof(struct blkcg_gq, stat_bytes)); -+ u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + -+ atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); -+ -+ return __blkg_prfill_u64(sf, pd, sum >> 9); -+} -+ -+static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), -+ bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0, -+ false); -+ return 0; -+} -+ -+ -+static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf, -+ struct blkg_policy_data *pd, int off) -+{ -+ struct bfq_group *bfqg = pd_to_bfqg(pd); -+ u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples); -+ u64 v = 0; -+ -+ if (samples) { -+ v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum); -+ v = 
div64_u64(v, samples); -+ } -+ __blkg_prfill_u64(sf, pd, v); -+ return 0; -+} -+ -+/* print avg_queue_size */ -+static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), -+ bfqg_prfill_avg_queue_size, &blkcg_policy_bfq, -+ 0, false); -+ return 0; -+} -+#endif /* CONFIG_DEBUG_BLK_CGROUP */ -+ -+static struct bfq_group * -+bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) -+{ -+ int ret; -+ -+ ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq); -+ if (ret) -+ return NULL; -+ -+ return blkg_to_bfqg(bfqd->queue->root_blkg); -+} -+ -+#ifdef BFQ_MQ -+#define BFQ_CGROUP_FNAME(param) "bfq-mq."#param -+#else -+#define BFQ_CGROUP_FNAME(param) "bfq-sq."#param -+#endif -+ -+static struct cftype bfq_blkcg_legacy_files[] = { -+ { -+ .name = BFQ_CGROUP_FNAME(weight), -+ .flags = CFTYPE_NOT_ON_ROOT, -+ .seq_show = bfq_io_show_weight, -+ .write_u64 = bfq_io_set_weight_legacy, -+ }, -+ -+ /* statistics, covers only the tasks in the bfqg */ -+ { -+ .name = BFQ_CGROUP_FNAME(io_service_bytes), -+ .private = (unsigned long)&blkcg_policy_bfq, -+ .seq_show = blkg_print_stat_bytes, -+ }, -+ { -+ .name = BFQ_CGROUP_FNAME(io_serviced), -+ .private = (unsigned long)&blkcg_policy_bfq, -+ .seq_show = blkg_print_stat_ios, -+ }, -+#ifdef CONFIG_DEBUG_BLK_CGROUP -+ { -+ .name = BFQ_CGROUP_FNAME(time), -+ .private = offsetof(struct bfq_group, stats.time), -+ .seq_show = bfqg_print_stat, -+ }, -+ { -+ .name = BFQ_CGROUP_FNAME(sectors), -+ .seq_show = bfqg_print_stat_sectors, -+ }, -+ { -+ .name = BFQ_CGROUP_FNAME(io_service_time), -+ .private = offsetof(struct bfq_group, stats.service_time), -+ .seq_show = bfqg_print_rwstat, -+ }, -+ { -+ .name = BFQ_CGROUP_FNAME(io_wait_time), -+ .private = offsetof(struct bfq_group, stats.wait_time), -+ .seq_show = bfqg_print_rwstat, -+ }, -+ { -+ .name = BFQ_CGROUP_FNAME(io_merged), -+ .private = offsetof(struct bfq_group, stats.merged), -+ .seq_show = bfqg_print_rwstat, -+ }, -+ 
{ -+ .name = BFQ_CGROUP_FNAME(io_queued), -+ .private = offsetof(struct bfq_group, stats.queued), -+ .seq_show = bfqg_print_rwstat, -+ }, -+#endif /* CONFIG_DEBUG_BLK_CGROUP */ -+ -+ /* the same statictics which cover the bfqg and its descendants */ -+ { -+ .name = BFQ_CGROUP_FNAME(io_service_bytes_recursive), -+ .private = (unsigned long)&blkcg_policy_bfq, -+ .seq_show = blkg_print_stat_bytes_recursive, -+ }, -+ { -+ .name = BFQ_CGROUP_FNAME(io_serviced_recursive), -+ .private = (unsigned long)&blkcg_policy_bfq, -+ .seq_show = blkg_print_stat_ios_recursive, -+ }, -+#ifdef CONFIG_DEBUG_BLK_CGROUP -+ { -+ .name = BFQ_CGROUP_FNAME(time_recursive), -+ .private = offsetof(struct bfq_group, stats.time), -+ .seq_show = bfqg_print_stat_recursive, -+ }, -+ { -+ .name = BFQ_CGROUP_FNAME(sectors_recursive), -+ .seq_show = bfqg_print_stat_sectors_recursive, -+ }, -+ { -+ .name = BFQ_CGROUP_FNAME(io_service_time_recursive), -+ .private = offsetof(struct bfq_group, stats.service_time), -+ .seq_show = bfqg_print_rwstat_recursive, -+ }, -+ { -+ .name = BFQ_CGROUP_FNAME(io_wait_time_recursive), -+ .private = offsetof(struct bfq_group, stats.wait_time), -+ .seq_show = bfqg_print_rwstat_recursive, -+ }, -+ { -+ .name = BFQ_CGROUP_FNAME(io_merged_recursive), -+ .private = offsetof(struct bfq_group, stats.merged), -+ .seq_show = bfqg_print_rwstat_recursive, -+ }, -+ { -+ .name = BFQ_CGROUP_FNAME(io_queued_recursive), -+ .private = offsetof(struct bfq_group, stats.queued), -+ .seq_show = bfqg_print_rwstat_recursive, -+ }, -+ { -+ .name = BFQ_CGROUP_FNAME(avg_queue_size), -+ .seq_show = bfqg_print_avg_queue_size, -+ }, -+ { -+ .name = BFQ_CGROUP_FNAME(group_wait_time), -+ .private = offsetof(struct bfq_group, stats.group_wait_time), -+ .seq_show = bfqg_print_stat, -+ }, -+ { -+ .name = BFQ_CGROUP_FNAME(idle_time), -+ .private = offsetof(struct bfq_group, stats.idle_time), -+ .seq_show = bfqg_print_stat, -+ }, -+ { -+ .name = BFQ_CGROUP_FNAME(empty_time), -+ .private = offsetof(struct 
bfq_group, stats.empty_time), -+ .seq_show = bfqg_print_stat, -+ }, -+ { -+ .name = BFQ_CGROUP_FNAME(dequeue), -+ .private = offsetof(struct bfq_group, stats.dequeue), -+ .seq_show = bfqg_print_stat, -+ }, -+#endif /* CONFIG_DEBUG_BLK_CGROUP */ -+ { } /* terminate */ -+}; -+ -+static struct cftype bfq_blkg_files[] = { -+ { -+ .name = BFQ_CGROUP_FNAME(weight), -+ .flags = CFTYPE_NOT_ON_ROOT, -+ .seq_show = bfq_io_show_weight, -+ .write = bfq_io_set_weight, -+ }, -+ {} /* terminate */ -+}; -+ -+#undef BFQ_CGROUP_FNAME -+ -+#else /* BFQ_GROUP_IOSCHED_ENABLED */ -+ -+static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct bfq_group *bfqg) {} -+ -+static void bfq_init_entity(struct bfq_entity *entity, -+ struct bfq_group *bfqg) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ entity->weight = entity->new_weight; -+ entity->orig_weight = entity->new_weight; -+ if (bfqq) { -+ bfqq->ioprio = bfqq->new_ioprio; -+ bfqq->ioprio_class = bfqq->new_ioprio_class; -+ } -+ entity->sched_data = &bfqg->sched_data; -+} -+ -+static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) {} -+ -+static void bfq_end_wr_async(struct bfq_data *bfqd) -+{ -+ bfq_end_wr_async_queues(bfqd, bfqd->root_group); -+} -+ -+static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, -+ struct blkcg *blkcg) -+{ -+ return bfqd->root_group; -+} -+ -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) -+{ -+ return bfqq->bfqd->root_group; -+} -+ -+static struct bfq_group * -+bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) -+{ -+ struct bfq_group *bfqg; -+ int i; -+ -+ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); -+ if (!bfqg) -+ return NULL; -+ -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) -+ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; -+ -+ return bfqg; -+} -+#endif -diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c -new file mode 100644 -index 000000000000..fb7bb8f08b75 ---- /dev/null -+++ 
b/block/bfq-ioc.c -@@ -0,0 +1,36 @@ -+/* -+ * BFQ: I/O context handling. -+ * -+ * Based on ideas and code from CFQ: -+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> -+ * -+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> -+ * Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it> -+ */ -+ -+/** -+ * icq_to_bic - convert iocontext queue structure to bfq_io_cq. -+ * @icq: the iocontext queue. -+ */ -+static struct bfq_io_cq *icq_to_bic(struct io_cq *icq) -+{ -+ /* bic->icq is the first member, %NULL will convert to %NULL */ -+ return container_of(icq, struct bfq_io_cq, icq); -+} -+ -+/** -+ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd. -+ * @bfqd: the lookup key. -+ * @ioc: the io_context of the process doing I/O. -+ * -+ * Queue lock must be held. -+ */ -+static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, -+ struct io_context *ioc) -+{ -+ if (ioc) -+ return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue)); -+ return NULL; -+} -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -new file mode 100644 -index 000000000000..47a49d9e6512 ---- /dev/null -+++ b/block/bfq-mq-iosched.c -@@ -0,0 +1,6548 @@ -+/* -+ * Budget Fair Queueing (BFQ) I/O scheduler. -+ * -+ * Based on ideas and code from CFQ: -+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> -+ * -+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> -+ * Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org> -+ * -+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ -+ * file. -+ * -+ * BFQ is a proportional-share I/O scheduler, with some extra -+ * low-latency capabilities. BFQ also supports full hierarchical -+ * scheduling through cgroups. Next paragraphs provide an introduction -+ * on BFQ inner workings. 
Details on BFQ benefits and usage can be -+ * found in Documentation/block/bfq-iosched.txt. -+ * -+ * BFQ is a proportional-share storage-I/O scheduling algorithm based -+ * on the slice-by-slice service scheme of CFQ. But BFQ assigns -+ * budgets, measured in number of sectors, to processes instead of -+ * time slices. The device is not granted to the in-service process -+ * for a given time slice, but until it has exhausted its assigned -+ * budget. This change from the time to the service domain enables BFQ -+ * to distribute the device throughput among processes as desired, -+ * without any distortion due to throughput fluctuations, or to device -+ * internal queueing. BFQ uses an ad hoc internal scheduler, called -+ * B-WF2Q+, to schedule processes according to their budgets. More -+ * precisely, BFQ schedules queues associated with processes. Thanks to -+ * the accurate policy of B-WF2Q+, BFQ can afford to assign high -+ * budgets to I/O-bound processes issuing sequential requests (to -+ * boost the throughput), and yet guarantee a low latency to -+ * interactive and soft real-time applications. -+ * -+ * In particular, BFQ schedules I/O so as to achieve the latter goal-- -+ * low latency for interactive and soft real-time applications--if the -+ * low_latency parameter is set (default configuration). To this -+ * purpose, BFQ constantly tries to detect whether the I/O requests in -+ * a bfq_queue come from an interactive or a soft real-time -+ * application. For brevity, in these cases, the queue is said to be -+ * interactive or soft real-time. In both cases, BFQ privileges the -+ * service of the queue, over that of non-interactive and -+ * non-soft-real-time queues. This privileging is performed, mainly, -+ * by raising the weight of the queue. So, for brevity, we call just -+ * weight-raising periods the time periods during which a queue is -+ * privileged, because deemed interactive or soft real-time. 
-+ * -+ * The detection of soft real-time queues/applications is described in -+ * detail in the comments on the function -+ * bfq_bfqq_softrt_next_start. On the other hand, the detection of an -+ * interactive queue works as follows: a queue is deemed interactive -+ * if it is constantly non empty only for a limited time interval, -+ * after which it does become empty. The queue may be deemed -+ * interactive again (for a limited time), if it restarts being -+ * constantly non empty, provided that this happens only after the -+ * queue has remained empty for a given minimum idle time. -+ * -+ * By default, BFQ computes automatically the above maximum time -+ * interval, i.e., the time interval after which a constantly -+ * non-empty queue stops being deemed interactive. Since a queue is -+ * weight-raised while it is deemed interactive, this maximum time -+ * interval happens to coincide with the (maximum) duration of the -+ * weight-raising for interactive queues. -+ * -+ * NOTE: if the main or only goal, with a given device, is to achieve -+ * the maximum-possible throughput at all times, then do switch off -+ * all low-latency heuristics for that device, by setting low_latency -+ * to 0. -+ * -+ * BFQ is described in [1], where also a reference to the initial, -+ * more theoretical paper on BFQ can be found. The interested reader -+ * can find in the latter paper full details on the main algorithm, as -+ * well as formulas of the guarantees and formal proofs of all the -+ * properties. With respect to the version of BFQ presented in these -+ * papers, this implementation adds a few more heuristics, such as the -+ * one that guarantees a low latency to soft real-time applications, -+ * and a hierarchical extension based on H-WF2Q+. -+ * -+ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with -+ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N) -+ * complexity derives from the one introduced with EEVDF in [3]. 
-+ * -+ * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O -+ * Scheduler", Proceedings of the First Workshop on Mobile System -+ * Technologies (MST-2015), May 2015. -+ * http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf -+ * -+ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf -+ * -+ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing -+ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689, -+ * Oct 1997. -+ * -+ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz -+ * -+ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline -+ * First: A Flexible and Accurate Mechanism for Proportional Share -+ * Resource Allocation,'' technical report. -+ * -+ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf -+ */ -+#include <linux/module.h> -+#include <linux/slab.h> -+#include <linux/blkdev.h> -+#include <linux/cgroup.h> -+#include <linux/elevator.h> -+#include <linux/jiffies.h> -+#include <linux/rbtree.h> -+#include <linux/ioprio.h> -+#include <linux/sbitmap.h> -+#include <linux/delay.h> -+ -+#include "blk.h" -+#include "blk-mq.h" -+#include "blk-mq-tag.h" -+#include "blk-mq-sched.h" -+#include "bfq-mq.h" -+#include "blk-wbt.h" -+ -+/* Expiration time of sync (0) and async (1) requests, in ns. */ -+static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; -+ -+/* Maximum backwards seek, in KiB. */ -+static const int bfq_back_max = (16 * 1024); -+ -+/* Penalty of a backwards seek, in number of sectors. */ -+static const int bfq_back_penalty = 2; -+ -+/* Idling period duration, in ns. */ -+static u32 bfq_slice_idle = (NSEC_PER_SEC / 125); -+ -+/* Minimum number of assigned budgets for which stats are safe to compute. */ -+static const int bfq_stats_min_budgets = 194; -+ -+/* Default maximum budget values, in sectors and number of requests. 
*/ -+static const int bfq_default_max_budget = (16 * 1024); -+ -+/* -+ * When a sync request is dispatched, the queue that contains that -+ * request, and all the ancestor entities of that queue, are charged -+ * with the number of sectors of the request. In constrast, if the -+ * request is async, then the queue and its ancestor entities are -+ * charged with the number of sectors of the request, multiplied by -+ * the factor below. This throttles the bandwidth for async I/O, -+ * w.r.t. to sync I/O, and it is done to counter the tendency of async -+ * writes to steal I/O throughput to reads. -+ * -+ * The current value of this parameter is the result of a tuning with -+ * several hardware and software configurations. We tried to find the -+ * lowest value for which writes do not cause noticeable problems to -+ * reads. In fact, the lower this parameter, the stabler I/O control, -+ * in the following respect. The lower this parameter is, the less -+ * the bandwidth enjoyed by a group decreases -+ * - when the group does writes, w.r.t. to when it does reads; -+ * - when other groups do reads, w.r.t. to when they do writes. -+ */ -+static const int bfq_async_charge_factor = 3; -+ -+/* Default timeout values, in jiffies, approximating CFQ defaults. */ -+static const int bfq_timeout = (HZ / 8); -+ -+/* -+ * Time limit for merging (see comments in bfq_setup_cooperator). Set -+ * to the slowest value that, in our tests, proved to be effective in -+ * removing false positives, while not causing true positives to miss -+ * queue merging. -+ * -+ * As can be deduced from the low time limit below, queue merging, if -+ * successful, happens at the very beggining of the I/O of the involved -+ * cooperating processes, as a consequence of the arrival of the very -+ * first requests from each cooperator. After that, there is very -+ * little chance to find cooperators. 
-+ */ -+static const unsigned long bfq_merge_time_limit = HZ/10; -+ -+#define MAX_LENGTH_REASON_NAME 25 -+ -+static const char reason_name[][MAX_LENGTH_REASON_NAME] = {"TOO_IDLE", -+"BUDGET_TIMEOUT", "BUDGET_EXHAUSTED", "NO_MORE_REQUESTS", -+"PREEMPTED"}; -+ -+static struct kmem_cache *bfq_pool; -+ -+/* Below this threshold (in ns), we consider thinktime immediate. */ -+#define BFQ_MIN_TT (2 * NSEC_PER_MSEC) -+ -+/* hw_tag detection: parallel requests threshold and min samples needed. */ -+#define BFQ_HW_QUEUE_THRESHOLD 3 -+#define BFQ_HW_QUEUE_SAMPLES 32 -+ -+#define BFQQ_SEEK_THR (sector_t)(8 * 100) -+#define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32) -+#define BFQ_RQ_SEEKY(bfqd, last_pos, rq) \ -+ (get_sdist(last_pos, rq) > \ -+ BFQQ_SEEK_THR && \ -+ (!blk_queue_nonrot(bfqd->queue) || \ -+ blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT)) -+#define BFQQ_CLOSE_THR (sector_t)(8 * 1024) -+#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 19) -+ -+/* Min number of samples required to perform peak-rate update */ -+#define BFQ_RATE_MIN_SAMPLES 32 -+/* Min observation time interval required to perform a peak-rate update (ns) */ -+#define BFQ_RATE_MIN_INTERVAL (300*NSEC_PER_MSEC) -+/* Target observation time interval for a peak-rate update (ns) */ -+#define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC -+ -+/* -+ * Shift used for peak-rate fixed precision calculations. 
-+ * With -+ * - the current shift: 16 positions -+ * - the current type used to store rate: u32 -+ * - the current unit of measure for rate: [sectors/usec], or, more precisely, -+ * [(sectors/usec) / 2^BFQ_RATE_SHIFT] to take into account the shift, -+ * the range of rates that can be stored is -+ * [1 / 2^BFQ_RATE_SHIFT, 2^(32 - BFQ_RATE_SHIFT)] sectors/usec = -+ * [1 / 2^16, 2^16] sectors/usec = [15e-6, 65536] sectors/usec = -+ * [15, 65G] sectors/sec -+ * Which, assuming a sector size of 512B, corresponds to a range of -+ * [7.5K, 33T] B/sec -+ */ -+#define BFQ_RATE_SHIFT 16 -+ -+/* -+ * When configured for computing the duration of the weight-raising -+ * for interactive queues automatically (see the comments at the -+ * beginning of this file), BFQ does it using the following formula: -+ * duration = (ref_rate / r) * ref_wr_duration, -+ * where r is the peak rate of the device, and ref_rate and -+ * ref_wr_duration are two reference parameters. In particular, -+ * ref_rate is the peak rate of the reference storage device (see -+ * below), and ref_wr_duration is about the maximum time needed, with -+ * BFQ and while reading two files in parallel, to load typical large -+ * applications on the reference device (see the comments on -+ * max_service_from_wr below, for more details on how ref_wr_duration -+ * is obtained). In practice, the slower/faster the device at hand -+ * is, the more/less it takes to load applications with respect to the -+ * reference device. Accordingly, the longer/shorter BFQ grants -+ * weight raising to interactive applications. -+ * -+ * BFQ uses two different reference pairs (ref_rate, ref_wr_duration), -+ * depending on whether the device is rotational or non-rotational. -+ * -+ * In the following definitions, ref_rate[0] and ref_wr_duration[0] -+ * are the reference values for a rotational device, whereas -+ * ref_rate[1] and ref_wr_duration[1] are the reference values for a -+ * non-rotational device. 
The reference rates are not the actual peak -+ * rates of the devices used as a reference, but slightly lower -+ * values. The reason for using slightly lower values is that the -+ * peak-rate estimator tends to yield slightly lower values than the -+ * actual peak rate (it can yield the actual peak rate only if there -+ * is only one process doing I/O, and the process does sequential -+ * I/O). -+ * -+ * The reference peak rates are measured in sectors/usec, left-shifted -+ * by BFQ_RATE_SHIFT. -+ */ -+static int ref_rate[2] = {14000, 33000}; -+/* -+ * To improve readability, a conversion function is used to initialize -+ * the following array, which entails that the array can be -+ * initialized only in a function. -+ */ -+static int ref_wr_duration[2]; -+ -+/* -+ * BFQ uses the above-detailed, time-based weight-raising mechanism to -+ * privilege interactive tasks. This mechanism is vulnerable to the -+ * following false positives: I/O-bound applications that will go on -+ * doing I/O for much longer than the duration of weight -+ * raising. These applications have basically no benefit from being -+ * weight-raised at the beginning of their I/O. On the opposite end, -+ * while being weight-raised, these applications -+ * a) unjustly steal throughput to applications that may actually need -+ * low latency; -+ * b) make BFQ uselessly perform device idling; device idling results -+ * in loss of device throughput with most flash-based storage, and may -+ * increase latencies when used purposelessly. -+ * -+ * BFQ tries to reduce these problems, by adopting the following -+ * countermeasure. To introduce this countermeasure, we need first to -+ * finish explaining how the duration of weight-raising for -+ * interactive tasks is computed. 
-+ * -+ * For a bfq_queue deemed as interactive, the duration of weight -+ * raising is dynamically adjusted, as a function of the estimated -+ * peak rate of the device, so as to be equal to the time needed to -+ * execute the 'largest' interactive task we benchmarked so far. By -+ * largest task, we mean the task for which each involved process has -+ * to do more I/O than for any of the other tasks we benchmarked. This -+ * reference interactive task is the start-up of LibreOffice Writer, -+ * and in this task each process/bfq_queue needs to have at most ~110K -+ * sectors transferred. -+ * -+ * This last piece of information enables BFQ to reduce the actual -+ * duration of weight-raising for at least one class of I/O-bound -+ * applications: those doing sequential or quasi-sequential I/O. An -+ * example is file copy. In fact, once started, the main I/O-bound -+ * processes of these applications usually consume the above 110K -+ * sectors in much less time than the processes of an application that -+ * is starting, because these I/O-bound processes will greedily devote -+ * almost all their CPU cycles only to their target, -+ * throughput-friendly I/O operations. This is even more true if BFQ -+ * happens to be underestimating the device peak rate, and thus -+ * overestimating the duration of weight raising. But, according to -+ * our measurements, once transferred 110K sectors, these processes -+ * have no right to be weight-raised any longer. -+ * -+ * Basing on the last consideration, BFQ ends weight-raising for a -+ * bfq_queue if the latter happens to have received an amount of -+ * service at least equal to the following constant. The constant is -+ * set to slightly more than 110K, to have a minimum safety margin. -+ * -+ * This early ending of weight-raising reduces the amount of time -+ * during which interactive false positives cause the two problems -+ * described at the beginning of these comments. 
-+ */ -+static const unsigned long max_service_from_wr = 120000; -+ -+#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ -+ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) -+ -+#define RQ_BIC(rq) icq_to_bic((rq)->elv.priv[0]) -+#define RQ_BFQQ(rq) ((rq)->elv.priv[1]) -+ -+/** -+ * icq_to_bic - convert iocontext queue structure to bfq_io_cq. -+ * @icq: the iocontext queue. -+ */ -+static struct bfq_io_cq *icq_to_bic(struct io_cq *icq) -+{ -+ /* bic->icq is the first member, %NULL will convert to %NULL */ -+ return container_of(icq, struct bfq_io_cq, icq); -+} -+ -+/** -+ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd. -+ * @bfqd: the lookup key. -+ * @ioc: the io_context of the process doing I/O. -+ * @q: the request queue. -+ */ -+static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, -+ struct io_context *ioc, -+ struct request_queue *q) -+{ -+ if (ioc) { -+ unsigned long flags; -+ struct bfq_io_cq *icq; -+ -+ spin_lock_irqsave(q->queue_lock, flags); -+ icq = icq_to_bic(ioc_lookup_icq(ioc, q)); -+ spin_unlock_irqrestore(q->queue_lock, flags); -+ -+ return icq; -+ } -+ -+ return NULL; -+} -+ -+/* -+ * Scheduler run of queue, if there are requests pending and no one in the -+ * driver that will restart queueing. -+ */ -+static void bfq_schedule_dispatch(struct bfq_data *bfqd) -+{ -+ if (bfqd->queued != 0) { -+ bfq_log(bfqd, ""); -+ blk_mq_run_hw_queues(bfqd->queue, true); -+ } -+} -+ -+#define BFQ_MQ -+#include "bfq-sched.c" -+#include "bfq-cgroup-included.c" -+ -+#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE) -+#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT) -+ -+#define bfq_sample_valid(samples) ((samples) > 80) -+ -+/* -+ * Lifted from AS - choose which of rq1 and rq2 that is best served now. -+ * We choose the request that is closesr to the head right now. Distance -+ * behind the head is penalized and only allowed to a certain extent. 
-+ */ -+static struct request *bfq_choose_req(struct bfq_data *bfqd, -+ struct request *rq1, -+ struct request *rq2, -+ sector_t last) -+{ -+ sector_t s1, s2, d1 = 0, d2 = 0; -+ unsigned long back_max; -+#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */ -+#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */ -+ unsigned int wrap = 0; /* bit mask: requests behind the disk head? */ -+ -+ if (!rq1 || rq1 == rq2) -+ return rq2; -+ if (!rq2) -+ return rq1; -+ -+ if (rq_is_sync(rq1) && !rq_is_sync(rq2)) -+ return rq1; -+ else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) -+ return rq2; -+ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) -+ return rq1; -+ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META)) -+ return rq2; -+ -+ s1 = blk_rq_pos(rq1); -+ s2 = blk_rq_pos(rq2); -+ -+ /* -+ * By definition, 1KiB is 2 sectors. -+ */ -+ back_max = bfqd->bfq_back_max * 2; -+ -+ /* -+ * Strict one way elevator _except_ in the case where we allow -+ * short backward seeks which are biased as twice the cost of a -+ * similar forward seek. -+ */ -+ if (s1 >= last) -+ d1 = s1 - last; -+ else if (s1 + back_max >= last) -+ d1 = (last - s1) * bfqd->bfq_back_penalty; -+ else -+ wrap |= BFQ_RQ1_WRAP; -+ -+ if (s2 >= last) -+ d2 = s2 - last; -+ else if (s2 + back_max >= last) -+ d2 = (last - s2) * bfqd->bfq_back_penalty; -+ else -+ wrap |= BFQ_RQ2_WRAP; -+ -+ /* Found required data */ -+ -+ /* -+ * By doing switch() on the bit mask "wrap" we avoid having to -+ * check two variables for all permutations: --> faster! 
-+ */ -+ switch (wrap) { -+ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ -+ if (d1 < d2) -+ return rq1; -+ else if (d2 < d1) -+ return rq2; -+ -+ if (s1 >= s2) -+ return rq1; -+ else -+ return rq2; -+ -+ case BFQ_RQ2_WRAP: -+ return rq1; -+ case BFQ_RQ1_WRAP: -+ return rq2; -+ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */ -+ default: -+ /* -+ * Since both rqs are wrapped, -+ * start with the one that's further behind head -+ * (--> only *one* back seek required), -+ * since back seek takes more time than forward. -+ */ -+ if (s1 <= s2) -+ return rq1; -+ else -+ return rq2; -+ } -+} -+ -+/* -+ * Async I/O can easily starve sync I/O (both sync reads and sync -+ * writes), by consuming all tags. Similarly, storms of sync writes, -+ * such as those that sync(2) may trigger, can starve sync reads. -+ * Limit depths of async I/O and sync writes so as to counter both -+ * problems. -+ */ -+static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) -+{ -+ struct bfq_data *bfqd = data->q->elevator->elevator_data; -+ -+ if (op_is_sync(op) && !op_is_write(op)) -+ return; -+ -+ data->shallow_depth = -+ bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)]; -+ -+ bfq_log(bfqd, "wr_busy %d sync %d depth %u", -+ bfqd->wr_busy_queues, op_is_sync(op), -+ data->shallow_depth); -+} -+ -+static struct bfq_queue * -+bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, -+ sector_t sector, struct rb_node **ret_parent, -+ struct rb_node ***rb_link) -+{ -+ struct rb_node **p, *parent; -+ struct bfq_queue *bfqq = NULL; -+ -+ parent = NULL; -+ p = &root->rb_node; -+ while (*p) { -+ struct rb_node **n; -+ -+ parent = *p; -+ bfqq = rb_entry(parent, struct bfq_queue, pos_node); -+ -+ /* -+ * Sort strictly based on sector. Smallest to the left, -+ * largest to the right. 
-+ */ -+ if (sector > blk_rq_pos(bfqq->next_rq)) -+ n = &(*p)->rb_right; -+ else if (sector < blk_rq_pos(bfqq->next_rq)) -+ n = &(*p)->rb_left; -+ else -+ break; -+ p = n; -+ bfqq = NULL; -+ } -+ -+ *ret_parent = parent; -+ if (rb_link) -+ *rb_link = p; -+ -+ bfq_log(bfqd, "%llu: returning %d", -+ (unsigned long long) sector, -+ bfqq ? bfqq->pid : 0); -+ -+ return bfqq; -+} -+ -+static bool bfq_too_late_for_merging(struct bfq_queue *bfqq) -+{ -+ return bfqq->service_from_backlogged > 0 && -+ time_is_before_jiffies(bfqq->first_IO_time + -+ bfq_merge_time_limit); -+} -+ -+static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct rb_node **p, *parent; -+ struct bfq_queue *__bfqq; -+ -+ if (bfqq->pos_root) { -+ rb_erase(&bfqq->pos_node, bfqq->pos_root); -+ bfqq->pos_root = NULL; -+ } -+ -+ /* -+ * bfqq cannot be merged any longer (see comments in -+ * bfq_setup_cooperator): no point in adding bfqq into the -+ * position tree. -+ */ -+ if (bfq_too_late_for_merging(bfqq)) -+ return; -+ -+ if (bfq_class_idle(bfqq)) -+ return; -+ if (!bfqq->next_rq) -+ return; -+ -+ bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, -+ blk_rq_pos(bfqq->next_rq), &parent, &p); -+ if (!__bfqq) { -+ rb_link_node(&bfqq->pos_node, parent, p); -+ rb_insert_color(&bfqq->pos_node, bfqq->pos_root); -+ } else -+ bfqq->pos_root = NULL; -+} -+ -+/* -+ * The following function returns true if every queue must receive the -+ * same share of the throughput (this condition is used when deciding -+ * whether idling may be disabled, see the comments in the function -+ * bfq_better_to_idle()). 
-+ * -+ * Such a scenario occurs when: -+ * 1) all active queues have the same weight, -+ * 2) all active queues belong to the same I/O-priority class, -+ * 3) all active groups at the same level in the groups tree have the same -+ * weight, -+ * 4) all active groups at the same level in the groups tree have the same -+ * number of children. -+ * -+ * Unfortunately, keeping the necessary state for evaluating exactly -+ * the last two symmetry sub-conditions above would be quite complex -+ * and time consuming. Therefore this function evaluates, instead, -+ * only the following stronger three sub-conditions, for which it is -+ * much easier to maintain the needed state: -+ * 1) all active queues have the same weight, -+ * 2) all active queues belong to the same I/O-priority class, -+ * 3) there are no active groups. -+ * In particular, the last condition is always true if hierarchical -+ * support or the cgroups interface are not enabled, thus no state -+ * needs to be maintained in this case. -+ */ -+static bool bfq_symmetric_scenario(struct bfq_data *bfqd) -+{ -+ /* -+ * For queue weights to differ, queue_weights_tree must contain -+ * at least two nodes. 
-+ */ -+ bool varied_queue_weights = !RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && -+ (bfqd->queue_weights_tree.rb_node->rb_left || -+ bfqd->queue_weights_tree.rb_node->rb_right); -+ -+ bool multiple_classes_busy = -+ (bfqd->busy_queues[0] && bfqd->busy_queues[1]) || -+ (bfqd->busy_queues[0] && bfqd->busy_queues[2]) || -+ (bfqd->busy_queues[1] && bfqd->busy_queues[2]); -+ -+ bfq_log(bfqd, "varied_queue_weights %d mul_classes %d", -+ varied_queue_weights, multiple_classes_busy); -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ bfq_log(bfqd, "num_groups_with_pending_reqs %u", -+ bfqd->num_groups_with_pending_reqs); -+#endif -+ -+ return !(varied_queue_weights || multiple_classes_busy -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ || bfqd->num_groups_with_pending_reqs > 0 -+#endif -+ ); -+} -+ -+/* -+ * If the weight-counter tree passed as input contains no counter for -+ * the weight of the input queue, then add that counter; otherwise just -+ * increment the existing counter. -+ * -+ * Note that weight-counter trees contain few nodes in mostly symmetric -+ * scenarios. For example, if all queues have the same weight, then the -+ * weight-counter tree for the queues may contain at most one node. -+ * This holds even if low_latency is on, because weight-raised queues -+ * are not inserted in the tree. -+ * In most scenarios, the rate at which nodes are created/destroyed -+ * should be low too. 
-+ */ -+static void bfq_weights_tree_add(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ struct rb_root *root) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ struct rb_node **new = &(root->rb_node), *parent = NULL; -+ -+ /* -+ * Do not insert if the queue is already associated with a -+ * counter, which happens if: -+ * 1) a request arrival has caused the queue to become both -+ * non-weight-raised, and hence change its weight, and -+ * backlogged; in this respect, each of the two events -+ * causes an invocation of this function, -+ * 2) this is the invocation of this function caused by the -+ * second event. This second invocation is actually useless, -+ * and we handle this fact by exiting immediately. More -+ * efficient or clearer solutions might possibly be adopted. -+ */ -+ if (bfqq->weight_counter) -+ return; -+ -+ while (*new) { -+ struct bfq_weight_counter *__counter = container_of(*new, -+ struct bfq_weight_counter, -+ weights_node); -+ parent = *new; -+ -+ if (entity->weight == __counter->weight) { -+ bfqq->weight_counter = __counter; -+ goto inc_counter; -+ } -+ if (entity->weight < __counter->weight) -+ new = &((*new)->rb_left); -+ else -+ new = &((*new)->rb_right); -+ } -+ -+ bfqq->weight_counter = kzalloc(sizeof(struct bfq_weight_counter), -+ GFP_ATOMIC); -+ -+ /* -+ * In the unlucky event of an allocation failure, we just -+ * exit. This will cause the weight of queue to not be -+ * considered in bfq_symmetric_scenario, which, in its turn, -+ * causes the scenario to be deemed wrongly symmetric in case -+ * bfqq's weight would have been the only weight making the -+ * scenario asymmetric. On the bright side, no unbalance will -+ * however occur when bfqq becomes inactive again (the -+ * invocation of this function is triggered by an activation -+ * of queue). In fact, bfq_weights_tree_remove does nothing -+ * if !bfqq->weight_counter. 
-+ */ -+ if (unlikely(!bfqq->weight_counter)) -+ return; -+ -+ bfqq->weight_counter->weight = entity->weight; -+ rb_link_node(&bfqq->weight_counter->weights_node, parent, new); -+ rb_insert_color(&bfqq->weight_counter->weights_node, root); -+ -+inc_counter: -+ bfqq->weight_counter->num_active++; -+ bfqq->ref++; -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "refs %d weight %d symmetric %d", -+ bfqq->ref, -+ entity->weight, -+ bfq_symmetric_scenario(bfqd)); -+} -+ -+/* -+ * Decrement the weight counter associated with the queue, and, if the -+ * counter reaches 0, remove the counter from the tree. -+ * See the comments to the function bfq_weights_tree_add() for considerations -+ * about overhead. -+ */ -+static void __bfq_weights_tree_remove(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ struct rb_root *root) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ if (!bfqq->weight_counter) -+ return; -+ -+ BUG_ON(RB_EMPTY_ROOT(root)); -+ BUG_ON(bfqq->weight_counter->weight != entity->weight); -+ -+ BUG_ON(!bfqq->weight_counter->num_active); -+ bfqq->weight_counter->num_active--; -+ -+ if (bfqq->weight_counter->num_active > 0) -+ goto reset_entity_pointer; -+ -+ rb_erase(&bfqq->weight_counter->weights_node, root); -+ kfree(bfqq->weight_counter); -+ -+reset_entity_pointer: -+ bfqq->weight_counter = NULL; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "refs %d weight %d symmetric %d", -+ bfqq->ref, -+ entity->weight, -+ bfq_symmetric_scenario(bfqd)); -+ bfq_put_queue(bfqq); -+} -+ -+/* -+ * Invoke __bfq_weights_tree_remove on bfqq and decrement the number -+ * of active groups for each queue's inactive parent entity. -+ */ -+static void bfq_weights_tree_remove(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = bfqq->entity.parent; -+ -+ for_each_entity(entity) { -+ struct bfq_sched_data *sd = entity->my_sched_data; -+ -+ BUG_ON(entity->sched_data == NULL); /* -+ * It would mean -+ * that this is -+ * the root group. 
-+ */ -+ -+ if (sd->next_in_service || sd->in_service_entity) { -+ BUG_ON(!entity->in_groups_with_pending_reqs); -+ /* -+ * entity is still active, because either -+ * next_in_service or in_service_entity is not -+ * NULL (see the comments on the definition of -+ * next_in_service for details on why -+ * in_service_entity must be checked too). -+ * -+ * As a consequence, its parent entities are -+ * active as well, and thus this loop must -+ * stop here. -+ */ -+ break; -+ } -+ -+ BUG_ON(!bfqd->num_groups_with_pending_reqs && -+ entity->in_groups_with_pending_reqs); -+ /* -+ * The decrement of num_groups_with_pending_reqs is -+ * not performed immediately upon the deactivation of -+ * entity, but it is delayed to when it also happens -+ * that the first leaf descendant bfqq of entity gets -+ * all its pending requests completed. The following -+ * instructions perform this delayed decrement, if -+ * needed. See the comments on -+ * num_groups_with_pending_reqs for details. -+ */ -+ if (entity->in_groups_with_pending_reqs) { -+ entity->in_groups_with_pending_reqs = false; -+ bfqd->num_groups_with_pending_reqs--; -+ } -+ bfq_log_bfqq(bfqd, bfqq, "num_groups_with_pending_reqs %u", -+ bfqd->num_groups_with_pending_reqs); -+ } -+ -+ /* -+ * Next function is invoked last, because it causes bfqq to be -+ * freed if the following holds: bfqq is not in service and -+ * has no dispatched request. DO NOT use bfqq after the next -+ * function invocation. -+ */ -+ __bfq_weights_tree_remove(bfqd, bfqq, -+ &bfqd->queue_weights_tree); -+} -+ -+/* -+ * Return expired entry, or NULL to just start from scratch in rbtree. 
-+ */ -+static struct request *bfq_check_fifo(struct bfq_queue *bfqq, -+ struct request *last) -+{ -+ struct request *rq; -+ -+ if (bfq_bfqq_fifo_expire(bfqq)) -+ return NULL; -+ -+ bfq_mark_bfqq_fifo_expire(bfqq); -+ -+ rq = rq_entry_fifo(bfqq->fifo.next); -+ -+ if (rq == last || ktime_get_ns() < rq->fifo_time) -+ return NULL; -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "returned %p", rq); -+ BUG_ON(RB_EMPTY_NODE(&rq->rb_node)); -+ return rq; -+} -+ -+static struct request *bfq_find_next_rq(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ struct request *last) -+{ -+ struct rb_node *rbnext = rb_next(&last->rb_node); -+ struct rb_node *rbprev = rb_prev(&last->rb_node); -+ struct request *next, *prev = NULL; -+ -+ BUG_ON(list_empty(&bfqq->fifo)); -+ -+ /* Follow expired path, else get first next available. */ -+ next = bfq_check_fifo(bfqq, last); -+ if (next) { -+ BUG_ON(next == last); -+ return next; -+ } -+ -+ BUG_ON(RB_EMPTY_NODE(&last->rb_node)); -+ -+ if (rbprev) -+ prev = rb_entry_rq(rbprev); -+ -+ if (rbnext) -+ next = rb_entry_rq(rbnext); -+ else { -+ rbnext = rb_first(&bfqq->sort_list); -+ if (rbnext && rbnext != &last->rb_node) -+ next = rb_entry_rq(rbnext); -+ } -+ -+ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last)); -+} -+ -+/* see the definition of bfq_async_charge_factor for details */ -+static unsigned long bfq_serv_to_charge(struct request *rq, -+ struct bfq_queue *bfqq) -+{ -+ if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1 || -+ !bfq_symmetric_scenario(bfqq->bfqd)) -+ return blk_rq_sectors(rq); -+ -+ return blk_rq_sectors(rq) * bfq_async_charge_factor; -+} -+ -+/** -+ * bfq_updated_next_req - update the queue after a new next_rq selection. -+ * @bfqd: the device data the queue belongs to. -+ * @bfqq: the queue to update. -+ * -+ * If the first request of a queue changes we make sure that the queue -+ * has enough budget to serve at least its first request (if the -+ * request has grown). 
We do this because if the queue has not enough -+ * budget for its first request, it has to go through two dispatch -+ * rounds to actually get it dispatched. -+ */ -+static void bfq_updated_next_req(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity); -+ struct request *next_rq = bfqq->next_rq; -+ unsigned long new_budget; -+ -+ if (!next_rq) -+ return; -+ -+ if (bfqq == bfqd->in_service_queue) -+ /* -+ * In order not to break guarantees, budgets cannot be -+ * changed after an entity has been selected. -+ */ -+ return; -+ -+ BUG_ON(entity->tree != &st->active); -+ BUG_ON(entity == entity->sched_data->in_service_entity); -+ -+ new_budget = max_t(unsigned long, -+ max_t(unsigned long, bfqq->max_budget, -+ bfq_serv_to_charge(next_rq, bfqq)), -+ entity->service); -+ if (entity->budget != new_budget) { -+ entity->budget = new_budget; -+ bfq_log_bfqq(bfqd, bfqq, "new budget %lu", -+ new_budget); -+ bfq_requeue_bfqq(bfqd, bfqq, false); -+ } -+} -+ -+static unsigned int bfq_wr_duration(struct bfq_data *bfqd) -+{ -+ u64 dur; -+ -+ if (bfqd->bfq_wr_max_time > 0) -+ return bfqd->bfq_wr_max_time; -+ -+ dur = bfqd->rate_dur_prod; -+ do_div(dur, bfqd->peak_rate); -+ -+ /* -+ * Limit duration between 3 and 25 seconds. The upper limit -+ * has been conservatively set after the following worst case: -+ * on a QEMU/KVM virtual machine -+ * - running in a slow PC -+ * - with a virtual disk stacked on a slow low-end 5400rpm HDD -+ * - serving a heavy I/O workload, such as the sequential reading -+ * of several files -+ * mplayer took 23 seconds to start, if constantly weight-raised. 
-+ * -+ * As for higher values than that accomodating the above bad -+ * scenario, tests show that higher values would often yield -+ * the opposite of the desired result, i.e., would worsen -+ * responsiveness by allowing non-interactive applications to -+ * preserve weight raising for too long. -+ * -+ * On the other end, lower values than 3 seconds make it -+ * difficult for most interactive tasks to complete their jobs -+ * before weight-raising finishes. -+ */ -+ return clamp_val(dur, msecs_to_jiffies(3000), msecs_to_jiffies(25000)); -+} -+ -+/* switch back from soft real-time to interactive weight raising */ -+static void switch_back_to_interactive_wr(struct bfq_queue *bfqq, -+ struct bfq_data *bfqd) -+{ -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ bfqq->last_wr_start_finish = bfqq->wr_start_at_switch_to_srt; -+} -+ -+static void -+bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, -+ struct bfq_io_cq *bic, bool bfq_already_existing) -+{ -+ unsigned int old_wr_coeff; -+ bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq); -+ -+ if (bic->saved_has_short_ttime) -+ bfq_mark_bfqq_has_short_ttime(bfqq); -+ else -+ bfq_clear_bfqq_has_short_ttime(bfqq); -+ -+ if (bic->saved_IO_bound) -+ bfq_mark_bfqq_IO_bound(bfqq); -+ else -+ bfq_clear_bfqq_IO_bound(bfqq); -+ -+ if (unlikely(busy)) -+ old_wr_coeff = bfqq->wr_coeff; -+ -+ bfqq->ttime = bic->saved_ttime; -+ bfqq->wr_coeff = bic->saved_wr_coeff; -+ bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; -+ BUG_ON(time_is_after_jiffies(bfqq->wr_start_at_switch_to_srt)); -+ bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish; -+ bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time; -+ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "bic %p wr_coeff %d start_finish %lu max_time %lu", -+ bic, bfqq->wr_coeff, bfqq->last_wr_start_finish, -+ bfqq->wr_cur_max_time); -+ -+ if 
(bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || -+ time_is_before_jiffies(bfqq->last_wr_start_finish + -+ bfqq->wr_cur_max_time))) { -+ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && -+ !bfq_bfqq_in_large_burst(bfqq) && -+ time_is_after_eq_jiffies(bfqq->wr_start_at_switch_to_srt + -+ bfq_wr_duration(bfqd))) { -+ switch_back_to_interactive_wr(bfqq, bfqd); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "switching back to interactive"); -+ } else { -+ bfqq->wr_coeff = 1; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "switching off wr (%lu + %lu < %lu)", -+ bfqq->last_wr_start_finish, bfqq->wr_cur_max_time, -+ jiffies); -+ } -+ } -+ -+ /* make sure weight will be updated, however we got here */ -+ bfqq->entity.prio_changed = 1; -+ -+ if (likely(!busy)) -+ return; -+ -+ if (old_wr_coeff == 1 && bfqq->wr_coeff > 1) { -+ bfqd->wr_busy_queues++; -+ BUG_ON(bfqd->wr_busy_queues > bfq_tot_busy_queues(bfqd)); -+ } else if (old_wr_coeff > 1 && bfqq->wr_coeff == 1) { -+ bfqd->wr_busy_queues--; -+ BUG_ON(bfqd->wr_busy_queues < 0); -+ } -+} -+ -+static int bfqq_process_refs(struct bfq_queue *bfqq) -+{ -+ int process_refs, io_refs; -+ -+ lockdep_assert_held(&bfqq->bfqd->lock); -+ -+ io_refs = bfqq->allocated; -+ process_refs = bfqq->ref - io_refs - bfqq->entity.on_st - -+ (bfqq->weight_counter != NULL); -+ BUG_ON(process_refs < 0); -+ return process_refs; -+} -+ -+/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */ -+static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct bfq_queue *item; -+ struct hlist_node *n; -+ -+ hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node) -+ hlist_del_init(&item->burst_list_node); -+ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); -+ bfqd->burst_size = 1; -+ bfqd->burst_parent_entity = bfqq->entity.parent; -+} -+ -+/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */ -+static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue 
*bfqq) -+{ -+ /* Increment burst size to take into account also bfqq */ -+ bfqd->burst_size++; -+ -+ bfq_log_bfqq(bfqd, bfqq, "%d", bfqd->burst_size); -+ -+ BUG_ON(bfqd->burst_size > bfqd->bfq_large_burst_thresh); -+ -+ if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) { -+ struct bfq_queue *pos, *bfqq_item; -+ struct hlist_node *n; -+ -+ /* -+ * Enough queues have been activated shortly after each -+ * other to consider this burst as large. -+ */ -+ bfqd->large_burst = true; -+ bfq_log_bfqq(bfqd, bfqq, "large burst started"); -+ -+ /* -+ * We can now mark all queues in the burst list as -+ * belonging to a large burst. -+ */ -+ hlist_for_each_entry(bfqq_item, &bfqd->burst_list, -+ burst_list_node) { -+ bfq_mark_bfqq_in_large_burst(bfqq_item); -+ bfq_log_bfqq(bfqd, bfqq_item, "marked in large burst"); -+ } -+ bfq_mark_bfqq_in_large_burst(bfqq); -+ bfq_log_bfqq(bfqd, bfqq, "marked in large burst"); -+ -+ /* -+ * From now on, and until the current burst finishes, any -+ * new queue being activated shortly after the last queue -+ * was inserted in the burst can be immediately marked as -+ * belonging to a large burst. So the burst list is not -+ * needed any more. Remove it. -+ */ -+ hlist_for_each_entry_safe(pos, n, &bfqd->burst_list, -+ burst_list_node) -+ hlist_del_init(&pos->burst_list_node); -+ } else /* -+ * Burst not yet large: add bfqq to the burst list. Do -+ * not increment the ref counter for bfqq, because bfqq -+ * is removed from the burst list before freeing bfqq -+ * in put_queue. -+ */ -+ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); -+} -+ -+/* -+ * If many queues belonging to the same group happen to be created -+ * shortly after each other, then the processes associated with these -+ * queues have typically a common goal. In particular, bursts of queue -+ * creations are usually caused by services or applications that spawn -+ * many parallel threads/processes. Examples are systemd during boot, -+ * or git grep. 
To help these processes get their job done as soon as -+ * possible, it is usually better to not grant either weight-raising -+ * or device idling to their queues. -+ * -+ * In this comment we describe, firstly, the reasons why this fact -+ * holds, and, secondly, the next function, which implements the main -+ * steps needed to properly mark these queues so that they can then be -+ * treated in a different way. -+ * -+ * The above services or applications benefit mostly from a high -+ * throughput: the quicker the requests of the activated queues are -+ * cumulatively served, the sooner the target job of these queues gets -+ * completed. As a consequence, weight-raising any of these queues, -+ * which also implies idling the device for it, is almost always -+ * counterproductive. In most cases it just lowers throughput. -+ * -+ * On the other hand, a burst of queue creations may be caused also by -+ * the start of an application that does not consist of a lot of -+ * parallel I/O-bound threads. In fact, with a complex application, -+ * several short processes may need to be executed to start-up the -+ * application. In this respect, to start an application as quickly as -+ * possible, the best thing to do is in any case to privilege the I/O -+ * related to the application with respect to all other -+ * I/O. Therefore, the best strategy to start as quickly as possible -+ * an application that causes a burst of queue creations is to -+ * weight-raise all the queues created during the burst. This is the -+ * exact opposite of the best strategy for the other type of bursts. -+ * -+ * In the end, to take the best action for each of the two cases, the -+ * two types of bursts need to be distinguished. Fortunately, this -+ * seems relatively easy, by looking at the sizes of the bursts. In -+ * particular, we found a threshold such that only bursts with a -+ * larger size than that threshold are apparently caused by -+ * services or commands such as systemd or git grep. 
For brevity, -+ * hereafter we call just 'large' these bursts. BFQ *does not* -+ * weight-raise queues whose creation occurs in a large burst. In -+ * addition, for each of these queues BFQ performs or does not perform -+ * idling depending on which choice boosts the throughput more. The -+ * exact choice depends on the device and request pattern at -+ * hand. -+ * -+ * Unfortunately, false positives may occur while an interactive task -+ * is starting (e.g., an application is being started). The -+ * consequence is that the queues associated with the task do not -+ * enjoy weight raising as expected. Fortunately these false positives -+ * are very rare. They typically occur if some service happens to -+ * start doing I/O exactly when the interactive task starts. -+ * -+ * Turning back to the next function, it implements all the steps -+ * needed to detect the occurrence of a large burst and to properly -+ * mark all the queues belonging to it (so that they can then be -+ * treated in a different way). This goal is achieved by maintaining a -+ * "burst list" that holds, temporarily, the queues that belong to the -+ * burst in progress. The list is then used to mark these queues as -+ * belonging to a large burst if the burst does become large. The main -+ * steps are the following. -+ * -+ * . when the very first queue is created, the queue is inserted into the -+ * list (as it could be the first queue in a possible burst) -+ * -+ * . if the current burst has not yet become large, and a queue Q that does -+ * not yet belong to the burst is activated shortly after the last time -+ * at which a new queue entered the burst list, then the function appends -+ * Q to the burst list -+ * -+ * . if, as a consequence of the previous step, the burst size reaches -+ * the large-burst threshold, then -+ * -+ * . all the queues in the burst list are marked as belonging to a -+ * large burst -+ * -+ * . 
the burst list is deleted; in fact, the burst list already served -+ * its purpose (keeping temporarily track of the queues in a burst, -+ * so as to be able to mark them as belonging to a large burst in the -+ * previous sub-step), and now is not needed any more -+ * -+ * . the device enters a large-burst mode -+ * -+ * . if a queue Q that does not belong to the burst is created while -+ * the device is in large-burst mode and shortly after the last time -+ * at which a queue either entered the burst list or was marked as -+ * belonging to the current large burst, then Q is immediately marked -+ * as belonging to a large burst. -+ * -+ * . if a queue Q that does not belong to the burst is created a while -+ * later, i.e., not shortly after, than the last time at which a queue -+ * either entered the burst list or was marked as belonging to the -+ * current large burst, then the current burst is deemed as finished and: -+ * -+ * . the large-burst mode is reset if set -+ * -+ * . the burst list is emptied -+ * -+ * . Q is inserted in the burst list, as Q may be the first queue -+ * in a possible new burst (then the burst list contains just Q -+ * after this step). -+ */ -+static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ /* -+ * If bfqq is already in the burst list or is part of a large -+ * burst, or finally has just been split, then there is -+ * nothing else to do. -+ */ -+ if (!hlist_unhashed(&bfqq->burst_list_node) || -+ bfq_bfqq_in_large_burst(bfqq) || -+ time_is_after_eq_jiffies(bfqq->split_time + -+ msecs_to_jiffies(10))) -+ return; -+ -+ /* -+ * If bfqq's creation happens late enough, or bfqq belongs to -+ * a different group than the burst group, then the current -+ * burst is finished, and related data structures must be -+ * reset. -+ * -+ * In this respect, consider the special case where bfqq is -+ * the very first queue created after BFQ is selected for this -+ * device. 
In this case, last_ins_in_burst and -+ * burst_parent_entity are not yet significant when we get -+ * here. But it is easy to verify that, whether or not the -+ * following condition is true, bfqq will end up being -+ * inserted into the burst list. In particular the list will -+ * happen to contain only bfqq. And this is exactly what has -+ * to happen, as bfqq may be the first queue of the first -+ * burst. -+ */ -+ if (time_is_before_jiffies(bfqd->last_ins_in_burst + -+ bfqd->bfq_burst_interval) || -+ bfqq->entity.parent != bfqd->burst_parent_entity) { -+ bfqd->large_burst = false; -+ bfq_reset_burst_list(bfqd, bfqq); -+ bfq_log_bfqq(bfqd, bfqq, -+ "late activation or different group"); -+ goto end; -+ } -+ -+ /* -+ * If we get here, then bfqq is being activated shortly after the -+ * last queue. So, if the current burst is also large, we can mark -+ * bfqq as belonging to this large burst immediately. -+ */ -+ if (bfqd->large_burst) { -+ bfq_log_bfqq(bfqd, bfqq, "marked in burst"); -+ bfq_mark_bfqq_in_large_burst(bfqq); -+ goto end; -+ } -+ -+ /* -+ * If we get here, then a large-burst state has not yet been -+ * reached, but bfqq is being activated shortly after the last -+ * queue. Then we add bfqq to the burst. -+ */ -+ bfq_add_to_burst(bfqd, bfqq); -+end: -+ /* -+ * At this point, bfqq either has been added to the current -+ * burst or has caused the current burst to terminate and a -+ * possible new burst to start. In particular, in the second -+ * case, bfqq has become the first queue in the possible new -+ * burst. In both cases last_ins_in_burst needs to be moved -+ * forward. 
-+ */ -+ bfqd->last_ins_in_burst = jiffies; -+ -+} -+ -+static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ if (entity->budget < entity->service) { -+ pr_crit("budget %d service %d\n", -+ entity->budget, entity->service); -+ BUG(); -+ } -+ return entity->budget - entity->service; -+} -+ -+/* -+ * If enough samples have been computed, return the current max budget -+ * stored in bfqd, which is dynamically updated according to the -+ * estimated disk peak rate; otherwise return the default max budget -+ */ -+static int bfq_max_budget(struct bfq_data *bfqd) -+{ -+ if (bfqd->budgets_assigned < bfq_stats_min_budgets) -+ return bfq_default_max_budget; -+ else -+ return bfqd->bfq_max_budget; -+} -+ -+/* -+ * Return min budget, which is a fraction of the current or default -+ * max budget (trying with 1/32) -+ */ -+static int bfq_min_budget(struct bfq_data *bfqd) -+{ -+ if (bfqd->budgets_assigned < bfq_stats_min_budgets) -+ return bfq_default_max_budget / 32; -+ else -+ return bfqd->bfq_max_budget / 32; -+} -+ -+static void bfq_bfqq_expire(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ bool compensate, -+ enum bfqq_expiration reason); -+ -+/* -+ * The next function, invoked after the input queue bfqq switches from -+ * idle to busy, updates the budget of bfqq. The function also tells -+ * whether the in-service queue should be expired, by returning -+ * true. The purpose of expiring the in-service queue is to give bfqq -+ * the chance to possibly preempt the in-service queue, and the reason -+ * for preempting the in-service queue is to achieve one of the two -+ * goals below. -+ * -+ * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has -+ * expired because it has remained idle. 
In particular, bfqq may have -+ * expired for one of the following two reasons: -+ * -+ * - BFQ_BFQQ_NO_MORE_REQUEST bfqq did not enjoy any device idling and -+ * did not make it to issue a new request before its last request -+ * was served; -+ * -+ * - BFQ_BFQQ_TOO_IDLE bfqq did enjoy device idling, but did not issue -+ * a new request before the expiration of the idling-time. -+ * -+ * Even if bfqq has expired for one of the above reasons, the process -+ * associated with the queue may be however issuing requests greedily, -+ * and thus be sensitive to the bandwidth it receives (bfqq may have -+ * remained idle for other reasons: CPU high load, bfqq not enjoying -+ * idling, I/O throttling somewhere in the path from the process to -+ * the I/O scheduler, ...). But if, after every expiration for one of -+ * the above two reasons, bfqq has to wait for the service of at least -+ * one full budget of another queue before being served again, then -+ * bfqq is likely to get a much lower bandwidth or resource time than -+ * its reserved ones. To address this issue, two countermeasures need -+ * to be taken. -+ * -+ * First, the budget and the timestamps of bfqq need to be updated in -+ * a special way on bfqq reactivation: they need to be updated as if -+ * bfqq did not remain idle and did not expire. In fact, if they are -+ * computed as if bfqq expired and remained idle until reactivation, -+ * then the process associated with bfqq is treated as if, instead of -+ * being greedy, it stopped issuing requests when bfqq remained idle, -+ * and restarts issuing requests only on this reactivation. In other -+ * words, the scheduler does not help the process recover the "service -+ * hole" between bfqq expiration and reactivation. As a consequence, -+ * the process receives a lower bandwidth than its reserved one. 
In -+ * contrast, to recover this hole, the budget must be updated as if -+ * bfqq was not expired at all before this reactivation, i.e., it must -+ * be set to the value of the remaining budget when bfqq was -+ * expired. Along the same line, timestamps need to be assigned the -+ * value they had the last time bfqq was selected for service, i.e., -+ * before last expiration. Thus timestamps need to be back-shifted -+ * with respect to their normal computation (see [1] for more details -+ * on this tricky aspect). -+ * -+ * Secondly, to allow the process to recover the hole, the in-service -+ * queue must be expired too, to give bfqq the chance to preempt it -+ * immediately. In fact, if bfqq has to wait for a full budget of the -+ * in-service queue to be completed, then it may become impossible to -+ * let the process recover the hole, even if the back-shifted -+ * timestamps of bfqq are lower than those of the in-service queue. If -+ * this happens for most or all of the holes, then the process may not -+ * receive its reserved bandwidth. In this respect, it is worth noting -+ * that, being the service of outstanding requests unpreemptible, a -+ * little fraction of the holes may however be unrecoverable, thereby -+ * causing a little loss of bandwidth. -+ * -+ * The last important point is detecting whether bfqq does need this -+ * bandwidth recovery. In this respect, the next function deems the -+ * process associated with bfqq greedy, and thus allows it to recover -+ * the hole, if: 1) the process is waiting for the arrival of a new -+ * request (which implies that bfqq expired for one of the above two -+ * reasons), and 2) such a request has arrived soon. The first -+ * condition is controlled through the flag non_blocking_wait_rq, -+ * while the second through the flag arrived_in_time. If both -+ * conditions hold, then the function computes the budget in the -+ * above-described special way, and signals that the in-service queue -+ * should be expired. 
Timestamp back-shifting is done later in -+ * __bfq_activate_entity. -+ * -+ * 2. Reduce latency. Even if timestamps are not backshifted to let -+ * the process associated with bfqq recover a service hole, bfqq may -+ * however happen to have, after being (re)activated, a lower finish -+ * timestamp than the in-service queue. That is, the next budget of -+ * bfqq may have to be completed before the one of the in-service -+ * queue. If this is the case, then preempting the in-service queue -+ * allows this goal to be achieved, apart from the unpreemptible, -+ * outstanding requests mentioned above. -+ * -+ * Unfortunately, regardless of which of the above two goals one wants -+ * to achieve, service trees need first to be updated to know whether -+ * the in-service queue must be preempted. To have service trees -+ * correctly updated, the in-service queue must be expired and -+ * rescheduled, and bfqq must be scheduled too. This is one of the -+ * most costly operations (in future versions, the scheduling -+ * mechanism may be re-designed in such a way to make it possible to -+ * know whether preemption is needed without needing to update service -+ * trees). In addition, queue preemptions almost always cause random -+ * I/O, and thus loss of throughput. Because of these facts, the next -+ * function adopts the following simple scheme to avoid both costly -+ * operations and too frequent preemptions: it requests the expiration -+ * of the in-service queue (unconditionally) only for queues that need -+ * to recover a hole, or that either are weight-raised or deserve to -+ * be weight-raised. 
-+ */ -+static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ bool arrived_in_time, -+ bool wr_or_deserves_wr) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ /* -+ * In the next compound condition, we check also whether there -+ * is some budget left, because otherwise there is no point in -+ * trying to go on serving bfqq with this same budget: bfqq -+ * would be expired immediately after being selected for -+ * service. This would only cause useless overhead. -+ */ -+ if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time && -+ bfq_bfqq_budget_left(bfqq) > 0) { -+ /* -+ * We do not clear the flag non_blocking_wait_rq here, as -+ * the latter is used in bfq_activate_bfqq to signal -+ * that timestamps need to be back-shifted (and is -+ * cleared right after). -+ */ -+ -+ /* -+ * In next assignment we rely on that either -+ * entity->service or entity->budget are not updated -+ * on expiration if bfqq is empty (see -+ * __bfq_bfqq_recalc_budget). Thus both quantities -+ * remain unchanged after such an expiration, and the -+ * following statement therefore assigns to -+ * entity->budget the remaining budget on such an -+ * expiration. -+ */ -+ BUG_ON(bfqq->max_budget < 0); -+ entity->budget = min_t(unsigned long, -+ bfq_bfqq_budget_left(bfqq), -+ bfqq->max_budget); -+ -+ BUG_ON(entity->budget < 0); -+ -+ /* -+ * At this point, we have used entity->service to get -+ * the budget left (needed for updating -+ * entity->budget). Thus we finally can, and have to, -+ * reset entity->service. The latter must be reset -+ * because bfqq would otherwise be charged again for -+ * the service it has received during its previous -+ * service slot(s). -+ */ -+ entity->service = 0; -+ -+ return true; -+ } -+ -+ /* -+ * We can finally complete expiration, by setting service to 0. 
-+ */ -+ entity->service = 0; -+ BUG_ON(bfqq->max_budget < 0); -+ entity->budget = max_t(unsigned long, bfqq->max_budget, -+ bfq_serv_to_charge(bfqq->next_rq, bfqq)); -+ BUG_ON(entity->budget < 0); -+ -+ bfq_clear_bfqq_non_blocking_wait_rq(bfqq); -+ return wr_or_deserves_wr; -+} -+ -+/* -+ * Return the farthest past time instant according to jiffies -+ * macros. -+ */ -+static unsigned long bfq_smallest_from_now(void) -+{ -+ return jiffies - MAX_JIFFY_OFFSET; -+} -+ -+static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ unsigned int old_wr_coeff, -+ bool wr_or_deserves_wr, -+ bool interactive, -+ bool in_burst, -+ bool soft_rt) -+{ -+ if (old_wr_coeff == 1 && wr_or_deserves_wr) { -+ /* start a weight-raising period */ -+ if (interactive) { -+ bfqq->service_from_wr = 0; -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ } else { -+ /* -+ * No interactive weight raising in progress -+ * here: assign minus infinity to -+ * wr_start_at_switch_to_srt, to make sure -+ * that, at the end of the soft-real-time -+ * weight raising periods that is starting -+ * now, no interactive weight-raising period -+ * may be wrongly considered as still in -+ * progress (and thus actually started by -+ * mistake). -+ */ -+ bfqq->wr_start_at_switch_to_srt = -+ bfq_smallest_from_now(); -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff * -+ BFQ_SOFTRT_WEIGHT_FACTOR; -+ bfqq->wr_cur_max_time = -+ bfqd->bfq_wr_rt_max_time; -+ } -+ /* -+ * If needed, further reduce budget to make sure it is -+ * close to bfqq's backlog, so as to reduce the -+ * scheduling-error component due to a too large -+ * budget. Do not care about throughput consequences, -+ * but only about latency. Finally, do not assign a -+ * too small budget either, to avoid increasing -+ * latency by causing too frequent expirations. 
-+ */ -+ bfqq->entity.budget = min_t(unsigned long, -+ bfqq->entity.budget, -+ 2 * bfq_min_budget(bfqd)); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "wrais starting at %lu, rais_max_time %u", -+ jiffies, -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } else if (old_wr_coeff > 1) { -+ if (interactive) { /* update wr coeff and duration */ -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ } else if (in_burst) { -+ bfqq->wr_coeff = 1; -+ bfq_log_bfqq(bfqd, bfqq, -+ "wrais ending at %lu, rais_max_time %u", -+ jiffies, -+ jiffies_to_msecs(bfqq-> -+ wr_cur_max_time)); -+ } else if (soft_rt) { -+ /* -+ * The application is now or still meeting the -+ * requirements for being deemed soft rt. We -+ * can then correctly and safely (re)charge -+ * the weight-raising duration for the -+ * application with the weight-raising -+ * duration for soft rt applications. -+ * -+ * In particular, doing this recharge now, i.e., -+ * before the weight-raising period for the -+ * application finishes, reduces the probability -+ * of the following negative scenario: -+ * 1) the weight of a soft rt application is -+ * raised at startup (as for any newly -+ * created application), -+ * 2) since the application is not interactive, -+ * at a certain time weight-raising is -+ * stopped for the application, -+ * 3) at that time the application happens to -+ * still have pending requests, and hence -+ * is destined to not have a chance to be -+ * deemed soft rt before these requests are -+ * completed (see the comments to the -+ * function bfq_bfqq_softrt_next_start() -+ * for details on soft rt detection), -+ * 4) these pending requests experience a high -+ * latency because the application is not -+ * weight-raised while they are pending. 
-+ */ -+ if (bfqq->wr_cur_max_time != -+ bfqd->bfq_wr_rt_max_time) { -+ bfqq->wr_start_at_switch_to_srt = -+ bfqq->last_wr_start_finish; -+ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); -+ -+ bfqq->wr_cur_max_time = -+ bfqd->bfq_wr_rt_max_time; -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff * -+ BFQ_SOFTRT_WEIGHT_FACTOR; -+ bfq_log_bfqq(bfqd, bfqq, -+ "switching to soft_rt wr"); -+ } else -+ bfq_log_bfqq(bfqd, bfqq, -+ "moving forward soft_rt wr duration"); -+ bfqq->last_wr_start_finish = jiffies; -+ } -+ } -+} -+ -+static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ return bfqq->dispatched == 0 && -+ time_is_before_jiffies( -+ bfqq->budget_timeout + -+ bfqd->bfq_wr_min_idle_time); -+} -+ -+static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ int old_wr_coeff, -+ struct request *rq, -+ bool *interactive) -+{ -+ bool soft_rt, in_burst, wr_or_deserves_wr, -+ bfqq_wants_to_preempt, -+ idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq), -+ /* -+ * See the comments on -+ * bfq_bfqq_update_budg_for_activation for -+ * details on the usage of the next variable. 
-+ */ -+ arrived_in_time = ktime_get_ns() <= -+ bfqq->ttime.last_end_request + -+ bfqd->bfq_slice_idle * 3; -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "bfq_add_request non-busy: " -+ "jiffies %lu, in_time %d, idle_long %d busyw %d " -+ "wr_coeff %u", -+ jiffies, arrived_in_time, -+ idle_for_long_time, -+ bfq_bfqq_non_blocking_wait_rq(bfqq), -+ old_wr_coeff); -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ -+ BUG_ON(bfqq == bfqd->in_service_queue); -+ -+ /* -+ * bfqq deserves to be weight-raised if: -+ * - it is sync, -+ * - it does not belong to a large burst, -+ * - it has been idle for enough time or is soft real-time, -+ * - is linked to a bfq_io_cq (it is not shared in any sense) -+ */ -+ in_burst = bfq_bfqq_in_large_burst(bfqq); -+ soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && -+ !in_burst && -+ time_is_before_jiffies(bfqq->soft_rt_next_start) && -+ bfqq->dispatched == 0; -+ *interactive = -+ !in_burst && -+ idle_for_long_time; -+ wr_or_deserves_wr = bfqd->low_latency && -+ (bfqq->wr_coeff > 1 || -+ (bfq_bfqq_sync(bfqq) && -+ bfqq->bic && (*interactive || soft_rt))); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "bfq_add_request: " -+ "in_burst %d, " -+ "soft_rt %d (next %lu), inter %d, bic %p", -+ bfq_bfqq_in_large_burst(bfqq), soft_rt, -+ bfqq->soft_rt_next_start, -+ *interactive, -+ bfqq->bic); -+ -+ /* -+ * Using the last flag, update budget and check whether bfqq -+ * may want to preempt the in-service queue. -+ */ -+ bfqq_wants_to_preempt = -+ bfq_bfqq_update_budg_for_activation(bfqd, bfqq, -+ arrived_in_time, -+ wr_or_deserves_wr); -+ -+ /* -+ * If bfqq happened to be activated in a burst, but has been -+ * idle for much more than an interactive queue, then we -+ * assume that, in the overall I/O initiated in the burst, the -+ * I/O associated with bfqq is finished. So bfqq does not need -+ * to be treated as a queue belonging to a burst -+ * anymore. 
Accordingly, we reset bfqq's in_large_burst flag -+ * if set, and remove bfqq from the burst list if it's -+ * there. We do not decrement burst_size, because the fact -+ * that bfqq does not need to belong to the burst list any -+ * more does not invalidate the fact that bfqq was created in -+ * a burst. -+ */ -+ if (likely(!bfq_bfqq_just_created(bfqq)) && -+ idle_for_long_time && -+ time_is_before_jiffies( -+ bfqq->budget_timeout + -+ msecs_to_jiffies(10000))) { -+ hlist_del_init(&bfqq->burst_list_node); -+ bfq_clear_bfqq_in_large_burst(bfqq); -+ } -+ -+ bfq_clear_bfqq_just_created(bfqq); -+ -+ if (!bfq_bfqq_IO_bound(bfqq)) { -+ if (arrived_in_time) { -+ bfqq->requests_within_timer++; -+ if (bfqq->requests_within_timer >= -+ bfqd->bfq_requests_within_timer) -+ bfq_mark_bfqq_IO_bound(bfqq); -+ } else -+ bfqq->requests_within_timer = 0; -+ bfq_log_bfqq(bfqd, bfqq, "requests in time %d", -+ bfqq->requests_within_timer); -+ } -+ -+ if (bfqd->low_latency) { -+ if (unlikely(time_is_after_jiffies(bfqq->split_time))) -+ /* wraparound */ -+ bfqq->split_time = -+ jiffies - bfqd->bfq_wr_min_idle_time - 1; -+ -+ if (time_is_before_jiffies(bfqq->split_time + -+ bfqd->bfq_wr_min_idle_time)) { -+ bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq, -+ old_wr_coeff, -+ wr_or_deserves_wr, -+ *interactive, -+ in_burst, -+ soft_rt); -+ -+ if (old_wr_coeff != bfqq->wr_coeff) -+ bfqq->entity.prio_changed = 1; -+ } -+ } -+ -+ bfqq->last_idle_bklogged = jiffies; -+ bfqq->service_from_backlogged = 0; -+ bfq_clear_bfqq_softrt_update(bfqq); -+ -+ bfq_add_bfqq_busy(bfqd, bfqq); -+ -+ /* -+ * Expire in-service queue only if preemption may be needed -+ * for guarantees. In this respect, the function -+ * next_queue_may_preempt just checks a simple, necessary -+ * condition, and not a sufficient condition based on -+ * timestamps. 
In fact, for the latter condition to be -+ * evaluated, timestamps would need first to be updated, and -+ * this operation is quite costly (see the comments on the -+ * function bfq_bfqq_update_budg_for_activation). -+ */ -+ if (bfqd->in_service_queue && bfqq_wants_to_preempt && -+ bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff && -+ next_queue_may_preempt(bfqd)) { -+ struct bfq_queue *in_serv = -+ bfqd->in_service_queue; -+ BUG_ON(in_serv == bfqq); -+ -+ bfq_bfqq_expire(bfqd, bfqd->in_service_queue, -+ false, BFQ_BFQQ_PREEMPTED); -+ } -+} -+ -+static void bfq_add_request(struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ struct bfq_data *bfqd = bfqq->bfqd; -+ struct request *next_rq, *prev; -+ unsigned int old_wr_coeff = bfqq->wr_coeff; -+ bool interactive = false; -+ -+ bfq_log_bfqq(bfqd, bfqq, "size %u %s", -+ blk_rq_sectors(rq), rq_is_sync(rq) ? "S" : "A"); -+ -+ if (bfqq->wr_coeff > 1) /* queue is being weight-raised */ -+ bfq_log_bfqq(bfqd, bfqq, -+ "raising period dur %u/%u msec, old coeff %u, w %d(%d)", -+ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), -+ jiffies_to_msecs(bfqq->wr_cur_max_time), -+ bfqq->wr_coeff, -+ bfqq->entity.weight, bfqq->entity.orig_weight); -+ -+ bfqq->queued[rq_is_sync(rq)]++; -+ bfqd->queued++; -+ -+ BUG_ON(!RQ_BFQQ(rq)); -+ BUG_ON(RQ_BFQQ(rq) != bfqq); -+ WARN_ON(blk_rq_sectors(rq) == 0); -+ -+ elv_rb_add(&bfqq->sort_list, rq); -+ -+ /* -+ * Check if this request is a better next-to-serve candidate. -+ */ -+ prev = bfqq->next_rq; -+ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); -+ BUG_ON(!next_rq); -+ BUG_ON(!RQ_BFQQ(next_rq)); -+ BUG_ON(RQ_BFQQ(next_rq) != bfqq); -+ bfqq->next_rq = next_rq; -+ -+ /* -+ * Adjust priority tree position, if next_rq changes. -+ */ -+ if (prev != bfqq->next_rq) -+ bfq_pos_tree_add_move(bfqd, bfqq); -+ -+ if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... 
*/ -+ bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff, -+ rq, &interactive); -+ else { -+ if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) && -+ time_is_before_jiffies( -+ bfqq->last_wr_start_finish + -+ bfqd->bfq_wr_min_inter_arr_async)) { -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ -+ bfqd->wr_busy_queues++; -+ BUG_ON(bfqd->wr_busy_queues > bfq_tot_busy_queues(bfqd)); -+ bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqd, bfqq, -+ "non-idle wrais starting, " -+ "wr_max_time %u wr_busy %d", -+ jiffies_to_msecs(bfqq->wr_cur_max_time), -+ bfqd->wr_busy_queues); -+ } -+ if (prev != bfqq->next_rq) -+ bfq_updated_next_req(bfqd, bfqq); -+ } -+ -+ /* -+ * Assign jiffies to last_wr_start_finish in the following -+ * cases: -+ * -+ * . if bfqq is not going to be weight-raised, because, for -+ * non weight-raised queues, last_wr_start_finish stores the -+ * arrival time of the last request; as of now, this piece -+ * of information is used only for deciding whether to -+ * weight-raise async queues -+ * -+ * . if bfqq is not weight-raised, because, if bfqq is now -+ * switching to weight-raised, then last_wr_start_finish -+ * stores the time when weight-raising starts -+ * -+ * . if bfqq is interactive, because, regardless of whether -+ * bfqq is currently weight-raised, the weight-raising -+ * period must start or restart (this case is considered -+ * separately because it is not detected by the above -+ * conditions, if bfqq is already weight-raised) -+ * -+ * last_wr_start_finish has to be updated also if bfqq is soft -+ * real-time, because the weight-raising period is constantly -+ * restarted on idle-to-busy transitions for these queues, but -+ * this is already done in bfq_bfqq_handle_idle_busy_switch if -+ * needed. 
-+ */ -+ if (bfqd->low_latency && -+ (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive)) -+ bfqq->last_wr_start_finish = jiffies; -+} -+ -+static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, -+ struct bio *bio, -+ struct request_queue *q) -+{ -+ struct bfq_queue *bfqq = bfqd->bio_bfqq; -+ -+ BUG_ON(!bfqd->bio_bfqq_set); -+ -+ if (bfqq) -+ return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); -+ -+ return NULL; -+} -+ -+static sector_t get_sdist(sector_t last_pos, struct request *rq) -+{ -+ sector_t sdist = 0; -+ -+ if (last_pos) { -+ if (last_pos < blk_rq_pos(rq)) -+ sdist = blk_rq_pos(rq) - last_pos; -+ else -+ sdist = last_pos - blk_rq_pos(rq); -+ } -+ -+ return sdist; -+} -+ -+#if 0 /* Still not clear if we can do without next two functions */ -+static void bfq_activate_request(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ bfqd->rq_in_driver++; -+} -+ -+static void bfq_deactivate_request(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ -+ BUG_ON(bfqd->rq_in_driver == 0); -+ bfqd->rq_in_driver--; -+} -+#endif -+ -+static void bfq_remove_request(struct request_queue *q, -+ struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ struct bfq_data *bfqd = bfqq->bfqd; -+ const int sync = rq_is_sync(rq); -+ -+ BUG_ON(bfqq->entity.service > bfqq->entity.budget); -+ -+ if (bfqq->next_rq == rq) { -+ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); -+ if (bfqq->next_rq && !RQ_BFQQ(bfqq->next_rq)) { -+ pr_crit("no bfqq! for next rq %p bfqq %p\n", -+ bfqq->next_rq, bfqq); -+ } -+ -+ BUG_ON(bfqq->next_rq && !RQ_BFQQ(bfqq->next_rq)); -+ if (bfqq->next_rq && RQ_BFQQ(bfqq->next_rq) != bfqq) { -+ pr_crit( -+ "wrong bfqq! 
for next rq %p, rq_bfqq %p bfqq %p\n", -+ bfqq->next_rq, RQ_BFQQ(bfqq->next_rq), bfqq); -+ } -+ BUG_ON(bfqq->next_rq && RQ_BFQQ(bfqq->next_rq) != bfqq); -+ -+ bfq_updated_next_req(bfqd, bfqq); -+ } -+ -+ if (rq->queuelist.prev != &rq->queuelist) -+ list_del_init(&rq->queuelist); -+ BUG_ON(bfqq->queued[sync] == 0); -+ bfqq->queued[sync]--; -+ bfqd->queued--; -+ elv_rb_del(&bfqq->sort_list, rq); -+ -+ elv_rqhash_del(q, rq); -+ if (q->last_merge == rq) -+ q->last_merge = NULL; -+ -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { -+ bfqq->next_rq = NULL; -+ -+ BUG_ON(bfqq->entity.budget < 0); -+ -+ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) { -+ BUG_ON(bfqq->ref < 2); /* referred by rq and on tree */ -+ bfq_del_bfqq_busy(bfqd, bfqq, false); -+ /* -+ * bfqq emptied. In normal operation, when -+ * bfqq is empty, bfqq->entity.service and -+ * bfqq->entity.budget must contain, -+ * respectively, the service received and the -+ * budget used last time bfqq emptied. These -+ * facts do not hold in this case, as at least -+ * this last removal occurred while bfqq is -+ * not in service. To avoid inconsistencies, -+ * reset both bfqq->entity.service and -+ * bfqq->entity.budget, if bfqq has still a -+ * process that may issue I/O requests to it. -+ */ -+ bfqq->entity.budget = bfqq->entity.service = 0; -+ } -+ -+ /* -+ * Remove queue from request-position tree as it is empty. 
-+ */ -+ if (bfqq->pos_root) { -+ rb_erase(&bfqq->pos_node, bfqq->pos_root); -+ bfqq->pos_root = NULL; -+ } -+ } else { -+ BUG_ON(!bfqq->next_rq); -+ bfq_pos_tree_add_move(bfqd, bfqq); -+ } -+ -+ if (rq->cmd_flags & REQ_META) { -+ BUG_ON(bfqq->meta_pending == 0); -+ bfqq->meta_pending--; -+ } -+} -+ -+static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) -+{ -+ struct request_queue *q = hctx->queue; -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct request *free = NULL; -+ /* -+ * bfq_bic_lookup grabs the queue_lock: invoke it now and -+ * store its return value for later use, to avoid nesting -+ * queue_lock inside the bfqd->lock. We assume that the bic -+ * returned by bfq_bic_lookup does not go away before -+ * bfqd->lock is taken. -+ */ -+ struct bfq_io_cq *bic = bfq_bic_lookup(bfqd, current->io_context, q); -+ bool ret; -+ -+ spin_lock_irq(&bfqd->lock); -+ -+ if (bic) -+ bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); -+ else -+ bfqd->bio_bfqq = NULL; -+ bfqd->bio_bic = bic; -+ /* Set next flag just for testing purposes */ -+ bfqd->bio_bfqq_set = true; -+ -+ ret = blk_mq_sched_try_merge(q, bio, &free); -+ -+ /* -+ * XXX Not yet freeing without lock held, to avoid an -+ * inconsistency with respect to the lock-protected invocation -+ * of blk_mq_sched_try_insert_merge in bfq_bio_merge. Waiting -+ * for clarifications from Jens. 
-+ */ -+ if (free) -+ blk_mq_free_request(free); -+ bfqd->bio_bfqq_set = false; -+ spin_unlock_irq(&bfqd->lock); -+ -+ return ret; -+} -+ -+static int bfq_request_merge(struct request_queue *q, struct request **req, -+ struct bio *bio) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct request *__rq; -+ -+ __rq = bfq_find_rq_fmerge(bfqd, bio, q); -+ if (__rq && elv_bio_merge_ok(__rq, bio)) { -+ *req = __rq; -+ bfq_log(bfqd, "req %p", __rq); -+ -+ return ELEVATOR_FRONT_MERGE; -+ } -+ -+ return ELEVATOR_NO_MERGE; -+} -+ -+static struct bfq_queue *bfq_init_rq(struct request *rq); -+ -+static void bfq_request_merged(struct request_queue *q, struct request *req, -+ enum elv_merge type) -+{ -+ BUG_ON(req->rq_flags & RQF_DISP_LIST); -+ -+ if (type == ELEVATOR_FRONT_MERGE && -+ rb_prev(&req->rb_node) && -+ blk_rq_pos(req) < -+ blk_rq_pos(container_of(rb_prev(&req->rb_node), -+ struct request, rb_node))) { -+ struct bfq_queue *bfqq = bfq_init_rq(req); -+ struct bfq_data *bfqd = bfqq->bfqd; -+ struct request *prev, *next_rq; -+ -+ /* Reposition request in its sort_list */ -+ elv_rb_del(&bfqq->sort_list, req); -+ BUG_ON(!RQ_BFQQ(req)); -+ BUG_ON(RQ_BFQQ(req) != bfqq); -+ elv_rb_add(&bfqq->sort_list, req); -+ -+ /* Choose next request to be served for bfqq */ -+ prev = bfqq->next_rq; -+ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, -+ bfqd->last_position); -+ BUG_ON(!next_rq); -+ -+ bfqq->next_rq = next_rq; -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "req %p prev %p next_rq %p bfqq %p", -+ req, prev, next_rq, bfqq); -+ -+ /* -+ * If next_rq changes, update both the queue's budget to -+ * fit the new request and the queue's position in its -+ * rq_pos_tree. -+ */ -+ if (prev != bfqq->next_rq) { -+ bfq_updated_next_req(bfqd, bfqq); -+ bfq_pos_tree_add_move(bfqd, bfqq); -+ } -+ } -+} -+ -+/* -+ * This function is called to notify the scheduler that the requests -+ * rq and 'next' have been merged, with 'next' going away. 
BFQ -+ * exploits this hook to address the following issue: if 'next' has a -+ * fifo_time lower that rq, then the fifo_time of rq must be set to -+ * the value of 'next', to not forget the greater age of 'next'. -+ * -+ * NOTE: in this function we assume that rq is in a bfq_queue, basing -+ * on that rq is picked from the hash table q->elevator->hash, which, -+ * in its turn, is filled only with I/O requests present in -+ * bfq_queues, while BFQ is in use for the request queue q. In fact, -+ * the function that fills this hash table (elv_rqhash_add) is called -+ * only by bfq_insert_request. -+ */ -+static void bfq_requests_merged(struct request_queue *q, struct request *rq, -+ struct request *next) -+{ -+ struct bfq_queue *bfqq = bfq_init_rq(rq), -+ *next_bfqq = bfq_init_rq(next); -+ -+ BUG_ON(!RQ_BFQQ(rq)); -+ BUG_ON(!RQ_BFQQ(next)); /* this does not imply next is in a bfqq */ -+ BUG_ON(rq->rq_flags & RQF_DISP_LIST); -+ BUG_ON(next->rq_flags & RQF_DISP_LIST); -+ -+ lockdep_assert_held(&bfqq->bfqd->lock); -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "rq %p next %p bfqq %p next_bfqq %p", -+ rq, next, bfqq, next_bfqq); -+ -+ /* -+ * If next and rq belong to the same bfq_queue and next is older -+ * than rq, then reposition rq in the fifo (by substituting next -+ * with rq). Otherwise, if next and rq belong to different -+ * bfq_queues, never reposition rq: in fact, we would have to -+ * reposition it with respect to next's position in its own fifo, -+ * which would most certainly be too expensive with respect to -+ * the benefits. 
-+ */ -+ if (bfqq == next_bfqq && -+ !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && -+ next->fifo_time < rq->fifo_time) { -+ list_del_init(&rq->queuelist); -+ list_replace_init(&next->queuelist, &rq->queuelist); -+ rq->fifo_time = next->fifo_time; -+ } -+ -+ if (bfqq->next_rq == next) -+ bfqq->next_rq = rq; -+ -+ bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); -+} -+ -+/* Must be called with bfqq != NULL */ -+static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) -+{ -+ BUG_ON(!bfqq); -+ -+ if (bfq_bfqq_busy(bfqq)) { -+ bfqq->bfqd->wr_busy_queues--; -+ BUG_ON(bfqq->bfqd->wr_busy_queues < 0); -+ } -+ bfqq->wr_coeff = 1; -+ bfqq->wr_cur_max_time = 0; -+ bfqq->last_wr_start_finish = jiffies; -+ /* -+ * Trigger a weight change on the next invocation of -+ * __bfq_entity_update_weight_prio. -+ */ -+ bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "wrais ending at %lu, rais_max_time %u", -+ bfqq->last_wr_start_finish, -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "wr_busy %d", -+ bfqq->bfqd->wr_busy_queues); -+} -+ -+static void bfq_end_wr_async_queues(struct bfq_data *bfqd, -+ struct bfq_group *bfqg) -+{ -+ int i, j; -+ -+ for (i = 0; i < 2; i++) -+ for (j = 0; j < IOPRIO_BE_NR; j++) -+ if (bfqg->async_bfqq[i][j]) -+ bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); -+ if (bfqg->async_idle_bfqq) -+ bfq_bfqq_end_wr(bfqg->async_idle_bfqq); -+} -+ -+static void bfq_end_wr(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq; -+ -+ spin_lock_irq(&bfqd->lock); -+ -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) -+ bfq_bfqq_end_wr(bfqq); -+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) -+ bfq_bfqq_end_wr(bfqq); -+ bfq_end_wr_async(bfqd); -+ -+ spin_unlock_irq(&bfqd->lock); -+} -+ -+static sector_t bfq_io_struct_pos(void *io_struct, bool request) -+{ -+ if (request) -+ return blk_rq_pos(io_struct); -+ else -+ return ((struct bio *)io_struct)->bi_iter.bi_sector; -+} -+ -+static int 
bfq_rq_close_to_sector(void *io_struct, bool request, -+ sector_t sector) -+{ -+ return abs(bfq_io_struct_pos(io_struct, request) - sector) <= -+ BFQQ_CLOSE_THR; -+} -+ -+static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ sector_t sector) -+{ -+ struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; -+ struct rb_node *parent, *node; -+ struct bfq_queue *__bfqq; -+ -+ if (RB_EMPTY_ROOT(root)) -+ return NULL; -+ -+ /* -+ * First, if we find a request starting at the end of the last -+ * request, choose it. -+ */ -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL); -+ if (__bfqq) -+ return __bfqq; -+ -+ /* -+ * If the exact sector wasn't found, the parent of the NULL leaf -+ * will contain the closest sector (rq_pos_tree sorted by -+ * next_request position). -+ */ -+ __bfqq = rb_entry(parent, struct bfq_queue, pos_node); -+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) -+ return __bfqq; -+ -+ if (blk_rq_pos(__bfqq->next_rq) < sector) -+ node = rb_next(&__bfqq->pos_node); -+ else -+ node = rb_prev(&__bfqq->pos_node); -+ if (!node) -+ return NULL; -+ -+ __bfqq = rb_entry(node, struct bfq_queue, pos_node); -+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) -+ return __bfqq; -+ -+ return NULL; -+} -+ -+static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd, -+ struct bfq_queue *cur_bfqq, -+ sector_t sector) -+{ -+ struct bfq_queue *bfqq; -+ -+ /* -+ * We shall notice if some of the queues are cooperating, -+ * e.g., working closely on the same area of the device. In -+ * that case, we can group them together and: 1) don't waste -+ * time idling, and 2) serve the union of their requests in -+ * the best possible order for throughput. 
-+ */ -+ bfqq = bfqq_find_close(bfqd, cur_bfqq, sector); -+ if (!bfqq || bfqq == cur_bfqq) -+ return NULL; -+ -+ return bfqq; -+} -+ -+static struct bfq_queue * -+bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) -+{ -+ int process_refs, new_process_refs; -+ struct bfq_queue *__bfqq; -+ -+ /* -+ * If there are no process references on the new_bfqq, then it is -+ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain -+ * may have dropped their last reference (not just their last process -+ * reference). -+ */ -+ if (!bfqq_process_refs(new_bfqq)) -+ return NULL; -+ -+ /* Avoid a circular list and skip interim queue merges. */ -+ while ((__bfqq = new_bfqq->new_bfqq)) { -+ if (__bfqq == bfqq) -+ return NULL; -+ new_bfqq = __bfqq; -+ } -+ -+ process_refs = bfqq_process_refs(bfqq); -+ new_process_refs = bfqq_process_refs(new_bfqq); -+ /* -+ * If the process for the bfqq has gone away, there is no -+ * sense in merging the queues. -+ */ -+ if (process_refs == 0 || new_process_refs == 0) -+ return NULL; -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", -+ new_bfqq->pid); -+ -+ /* -+ * Merging is just a redirection: the requests of the process -+ * owning one of the two queues are redirected to the other queue. -+ * The latter queue, in its turn, is set as shared if this is the -+ * first time that the requests of some process are redirected to -+ * it. -+ * -+ * We redirect bfqq to new_bfqq and not the opposite, because -+ * we are in the context of the process owning bfqq, thus we -+ * have the io_cq of this process. So we can immediately -+ * configure this io_cq to redirect the requests of the -+ * process to new_bfqq. In contrast, the io_cq of new_bfqq is -+ * not available any more (new_bfqq->bic == NULL). 
-+ * -+ * Anyway, even in case new_bfqq coincides with the in-service -+ * queue, redirecting requests the in-service queue is the -+ * best option, as we feed the in-service queue with new -+ * requests close to the last request served and, by doing so, -+ * are likely to increase the throughput. -+ */ -+ bfqq->new_bfqq = new_bfqq; -+ new_bfqq->ref += process_refs; -+ return new_bfqq; -+} -+ -+static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, -+ struct bfq_queue *new_bfqq) -+{ -+ if (bfq_too_late_for_merging(new_bfqq)) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "too late for bfq%d to be merged", -+ new_bfqq->pid); -+ return false; -+ } -+ -+ if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) || -+ (bfqq->ioprio_class != new_bfqq->ioprio_class)) -+ return false; -+ -+ /* -+ * If either of the queues has already been detected as seeky, -+ * then merging it with the other queue is unlikely to lead to -+ * sequential I/O. -+ */ -+ if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq)) -+ return false; -+ -+ /* -+ * Interleaved I/O is known to be done by (some) applications -+ * only for reads, so it does not make sense to merge async -+ * queues. -+ */ -+ if (!bfq_bfqq_sync(bfqq) || !bfq_bfqq_sync(new_bfqq)) -+ return false; -+ -+ return true; -+} -+ -+/* -+ * Attempt to schedule a merge of bfqq with the currently in-service -+ * queue or with a close queue among the scheduled queues. Return -+ * NULL if no merge was scheduled, a pointer to the shared bfq_queue -+ * structure otherwise. -+ * -+ * The OOM queue is not allowed to participate to cooperation: in fact, since -+ * the requests temporarily redirected to the OOM queue could be redirected -+ * again to dedicated queues at any time, the state needed to correctly -+ * handle merging with the OOM queue would be quite complex and expensive -+ * to maintain. Besides, in such a critical condition as an out of memory, -+ * the benefits of queue merging may be little relevant, or even negligible. 
-+ * -+ * WARNING: queue merging may impair fairness among non-weight raised -+ * queues, for at least two reasons: 1) the original weight of a -+ * merged queue may change during the merged state, 2) even being the -+ * weight the same, a merged queue may be bloated with many more -+ * requests than the ones produced by its originally-associated -+ * process. -+ */ -+static struct bfq_queue * -+bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ void *io_struct, bool request) -+{ -+ struct bfq_queue *in_service_bfqq, *new_bfqq; -+ -+ /* -+ * Prevent bfqq from being merged if it has been created too -+ * long ago. The idea is that true cooperating processes, and -+ * thus their associated bfq_queues, are supposed to be -+ * created shortly after each other. This is the case, e.g., -+ * for KVM/QEMU and dump I/O threads. Basing on this -+ * assumption, the following filtering greatly reduces the -+ * probability that two non-cooperating processes, which just -+ * happen to do close I/O for some short time interval, have -+ * their queues merged by mistake. -+ */ -+ if (bfq_too_late_for_merging(bfqq)) { -+ bfq_log_bfqq(bfqd, bfqq, -+ "would have looked for coop, but too late"); -+ return NULL; -+ } -+ -+ if (bfqq->new_bfqq) -+ return bfqq->new_bfqq; -+ -+ if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) -+ return NULL; -+ -+ /* If there is only one backlogged queue, don't search. 
*/ -+ if (bfq_tot_busy_queues(bfqd) == 1) -+ return NULL; -+ -+ in_service_bfqq = bfqd->in_service_queue; -+ -+ if (in_service_bfqq && in_service_bfqq != bfqq && -+ likely(in_service_bfqq != &bfqd->oom_bfqq) && -+ bfq_rq_close_to_sector(io_struct, request, bfqd->in_serv_last_pos) && -+ bfqq->entity.parent == in_service_bfqq->entity.parent && -+ bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) { -+ new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); -+ if (new_bfqq) -+ return new_bfqq; -+ } -+ /* -+ * Check whether there is a cooperator among currently scheduled -+ * queues. The only thing we need is that the bio/request is not -+ * NULL, as we need it to establish whether a cooperator exists. -+ */ -+ new_bfqq = bfq_find_close_cooperator(bfqd, bfqq, -+ bfq_io_struct_pos(io_struct, request)); -+ -+ BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent); -+ -+ if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) && -+ bfq_may_be_close_cooperator(bfqq, new_bfqq)) -+ return bfq_setup_merge(bfqq, new_bfqq); -+ -+ return NULL; -+} -+ -+static void bfq_bfqq_save_state(struct bfq_queue *bfqq) -+{ -+ struct bfq_io_cq *bic = bfqq->bic; -+ -+ /* -+ * If !bfqq->bic, the queue is already shared or its requests -+ * have already been redirected to a shared queue; both idle window -+ * and weight raising state have already been saved. Do nothing. 
-+ */ -+ if (!bic) -+ return; -+ -+ bic->saved_ttime = bfqq->ttime; -+ bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq); -+ bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); -+ bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); -+ bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); -+ if (unlikely(bfq_bfqq_just_created(bfqq) && -+ !bfq_bfqq_in_large_burst(bfqq) && -+ bfqq->bfqd->low_latency)) { -+ /* -+ * bfqq being merged ritgh after being created: bfqq -+ * would have deserved interactive weight raising, but -+ * did not make it to be set in a weight-raised state, -+ * because of this early merge. Store directly the -+ * weight-raising state that would have been assigned -+ * to bfqq, so that to avoid that bfqq unjustly fails -+ * to enjoy weight raising if split soon. -+ */ -+ bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff; -+ bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd); -+ bic->saved_last_wr_start_finish = jiffies; -+ } else { -+ bic->saved_wr_coeff = bfqq->wr_coeff; -+ bic->saved_wr_start_at_switch_to_srt = -+ bfqq->wr_start_at_switch_to_srt; -+ bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; -+ bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; -+ } -+ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "bic %p wr_coeff %d start_finish %lu max_time %lu", -+ bic, bfqq->wr_coeff, bfqq->last_wr_start_finish, -+ bfqq->wr_cur_max_time); -+} -+ -+static void -+bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, -+ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) -+{ -+ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", -+ (unsigned long) new_bfqq->pid); -+ BUG_ON(bfqq->bic && bfqq->bic == new_bfqq->bic); -+ /* Save weight raising and idle window of the merged queues */ -+ bfq_bfqq_save_state(bfqq); -+ bfq_bfqq_save_state(new_bfqq); -+ -+ if (bfq_bfqq_IO_bound(bfqq)) -+ bfq_mark_bfqq_IO_bound(new_bfqq); -+ bfq_clear_bfqq_IO_bound(bfqq); -+ -+ /* -+ * If 
bfqq is weight-raised, then let new_bfqq inherit -+ * weight-raising. To reduce false positives, neglect the case -+ * where bfqq has just been created, but has not yet made it -+ * to be weight-raised (which may happen because EQM may merge -+ * bfqq even before bfq_add_request is executed for the first -+ * time for bfqq). Handling this case would however be very -+ * easy, thanks to the flag just_created. -+ */ -+ if (new_bfqq->wr_coeff == 1 && bfqq->wr_coeff > 1) { -+ new_bfqq->wr_coeff = bfqq->wr_coeff; -+ new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time; -+ new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish; -+ new_bfqq->wr_start_at_switch_to_srt = -+ bfqq->wr_start_at_switch_to_srt; -+ if (bfq_bfqq_busy(new_bfqq)) { -+ bfqd->wr_busy_queues++; -+ BUG_ON(bfqd->wr_busy_queues > -+ bfq_tot_busy_queues(bfqd)); -+ } -+ -+ new_bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqd, new_bfqq, -+ "wr start after merge with %d, rais_max_time %u", -+ bfqq->pid, -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } -+ -+ if (bfqq->wr_coeff > 1) { /* bfqq has given its wr to new_bfqq */ -+ bfqq->wr_coeff = 1; -+ bfqq->entity.prio_changed = 1; -+ if (bfq_bfqq_busy(bfqq)) { -+ bfqd->wr_busy_queues--; -+ BUG_ON(bfqd->wr_busy_queues < 0); -+ } -+ -+ } -+ -+ bfq_log_bfqq(bfqd, new_bfqq, "wr_busy %d", -+ bfqd->wr_busy_queues); -+ -+ /* -+ * Merge queues (that is, let bic redirect its requests to new_bfqq) -+ */ -+ bic_set_bfqq(bic, new_bfqq, 1); -+ bfq_mark_bfqq_coop(new_bfqq); -+ /* -+ * new_bfqq now belongs to at least two bics (it is a shared queue): -+ * set new_bfqq->bic to NULL. bfqq either: -+ * - does not belong to any bic any more, and hence bfqq->bic must -+ * be set to NULL, or -+ * - is a queue whose owning bics have already been redirected to a -+ * different queue, hence the queue is destined to not belong to -+ * any bic soon and bfqq->bic is already NULL (therefore the next -+ * assignment causes no harm). 
-+ */ -+ new_bfqq->bic = NULL; -+ bfqq->bic = NULL; -+ /* release process reference to bfqq */ -+ bfq_put_queue(bfqq); -+} -+ -+static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, -+ struct bio *bio) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ bool is_sync = op_is_sync(bio->bi_opf); -+ struct bfq_queue *bfqq = bfqd->bio_bfqq, *new_bfqq; -+ -+ assert_spin_locked(&bfqd->lock); -+ /* -+ * Disallow merge of a sync bio into an async request. -+ */ -+ if (is_sync && !rq_is_sync(rq)) -+ return false; -+ -+ /* -+ * Lookup the bfqq that this bio will be queued with. Allow -+ * merge only if rq is queued there. -+ */ -+ BUG_ON(!bfqd->bio_bfqq_set); -+ if (!bfqq) -+ return false; -+ -+ /* -+ * We take advantage of this function to perform an early merge -+ * of the queues of possible cooperating processes. -+ */ -+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); -+ BUG_ON(new_bfqq == bfqq); -+ if (new_bfqq) { -+ /* -+ * bic still points to bfqq, then it has not yet been -+ * redirected to some other bfq_queue, and a queue -+ * merge beween bfqq and new_bfqq can be safely -+ * fulfillled, i.e., bic can be redirected to new_bfqq -+ * and bfqq can be put. -+ */ -+ bfq_merge_bfqqs(bfqd, bfqd->bio_bic, bfqq, -+ new_bfqq); -+ /* -+ * If we get here, bio will be queued into new_queue, -+ * so use new_bfqq to decide whether bio and rq can be -+ * merged. -+ */ -+ bfqq = new_bfqq; -+ -+ /* -+ * Change also bqfd->bio_bfqq, as -+ * bfqd->bio_bic now points to new_bfqq, and -+ * this function may be invoked again (and then may -+ * use again bqfd->bio_bfqq). -+ */ -+ bfqd->bio_bfqq = bfqq; -+ } -+ return bfqq == RQ_BFQQ(rq); -+} -+ -+/* -+ * Set the maximum time for the in-service queue to consume its -+ * budget. This prevents seeky processes from lowering the throughput. -+ * In practice, a time-slice service scheme is used with seeky -+ * processes. 
-+ */ -+static void bfq_set_budget_timeout(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ unsigned int timeout_coeff; -+ -+ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time) -+ timeout_coeff = 1; -+ else -+ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; -+ -+ bfqd->last_budget_start = ktime_get(); -+ -+ bfqq->budget_timeout = jiffies + -+ bfqd->bfq_timeout * timeout_coeff; -+ -+ bfq_log_bfqq(bfqd, bfqq, "%u", -+ jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff)); -+} -+ -+static void __bfq_set_in_service_queue(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ if (bfqq) { -+ bfq_clear_bfqq_fifo_expire(bfqq); -+ -+ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; -+ -+ BUG_ON(bfqq == bfqd->in_service_queue); -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ -+ if (time_is_before_jiffies(bfqq->last_wr_start_finish) && -+ bfqq->wr_coeff > 1 && -+ bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && -+ time_is_before_jiffies(bfqq->budget_timeout)) { -+ /* -+ * For soft real-time queues, move the start -+ * of the weight-raising period forward by the -+ * time the queue has not received any -+ * service. Otherwise, a relatively long -+ * service delay is likely to cause the -+ * weight-raising period of the queue to end, -+ * because of the short duration of the -+ * weight-raising period of a soft real-time -+ * queue. It is worth noting that this move -+ * is not so dangerous for the other queues, -+ * because soft real-time queues are not -+ * greedy. -+ * -+ * To not add a further variable, we use the -+ * overloaded field budget_timeout to -+ * determine for how long the queue has not -+ * received service, i.e., how much time has -+ * elapsed since the queue expired. However, -+ * this is a little imprecise, because -+ * budget_timeout is set to jiffies if bfqq -+ * not only expires, but also remains with no -+ * request. 
-+ */ -+ if (time_after(bfqq->budget_timeout, -+ bfqq->last_wr_start_finish)) -+ bfqq->last_wr_start_finish += -+ jiffies - bfqq->budget_timeout; -+ else -+ bfqq->last_wr_start_finish = jiffies; -+ -+ if (time_is_after_jiffies(bfqq->last_wr_start_finish)) { -+ pr_crit( -+ "BFQ WARNING:last %lu budget %lu jiffies %lu", -+ bfqq->last_wr_start_finish, -+ bfqq->budget_timeout, -+ jiffies); -+ pr_crit("diff %lu", jiffies - -+ max_t(unsigned long, -+ bfqq->last_wr_start_finish, -+ bfqq->budget_timeout)); -+ bfqq->last_wr_start_finish = jiffies; -+ } -+ } -+ -+ bfq_set_budget_timeout(bfqd, bfqq); -+ bfq_log_bfqq(bfqd, bfqq, -+ "cur-budget = %d prio_class %d", -+ bfqq->entity.budget, bfqq->ioprio_class); -+ } else -+ bfq_log(bfqd, "NULL"); -+ -+ bfqd->in_service_queue = bfqq; -+} -+ -+/* -+ * Get and set a new queue for service. -+ */ -+static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd); -+ -+ __bfq_set_in_service_queue(bfqd, bfqq); -+ return bfqq; -+} -+ -+static void bfq_arm_slice_timer(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq = bfqd->in_service_queue; -+ u32 sl; -+ -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); -+ -+ bfq_mark_bfqq_wait_request(bfqq); -+ -+ /* -+ * We don't want to idle for seeks, but we do want to allow -+ * fair distribution of slice time for a process doing back-to-back -+ * seeks. So allow a little bit of time for him to submit a new rq. -+ * -+ * To prevent processes with (partly) seeky workloads from -+ * being too ill-treated, grant them a small fraction of the -+ * assigned budget before reducing the waiting time to -+ * BFQ_MIN_TT. This happened to help reduce latency. -+ */ -+ sl = bfqd->bfq_slice_idle; -+ /* -+ * Unless the queue is being weight-raised or the scenario is -+ * asymmetric, grant only minimum idle time if the queue -+ * is seeky. 
A long idling is preserved for a weight-raised -+ * queue, or, more in general, in an asymemtric scenario, -+ * because a long idling is needed for guaranteeing to a queue -+ * its reserved share of the throughput (in particular, it is -+ * needed if the queue has a higher weight than some other -+ * queue). -+ */ -+ if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 && -+ bfq_symmetric_scenario(bfqd)) -+ sl = min_t(u32, sl, BFQ_MIN_TT); -+ -+ bfqd->last_idling_start = ktime_get(); -+ hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl), -+ HRTIMER_MODE_REL); -+ bfqg_stats_set_start_idle_time(bfqq_group(bfqq)); -+ bfq_log(bfqd, "arm idle: %ld/%ld ms", -+ sl / NSEC_PER_MSEC, bfqd->bfq_slice_idle / NSEC_PER_MSEC); -+} -+ -+/* -+ * In autotuning mode, max_budget is dynamically recomputed as the -+ * amount of sectors transferred in timeout at the estimated peak -+ * rate. This enables BFQ to utilize a full timeslice with a full -+ * budget, even if the in-service queue is served at peak rate. And -+ * this maximises throughput with sequential workloads. -+ */ -+static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd) -+{ -+ return (u64)bfqd->peak_rate * USEC_PER_MSEC * -+ jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT; -+} -+ -+/* -+ * Update parameters related to throughput and responsiveness, as a -+ * function of the estimated peak rate. See comments on -+ * bfq_calc_max_budget(), and on the ref_wr_duration array. 
-+ */ -+static void update_thr_responsiveness_params(struct bfq_data *bfqd) -+{ -+ if (bfqd->bfq_user_max_budget == 0) { -+ bfqd->bfq_max_budget = -+ bfq_calc_max_budget(bfqd); -+ BUG_ON(bfqd->bfq_max_budget < 0); -+ bfq_log(bfqd, "new max_budget = %d", -+ bfqd->bfq_max_budget); -+ } -+} -+ -+static void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq) -+{ -+ if (rq != NULL) { /* new rq dispatch now, reset accordingly */ -+ bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns() ; -+ bfqd->peak_rate_samples = 1; -+ bfqd->sequential_samples = 0; -+ bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size = -+ blk_rq_sectors(rq); -+ } else /* no new rq dispatched, just reset the number of samples */ -+ bfqd->peak_rate_samples = 0; /* full re-init on next disp. */ -+ -+ bfq_log(bfqd, -+ "at end, sample %u/%u tot_sects %llu", -+ bfqd->peak_rate_samples, bfqd->sequential_samples, -+ bfqd->tot_sectors_dispatched); -+} -+ -+static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) -+{ -+ u32 rate, weight, divisor; -+ -+ /* -+ * For the convergence property to hold (see comments on -+ * bfq_update_peak_rate()) and for the assessment to be -+ * reliable, a minimum number of samples must be present, and -+ * a minimum amount of time must have elapsed. If not so, do -+ * not compute new rate. Just reset parameters, to get ready -+ * for a new evaluation attempt. -+ */ -+ if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES || -+ bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL) { -+ bfq_log(bfqd, -+ "only resetting, delta_first %lluus samples %d", -+ bfqd->delta_from_first>>10, bfqd->peak_rate_samples); -+ goto reset_computation; -+ } -+ -+ /* -+ * If a new request completion has occurred after last -+ * dispatch, then, to approximate the rate at which requests -+ * have been served by the device, it is more precise to -+ * extend the observation interval to the last completion. 
-+ */ -+ bfqd->delta_from_first = -+ max_t(u64, bfqd->delta_from_first, -+ bfqd->last_completion - bfqd->first_dispatch); -+ -+ BUG_ON(bfqd->delta_from_first == 0); -+ /* -+ * Rate computed in sects/usec, and not sects/nsec, for -+ * precision issues. -+ */ -+ rate = div64_ul(bfqd->tot_sectors_dispatched<<BFQ_RATE_SHIFT, -+ div_u64(bfqd->delta_from_first, NSEC_PER_USEC)); -+ -+ bfq_log(bfqd, -+"tot_sects %llu delta_first %lluus rate %llu sects/s (%d)", -+ bfqd->tot_sectors_dispatched, bfqd->delta_from_first>>10, -+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), -+ rate > 20<<BFQ_RATE_SHIFT); -+ -+ /* -+ * Peak rate not updated if: -+ * - the percentage of sequential dispatches is below 3/4 of the -+ * total, and rate is below the current estimated peak rate -+ * - rate is unreasonably high (> 20M sectors/sec) -+ */ -+ if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 && -+ rate <= bfqd->peak_rate) || -+ rate > 20<<BFQ_RATE_SHIFT) { -+ bfq_log(bfqd, -+ "goto reset, samples %u/%u rate/peak %llu/%llu", -+ bfqd->peak_rate_samples, bfqd->sequential_samples, -+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), -+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); -+ goto reset_computation; -+ } else { -+ bfq_log(bfqd, -+ "do update, samples %u/%u rate/peak %llu/%llu", -+ bfqd->peak_rate_samples, bfqd->sequential_samples, -+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), -+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); -+ } -+ -+ /* -+ * We have to update the peak rate, at last! To this purpose, -+ * we use a low-pass filter. We compute the smoothing constant -+ * of the filter as a function of the 'weight' of the new -+ * measured rate. -+ * -+ * As can be seen in next formulas, we define this weight as a -+ * quantity proportional to how sequential the workload is, -+ * and to how long the observation time interval is. -+ * -+ * The weight runs from 0 to 8. 
The maximum value of the -+ * weight, 8, yields the minimum value for the smoothing -+ * constant. At this minimum value for the smoothing constant, -+ * the measured rate contributes for half of the next value of -+ * the estimated peak rate. -+ * -+ * So, the first step is to compute the weight as a function -+ * of how sequential the workload is. Note that the weight -+ * cannot reach 9, because bfqd->sequential_samples cannot -+ * become equal to bfqd->peak_rate_samples, which, in its -+ * turn, holds true because bfqd->sequential_samples is not -+ * incremented for the first sample. -+ */ -+ weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples; -+ -+ /* -+ * Second step: further refine the weight as a function of the -+ * duration of the observation interval. -+ */ -+ weight = min_t(u32, 8, -+ div_u64(weight * bfqd->delta_from_first, -+ BFQ_RATE_REF_INTERVAL)); -+ -+ /* -+ * Divisor ranging from 10, for minimum weight, to 2, for -+ * maximum weight. -+ */ -+ divisor = 10 - weight; -+ BUG_ON(divisor == 0); -+ -+ /* -+ * Finally, update peak rate: -+ * -+ * peak_rate = peak_rate * (divisor-1) / divisor + rate / divisor -+ */ -+ bfqd->peak_rate *= divisor-1; -+ bfqd->peak_rate /= divisor; -+ rate /= divisor; /* smoothing constant alpha = 1/divisor */ -+ -+ bfq_log(bfqd, -+ "divisor %d tmp_peak_rate %llu tmp_rate %u", -+ divisor, -+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT), -+ (u32)((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT)); -+ -+ BUG_ON(bfqd->peak_rate == 0); -+ BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT); -+ -+ bfqd->peak_rate += rate; -+ -+ /* -+ * For a very slow device, bfqd->peak_rate can reach 0 (see -+ * the minimum representable values reported in the comments -+ * on BFQ_RATE_SHIFT). Push to 1 if this happens, to avoid -+ * divisions by zero where bfqd->peak_rate is used as a -+ * divisor. 
-+ */ -+ bfqd->peak_rate = max_t(u32, 1, bfqd->peak_rate); -+ -+ update_thr_responsiveness_params(bfqd); -+ BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT); -+ -+reset_computation: -+ bfq_reset_rate_computation(bfqd, rq); -+} -+ -+/* -+ * Update the read/write peak rate (the main quantity used for -+ * auto-tuning, see update_thr_responsiveness_params()). -+ * -+ * It is not trivial to estimate the peak rate (correctly): because of -+ * the presence of sw and hw queues between the scheduler and the -+ * device components that finally serve I/O requests, it is hard to -+ * say exactly when a given dispatched request is served inside the -+ * device, and for how long. As a consequence, it is hard to know -+ * precisely at what rate a given set of requests is actually served -+ * by the device. -+ * -+ * On the opposite end, the dispatch time of any request is trivially -+ * available, and, from this piece of information, the "dispatch rate" -+ * of requests can be immediately computed. So, the idea in the next -+ * function is to use what is known, namely request dispatch times -+ * (plus, when useful, request completion times), to estimate what is -+ * unknown, namely in-device request service rate. -+ * -+ * The main issue is that, because of the above facts, the rate at -+ * which a certain set of requests is dispatched over a certain time -+ * interval can vary greatly with respect to the rate at which the -+ * same requests are then served. But, since the size of any -+ * intermediate queue is limited, and the service scheme is lossless -+ * (no request is silently dropped), the following obvious convergence -+ * property holds: the number of requests dispatched MUST become -+ * closer and closer to the number of requests completed as the -+ * observation interval grows. This is the key property used in -+ * the next function to estimate the peak service rate as a function -+ * of the observed dispatch rate. 
The function assumes to be invoked -+ * on every request dispatch. -+ */ -+static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) -+{ -+ u64 now_ns = ktime_get_ns(); -+ -+ if (bfqd->peak_rate_samples == 0) { /* first dispatch */ -+ bfq_log(bfqd, -+ "goto reset, samples %d", -+ bfqd->peak_rate_samples) ; -+ bfq_reset_rate_computation(bfqd, rq); -+ goto update_last_values; /* will add one sample */ -+ } -+ -+ /* -+ * Device idle for very long: the observation interval lasting -+ * up to this dispatch cannot be a valid observation interval -+ * for computing a new peak rate (similarly to the late- -+ * completion event in bfq_completed_request()). Go to -+ * update_rate_and_reset to have the following three steps -+ * taken: -+ * - close the observation interval at the last (previous) -+ * request dispatch or completion -+ * - compute rate, if possible, for that observation interval -+ * - start a new observation interval with this dispatch -+ */ -+ if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC && -+ bfqd->rq_in_driver == 0) { -+ bfq_log(bfqd, -+"jumping to updating&resetting delta_last %lluus samples %d", -+ (now_ns - bfqd->last_dispatch)>>10, -+ bfqd->peak_rate_samples) ; -+ goto update_rate_and_reset; -+ } -+ -+ /* Update sampling information */ -+ bfqd->peak_rate_samples++; -+ -+ if ((bfqd->rq_in_driver > 0 || -+ now_ns - bfqd->last_completion < BFQ_MIN_TT) -+ && !BFQ_RQ_SEEKY(bfqd, bfqd->last_position, rq)) -+ bfqd->sequential_samples++; -+ -+ bfqd->tot_sectors_dispatched += blk_rq_sectors(rq); -+ -+ /* Reset max observed rq size every 32 dispatches */ -+ if (likely(bfqd->peak_rate_samples % 32)) -+ bfqd->last_rq_max_size = -+ max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size); -+ else -+ bfqd->last_rq_max_size = blk_rq_sectors(rq); -+ -+ bfqd->delta_from_first = now_ns - bfqd->first_dispatch; -+ -+ bfq_log(bfqd, -+ "added samples %u/%u tot_sects %llu delta_first %lluus", -+ bfqd->peak_rate_samples, bfqd->sequential_samples, -+ 
bfqd->tot_sectors_dispatched, -+ bfqd->delta_from_first>>10); -+ -+ /* Target observation interval not yet reached, go on sampling */ -+ if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL) -+ goto update_last_values; -+ -+update_rate_and_reset: -+ bfq_update_rate_reset(bfqd, rq); -+update_last_values: -+ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); -+ if (RQ_BFQQ(rq) == bfqd->in_service_queue) -+ bfqd->in_serv_last_pos = bfqd->last_position; -+ bfqd->last_dispatch = now_ns; -+ -+ bfq_log(bfqd, -+ "delta_first %lluus last_pos %llu peak_rate %llu", -+ (now_ns - bfqd->first_dispatch)>>10, -+ (unsigned long long) bfqd->last_position, -+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); -+ bfq_log(bfqd, -+ "samples at end %d", bfqd->peak_rate_samples); -+} -+ -+/* -+ * Remove request from internal lists. -+ */ -+static void bfq_dispatch_remove(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ -+ /* -+ * For consistency, the next instruction should have been -+ * executed after removing the request from the queue and -+ * dispatching it. We execute instead this instruction before -+ * bfq_remove_request() (and hence introduce a temporary -+ * inconsistency), for efficiency. In fact, should this -+ * dispatch occur for a non in-service bfqq, this anticipated -+ * increment prevents two counters related to bfqq->dispatched -+ * from risking to be, first, uselessly decremented, and then -+ * incremented again when the (new) value of bfqq->dispatched -+ * happens to be taken into account. -+ */ -+ bfqq->dispatched++; -+ bfq_update_peak_rate(q->elevator->elevator_data, rq); -+ -+ bfq_remove_request(q, rq); -+} -+ -+static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ BUG_ON(bfqq != bfqd->in_service_queue); -+ -+ /* -+ * If this bfqq is shared between multiple processes, check -+ * to make sure that those processes are still issuing I/Os -+ * within the mean seek distance. 
If not, it may be time to -+ * break the queues apart again. -+ */ -+ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq)) -+ bfq_mark_bfqq_split_coop(bfqq); -+ -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { -+ if (bfqq->dispatched == 0) -+ /* -+ * Overloading budget_timeout field to store -+ * the time at which the queue remains with no -+ * backlog and no outstanding request; used by -+ * the weight-raising mechanism. -+ */ -+ bfqq->budget_timeout = jiffies; -+ -+ bfq_del_bfqq_busy(bfqd, bfqq, true); -+ } else { -+ bfq_requeue_bfqq(bfqd, bfqq, true); -+ /* -+ * Resort priority tree of potential close cooperators. -+ */ -+ bfq_pos_tree_add_move(bfqd, bfqq); -+ } -+ -+ /* -+ * All in-service entities must have been properly deactivated -+ * or requeued before executing the next function, which -+ * resets all in-service entites as no more in service. -+ */ -+ __bfq_bfqd_reset_in_service(bfqd); -+} -+ -+/** -+ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior. -+ * @bfqd: device data. -+ * @bfqq: queue to update. -+ * @reason: reason for expiration. -+ * -+ * Handle the feedback on @bfqq budget at queue expiration. -+ * See the body for detailed comments. -+ */ -+static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ enum bfqq_expiration reason) -+{ -+ struct request *next_rq; -+ int budget, min_budget; -+ -+ BUG_ON(bfqq != bfqd->in_service_queue); -+ -+ min_budget = bfq_min_budget(bfqd); -+ -+ if (bfqq->wr_coeff == 1) -+ budget = bfqq->max_budget; -+ else /* -+ * Use a constant, low budget for weight-raised queues, -+ * to help achieve a low latency. Keep it slightly higher -+ * than the minimum possible budget, to cause a little -+ * bit fewer expirations. 
-+ */ -+ budget = 2 * min_budget; -+ -+ bfq_log_bfqq(bfqd, bfqq, "last budg %d, budg left %d", -+ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); -+ bfq_log_bfqq(bfqd, bfqq, "last max_budg %d, min budg %d", -+ budget, bfq_min_budget(bfqd)); -+ bfq_log_bfqq(bfqd, bfqq, "sync %d, seeky %d", -+ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); -+ -+ if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) { -+ switch (reason) { -+ /* -+ * Caveat: in all the following cases we trade latency -+ * for throughput. -+ */ -+ case BFQ_BFQQ_TOO_IDLE: -+ /* -+ * This is the only case where we may reduce -+ * the budget: if there is no request of the -+ * process still waiting for completion, then -+ * we assume (tentatively) that the timer has -+ * expired because the batch of requests of -+ * the process could have been served with a -+ * smaller budget. Hence, betting that -+ * process will behave in the same way when it -+ * becomes backlogged again, we reduce its -+ * next budget. As long as we guess right, -+ * this budget cut reduces the latency -+ * experienced by the process. -+ * -+ * However, if there are still outstanding -+ * requests, then the process may have not yet -+ * issued its next request just because it is -+ * still waiting for the completion of some of -+ * the still outstanding ones. So in this -+ * subcase we do not reduce its budget, on the -+ * contrary we increase it to possibly boost -+ * the throughput, as discussed in the -+ * comments to the BUDGET_TIMEOUT case. -+ */ -+ if (bfqq->dispatched > 0) /* still outstanding reqs */ -+ budget = min(budget * 2, bfqd->bfq_max_budget); -+ else { -+ if (budget > 5 * min_budget) -+ budget -= 4 * min_budget; -+ else -+ budget = min_budget; -+ } -+ break; -+ case BFQ_BFQQ_BUDGET_TIMEOUT: -+ /* -+ * We double the budget here because it gives -+ * the chance to boost the throughput if this -+ * is not a seeky process (and has bumped into -+ * this timeout because of, e.g., ZBR). 
-+ */ -+ budget = min(budget * 2, bfqd->bfq_max_budget); -+ break; -+ case BFQ_BFQQ_BUDGET_EXHAUSTED: -+ /* -+ * The process still has backlog, and did not -+ * let either the budget timeout or the disk -+ * idling timeout expire. Hence it is not -+ * seeky, has a short thinktime and may be -+ * happy with a higher budget too. So -+ * definitely increase the budget of this good -+ * candidate to boost the disk throughput. -+ */ -+ budget = min(budget * 4, bfqd->bfq_max_budget); -+ break; -+ case BFQ_BFQQ_NO_MORE_REQUESTS: -+ /* -+ * For queues that expire for this reason, it -+ * is particularly important to keep the -+ * budget close to the actual service they -+ * need. Doing so reduces the timestamp -+ * misalignment problem described in the -+ * comments in the body of -+ * __bfq_activate_entity. In fact, suppose -+ * that a queue systematically expires for -+ * BFQ_BFQQ_NO_MORE_REQUESTS and presents a -+ * new request in time to enjoy timestamp -+ * back-shifting. The larger the budget of the -+ * queue is with respect to the service the -+ * queue actually requests in each service -+ * slot, the more times the queue can be -+ * reactivated with the same virtual finish -+ * time. It follows that, even if this finish -+ * time is pushed to the system virtual time -+ * to reduce the consequent timestamp -+ * misalignment, the queue unjustly enjoys for -+ * many re-activations a lower finish time -+ * than all newly activated queues. -+ * -+ * The service needed by bfqq is measured -+ * quite precisely by bfqq->entity.service. -+ * Since bfqq does not enjoy device idling, -+ * bfqq->entity.service is equal to the number -+ * of sectors that the process associated with -+ * bfqq requested to read/write before waiting -+ * for request completions, or blocking for -+ * other reasons. 
-+ */ -+ budget = max_t(int, bfqq->entity.service, min_budget); -+ break; -+ default: -+ return; -+ } -+ } else if (!bfq_bfqq_sync(bfqq)) -+ /* -+ * Async queues get always the maximum possible -+ * budget, as for them we do not care about latency -+ * (in addition, their ability to dispatch is limited -+ * by the charging factor). -+ */ -+ budget = bfqd->bfq_max_budget; -+ -+ bfqq->max_budget = budget; -+ -+ if (bfqd->budgets_assigned >= bfq_stats_min_budgets && -+ !bfqd->bfq_user_max_budget) -+ bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget); -+ -+ /* -+ * If there is still backlog, then assign a new budget, making -+ * sure that it is large enough for the next request. Since -+ * the finish time of bfqq must be kept in sync with the -+ * budget, be sure to call __bfq_bfqq_expire() *after* this -+ * update. -+ * -+ * If there is no backlog, then no need to update the budget; -+ * it will be updated on the arrival of a new request. -+ */ -+ next_rq = bfqq->next_rq; -+ if (next_rq) { -+ BUG_ON(reason == BFQ_BFQQ_TOO_IDLE || -+ reason == BFQ_BFQQ_NO_MORE_REQUESTS); -+ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, -+ bfq_serv_to_charge(next_rq, bfqq)); -+ BUG_ON(!bfq_bfqq_busy(bfqq)); -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ } -+ -+ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d", -+ next_rq ? blk_rq_sectors(next_rq) : 0, -+ bfqq->entity.budget); -+} -+ -+/* -+ * Return true if the process associated with bfqq is "slow". The slow -+ * flag is used, in addition to the budget timeout, to reduce the -+ * amount of service provided to seeky processes, and thus reduce -+ * their chances to lower the throughput. More details in the comments -+ * on the function bfq_bfqq_expire(). 
-+ * -+ * An important observation is in order: as discussed in the comments -+ * on the function bfq_update_peak_rate(), with devices with internal -+ * queues, it is hard if ever possible to know when and for how long -+ * an I/O request is processed by the device (apart from the trivial -+ * I/O pattern where a new request is dispatched only after the -+ * previous one has been completed). This makes it hard to evaluate -+ * the real rate at which the I/O requests of each bfq_queue are -+ * served. In fact, for an I/O scheduler like BFQ, serving a -+ * bfq_queue means just dispatching its requests during its service -+ * slot (i.e., until the budget of the queue is exhausted, or the -+ * queue remains idle, or, finally, a timeout fires). But, during the -+ * service slot of a bfq_queue, around 100 ms at most, the device may -+ * be even still processing requests of bfq_queues served in previous -+ * service slots. On the opposite end, the requests of the in-service -+ * bfq_queue may be completed after the service slot of the queue -+ * finishes. -+ * -+ * Anyway, unless more sophisticated solutions are used -+ * (where possible), the sum of the sizes of the requests dispatched -+ * during the service slot of a bfq_queue is probably the only -+ * approximation available for the service received by the bfq_queue -+ * during its service slot. And this sum is the quantity used in this -+ * function to evaluate the I/O speed of a process. 
-+ */ -+static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ bool compensate, enum bfqq_expiration reason, -+ unsigned long *delta_ms) -+{ -+ ktime_t delta_ktime; -+ u32 delta_usecs; -+ bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */ -+ -+ if (!bfq_bfqq_sync(bfqq)) -+ return false; -+ -+ if (compensate) -+ delta_ktime = bfqd->last_idling_start; -+ else -+ delta_ktime = ktime_get(); -+ delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start); -+ delta_usecs = ktime_to_us(delta_ktime); -+ -+ /* don't use too short time intervals */ -+ if (delta_usecs < 1000) { -+ if (blk_queue_nonrot(bfqd->queue)) -+ /* -+ * give same worst-case guarantees as idling -+ * for seeky -+ */ -+ *delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC; -+ else /* charge at least one seek */ -+ *delta_ms = bfq_slice_idle / NSEC_PER_MSEC; -+ -+ bfq_log(bfqd, "too short %u", delta_usecs); -+ -+ return slow; -+ } -+ -+ *delta_ms = delta_usecs / USEC_PER_MSEC; -+ -+ /* -+ * Use only long (> 20ms) intervals to filter out excessive -+ * spikes in service rate estimation. -+ */ -+ if (delta_usecs > 20000) { -+ /* -+ * Caveat for rotational devices: processes doing I/O -+ * in the slower disk zones tend to be slow(er) even -+ * if not seeky. In this respect, the estimated peak -+ * rate is likely to be an average over the disk -+ * surface. Accordingly, to not be too harsh with -+ * unlucky processes, a process is deemed slow only if -+ * its rate has been lower than half of the estimated -+ * peak rate. -+ */ -+ slow = bfqq->entity.service < bfqd->bfq_max_budget / 2; -+ bfq_log(bfqd, "relative rate %d/%d", -+ bfqq->entity.service, bfqd->bfq_max_budget); -+ } -+ -+ bfq_log_bfqq(bfqd, bfqq, "slow %d", slow); -+ -+ return slow; -+} -+ -+/* -+ * To be deemed as soft real-time, an application must meet two -+ * requirements. 
First, the application must not require an average -+ * bandwidth higher than the approximate bandwidth required to playback or -+ * record a compressed high-definition video. -+ * The next function is invoked on the completion of the last request of a -+ * batch, to compute the next-start time instant, soft_rt_next_start, such -+ * that, if the next request of the application does not arrive before -+ * soft_rt_next_start, then the above requirement on the bandwidth is met. -+ * -+ * The second requirement is that the request pattern of the application is -+ * isochronous, i.e., that, after issuing a request or a batch of requests, -+ * the application stops issuing new requests until all its pending requests -+ * have been completed. After that, the application may issue a new batch, -+ * and so on. -+ * For this reason the next function is invoked to compute -+ * soft_rt_next_start only for applications that meet this requirement, -+ * whereas soft_rt_next_start is set to infinity for applications that do -+ * not. -+ * -+ * Unfortunately, even a greedy (i.e., I/O-bound) application may -+ * happen to meet, occasionally or systematically, both the above -+ * bandwidth and isochrony requirements. This may happen at least in -+ * the following circumstances. First, if the CPU load is high. The -+ * application may stop issuing requests while the CPUs are busy -+ * serving other processes, then restart, then stop again for a while, -+ * and so on. The other circumstances are related to the storage -+ * device: the storage device is highly loaded or reaches a low-enough -+ * throughput with the I/O of the application (e.g., because the I/O -+ * is random and/or the device is slow). In all these cases, the -+ * I/O of the application may be simply slowed down enough to meet -+ * the bandwidth and isochrony requirements. 
To reduce the probability -+ * that greedy applications are deemed as soft real-time in these -+ * corner cases, a further rule is used in the computation of -+ * soft_rt_next_start: the return value of this function is forced to -+ * be higher than the maximum between the following two quantities. -+ * -+ * (a) Current time plus: (1) the maximum time for which the arrival -+ * of a request is waited for when a sync queue becomes idle, -+ * namely bfqd->bfq_slice_idle, and (2) a few extra jiffies. We -+ * postpone for a moment the reason for adding a few extra -+ * jiffies; we get back to it after next item (b). Lower-bounding -+ * the return value of this function with the current time plus -+ * bfqd->bfq_slice_idle tends to filter out greedy applications, -+ * because the latter issue their next request as soon as possible -+ * after the last one has been completed. In contrast, a soft -+ * real-time application spends some time processing data, after a -+ * batch of its requests has been completed. -+ * -+ * (b) Current value of bfqq->soft_rt_next_start. As pointed out -+ * above, greedy applications may happen to meet both the -+ * bandwidth and isochrony requirements under heavy CPU or -+ * storage-device load. In more detail, in these scenarios, these -+ * applications happen, only for limited time periods, to do I/O -+ * slowly enough to meet all the requirements described so far, -+ * including the filtering in above item (a). These slow-speed -+ * time intervals are usually interspersed between other time -+ * intervals during which these applications do I/O at a very high -+ * speed. Fortunately, exactly because of the high speed of the -+ * I/O in the high-speed intervals, the values returned by this -+ * function happen to be so high, near the end of any such -+ * high-speed interval, to be likely to fall *after* the end of -+ * the low-speed time interval that follows. 
These high values are -+ * stored in bfqq->soft_rt_next_start after each invocation of -+ * this function. As a consequence, if the last value of -+ * bfqq->soft_rt_next_start is constantly used to lower-bound the -+ * next value that this function may return, then, from the very -+ * beginning of a low-speed interval, bfqq->soft_rt_next_start is -+ * likely to be constantly kept so high that any I/O request -+ * issued during the low-speed interval is considered as arriving -+ * too soon for the application to be deemed as soft -+ * real-time. Then, in the high-speed interval that follows, the -+ * application will not be deemed as soft real-time, just because -+ * it will do I/O at a high speed. And so on. -+ * -+ * Getting back to the filtering in item (a), in the following two -+ * cases this filtering might be easily passed by a greedy -+ * application, if the reference quantity was just -+ * bfqd->bfq_slice_idle: -+ * 1) HZ is so low that the duration of a jiffy is comparable to or -+ * higher than bfqd->bfq_slice_idle. This happens, e.g., on slow -+ * devices with HZ=100. The time granularity may be so coarse -+ * that the approximation, in jiffies, of bfqd->bfq_slice_idle -+ * is rather lower than the exact value. -+ * 2) jiffies, instead of increasing at a constant rate, may stop increasing -+ * for a while, then suddenly 'jump' by several units to recover the lost -+ * increments. This seems to happen, e.g., inside virtual machines. -+ * To address this issue, in the filtering in (a) we do not use as a -+ * reference time interval just bfqd->bfq_slice_idle, but -+ * bfqd->bfq_slice_idle plus a few jiffies. In particular, we add the -+ * minimum number of jiffies for which the filter seems to be quite -+ * precise also in embedded systems and KVM/QEMU virtual machines.
-+ */ -+static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ bfq_log_bfqq(bfqd, bfqq, -+"service_blkg %lu soft_rate %u sects/sec interval %u", -+ bfqq->service_from_backlogged, -+ bfqd->bfq_wr_max_softrt_rate, -+ jiffies_to_msecs(HZ * bfqq->service_from_backlogged / -+ bfqd->bfq_wr_max_softrt_rate)); -+ -+ return max3(bfqq->soft_rt_next_start, -+ bfqq->last_idle_bklogged + -+ HZ * bfqq->service_from_backlogged / -+ bfqd->bfq_wr_max_softrt_rate, -+ jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); -+} -+ -+static bool bfq_bfqq_injectable(struct bfq_queue *bfqq) -+{ -+ return BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 && -+ blk_queue_nonrot(bfqq->bfqd->queue) && -+ bfqq->bfqd->hw_tag; -+} -+ -+/** -+ * bfq_bfqq_expire - expire a queue. -+ * @bfqd: device owning the queue. -+ * @bfqq: the queue to expire. -+ * @compensate: if true, compensate for the time spent idling. -+ * @reason: the reason causing the expiration. -+ * -+ * If the process associated with bfqq does slow I/O (e.g., because it -+ * issues random requests), we charge bfqq with the time it has been -+ * in service instead of the service it has received (see -+ * bfq_bfqq_charge_time for details on how this goal is achieved). As -+ * a consequence, bfqq will typically get higher timestamps upon -+ * reactivation, and hence it will be rescheduled as if it had -+ * received more service than what it has actually received. In the -+ * end, bfqq receives less service in proportion to how slowly its -+ * associated process consumes its budgets (and hence how seriously it -+ * tends to lower the throughput). In addition, this time-charging -+ * strategy guarantees time fairness among slow processes. In -+ * contrast, if the process associated with bfqq is not slow, we -+ * charge bfqq exactly with the service it has received. 
-+ * -+ * Charging time to the first type of queues and the exact service to -+ * the other has the effect of using the WF2Q+ policy to schedule the -+ * former on a timeslice basis, without violating service domain -+ * guarantees among the latter. -+ */ -+static void bfq_bfqq_expire(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ bool compensate, -+ enum bfqq_expiration reason) -+{ -+ bool slow; -+ unsigned long delta = 0; -+ struct bfq_entity *entity = &bfqq->entity; -+ int ref; -+ -+ BUG_ON(bfqq != bfqd->in_service_queue); -+ -+ /* -+ * Check whether the process is slow (see bfq_bfqq_is_slow). -+ */ -+ slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); -+ -+ /* -+ * As above explained, charge slow (typically seeky) and -+ * timed-out queues with the time and not the service -+ * received, to favor sequential workloads. -+ * -+ * Processes doing I/O in the slower disk zones will tend to -+ * be slow(er) even if not seeky. Therefore, since the -+ * estimated peak rate is actually an average over the disk -+ * surface, these processes may timeout just for bad luck. To -+ * avoid punishing them, do not charge time to processes that -+ * succeeded in consuming at least 2/3 of their budget. This -+ * allows BFQ to preserve enough elasticity to still perform -+ * bandwidth, and not time, distribution with little unlucky -+ * or quasi-sequential processes. 
-+ */ -+ if (bfqq->wr_coeff == 1 && -+ (slow || -+ (reason == BFQ_BFQQ_BUDGET_TIMEOUT && -+ bfq_bfqq_budget_left(bfqq) >= entity->budget / 3))) -+ bfq_bfqq_charge_time(bfqd, bfqq, delta); -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ -+ if (reason == BFQ_BFQQ_TOO_IDLE && -+ entity->service <= 2 * entity->budget / 10) -+ bfq_clear_bfqq_IO_bound(bfqq); -+ -+ if (bfqd->low_latency && bfqq->wr_coeff == 1) -+ bfqq->last_wr_start_finish = jiffies; -+ -+ if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 && -+ RB_EMPTY_ROOT(&bfqq->sort_list)) { -+ /* -+ * If we get here, and there are no outstanding -+ * requests, then the request pattern is isochronous -+ * (see the comments on the function -+ * bfq_bfqq_softrt_next_start()). Thus we can compute -+ * soft_rt_next_start. And we do it, unless bfqq is in -+ * interactive weight raising. We do not do it in the -+ * latter subcase, for the following reason. bfqq may -+ * be conveying the I/O needed to load a soft -+ * real-time application. Such an application will -+ * actually exhibit a soft real-time I/O pattern after -+ * it finally starts doing its job. But, if -+ * soft_rt_next_start is computed here for an -+ * interactive bfqq, and bfqq had received a lot of -+ * service before remaining with no outstanding -+ * request (likely to happen on a fast device), then -+ * soft_rt_next_start would be assigned such a high -+ * value that, for a very long time, bfqq would be -+ * prevented from being possibly considered as soft -+ * real time. -+ * -+ * If, instead, the queue still has outstanding -+ * requests, then we have to wait for the completion -+ * of all the outstanding requests to discover whether -+ * the request pattern is actually isochronous. 
-+ */ -+ BUG_ON(bfq_tot_busy_queues(bfqd) < 1); -+ if (bfqq->dispatched == 0 && -+ bfqq->wr_coeff != bfqd->bfq_wr_coeff) { -+ bfqq->soft_rt_next_start = -+ bfq_bfqq_softrt_next_start(bfqd, bfqq); -+ bfq_log_bfqq(bfqd, bfqq, "new soft_rt_next %lu", -+ bfqq->soft_rt_next_start); -+ } else if (bfqq->dispatched > 0) { -+ /* -+ * Schedule an update of soft_rt_next_start to when -+ * the task may be discovered to be isochronous. -+ */ -+ bfq_mark_bfqq_softrt_update(bfqq); -+ } -+ } -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "expire (%s, slow %d, num_disp %d, short %d, weight %d, serv %d/%d)", -+ reason_name[reason], slow, bfqq->dispatched, -+ bfq_bfqq_has_short_ttime(bfqq), entity->weight, -+ entity->service, entity->budget); -+ -+ /* -+ * Increase, decrease or leave budget unchanged according to -+ * reason. -+ */ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); -+ BUG_ON(bfqq->next_rq == NULL && -+ bfqq->entity.budget < bfqq->entity.service); -+ ref = bfqq->ref; -+ __bfq_bfqq_expire(bfqd, bfqq); -+ -+ if (ref == 1) /* bfqq is gone, no more actions on it */ -+ return; -+ -+ BUG_ON(ref > 1 && -+ !bfq_bfqq_busy(bfqq) && reason == BFQ_BFQQ_BUDGET_EXHAUSTED && -+ !bfq_class_idle(bfqq)); -+ -+ bfqq->injected_service = 0; -+ -+ /* mark bfqq as waiting a request only if a bic still points to it */ -+ if (!bfq_bfqq_busy(bfqq) && -+ reason != BFQ_BFQQ_BUDGET_TIMEOUT && -+ reason != BFQ_BFQQ_BUDGET_EXHAUSTED) { -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); -+ BUG_ON(bfqq->next_rq); -+ bfq_mark_bfqq_non_blocking_wait_rq(bfqq); -+ /* -+ * Not setting service to 0, because, if the next rq -+ * arrives in time, the queue will go on receiving -+ * service with this same budget (as if it never expired) -+ */ -+ } else { -+ entity->service = 0; -+ bfq_log_bfqq(bfqd, bfqq, "resetting service"); -+ } -+ -+ /* -+ * Reset the received-service counter for every parent entity. 
-+ * Differently from what happens with bfqq->entity.service, -+ * the resetting of this counter never needs to be postponed -+ * for parent entities. In fact, in case bfqq may have a -+ * chance to go on being served using the last, partially -+ * consumed budget, bfqq->entity.service needs to be kept, -+ * because if bfqq then actually goes on being served using -+ * the same budget, the last value of bfqq->entity.service is -+ * needed to properly decrement bfqq->entity.budget by the -+ * portion already consumed. In contrast, it is not necessary -+ * to keep entity->service for parent entities too, because -+ * the bubble up of the new value of bfqq->entity.budget will -+ * make sure that the budgets of parent entities are correct, -+ * even in case bfqq and thus parent entities go on receiving -+ * service with the same budget. -+ */ -+ entity = entity->parent; -+ for_each_entity(entity) -+ entity->service = 0; -+} -+ -+/* -+ * Budget timeout is not implemented through a dedicated timer, but -+ * just checked on request arrivals and completions, as well as on -+ * idle timer expirations. -+ */ -+static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) -+{ -+ return time_is_before_eq_jiffies(bfqq->budget_timeout); -+} -+ -+/* -+ * If we expire a queue that is actively waiting (i.e., with the -+ * device idled) for the arrival of a new request, then we may incur -+ * the timestamp misalignment problem described in the body of the -+ * function __bfq_activate_entity. Hence we return true only if this -+ * condition does not hold, or if the queue is slow enough to deserve -+ * only to be kicked off for preserving a high throughput. 
-+ */ -+static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) -+{ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "wait_request %d left %d timeout %d", -+ bfq_bfqq_wait_request(bfqq), -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, -+ bfq_bfqq_budget_timeout(bfqq)); -+ -+ return (!bfq_bfqq_wait_request(bfqq) || -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3) -+ && -+ bfq_bfqq_budget_timeout(bfqq); -+} -+ -+static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ bool rot_without_queueing = -+ !blk_queue_nonrot(bfqd->queue) && !bfqd->hw_tag, -+ bfqq_sequential_and_IO_bound, -+ idling_boosts_thr; -+ -+ bfqq_sequential_and_IO_bound = !BFQQ_SEEKY(bfqq) && -+ bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_has_short_ttime(bfqq); -+ /* -+ * The next variable takes into account the cases where idling -+ * boosts the throughput. -+ * -+ * The value of the variable is computed considering, first, that -+ * idling is virtually always beneficial for the throughput if: -+ * (a) the device is not NCQ-capable and rotational, or -+ * (b) regardless of the presence of NCQ, the device is rotational and -+ * the request pattern for bfqq is I/O-bound and sequential, or -+ * (c) regardless of whether it is rotational, the device is -+ * not NCQ-capable and the request pattern for bfqq is -+ * I/O-bound and sequential. -+ * -+ * Secondly, and in contrast to the above item (b), idling an -+ * NCQ-capable flash-based device would not boost the -+ * throughput even with sequential I/O; rather it would lower -+ * the throughput in proportion to how fast the device -+ * is. Accordingly, the next variable is true if any of the -+ * above conditions (a), (b) or (c) is true, and, in -+ * particular, happens to be false if bfqd is an NCQ-capable -+ * flash-based device. 
-+ */ -+ idling_boosts_thr = rot_without_queueing || -+ ((!blk_queue_nonrot(bfqd->queue) || !bfqd->hw_tag) && -+ bfqq_sequential_and_IO_bound); -+ -+ bfq_log_bfqq(bfqd, bfqq, "idling_boosts_thr %d", idling_boosts_thr); -+ -+ /* -+ * The return value of this function is equal to that of -+ * idling_boosts_thr, unless a special case holds. In this -+ * special case, described below, idling may cause problems to -+ * weight-raised queues. -+ * -+ * When the request pool is saturated (e.g., in the presence -+ * of write hogs), if the processes associated with -+ * non-weight-raised queues ask for requests at a lower rate, -+ * then processes associated with weight-raised queues have a -+ * higher probability to get a request from the pool -+ * immediately (or at least soon) when they need one. Thus -+ * they have a higher probability to actually get a fraction -+ * of the device throughput proportional to their high -+ * weight. This is especially true with NCQ-capable drives, -+ * which enqueue several requests in advance, and further -+ * reorder internally-queued requests. -+ * -+ * For this reason, we force to false the return value if -+ * there are weight-raised busy queues. In this case, and if -+ * bfqq is not weight-raised, this guarantees that the device -+ * is not idled for bfqq (if, instead, bfqq is weight-raised, -+ * then idling will be guaranteed by another variable, see -+ * below). Combined with the timestamping rules of BFQ (see -+ * [1] for details), this behavior causes bfqq, and hence any -+ * sync non-weight-raised queue, to get a lower number of -+ * requests served, and thus to ask for a lower number of -+ * requests from the request pool, before the busy -+ * weight-raised queues get served again. This often mitigates -+ * starvation problems in the presence of heavy write -+ * workloads and NCQ, thereby guaranteeing a higher -+ * application and system responsiveness in these hostile -+ * scenarios. 
-+ */ -+ return idling_boosts_thr && -+ bfqd->wr_busy_queues == 0; -+} -+ -+/* -+ * There is a case where idling must be performed not for -+ * throughput concerns, but to preserve service guarantees. -+ * -+ * To introduce this case, we can note that allowing the drive -+ * to enqueue more than one request at a time, and hence -+ * delegating de facto final scheduling decisions to the -+ * drive's internal scheduler, entails loss of control on the -+ * actual request service order. In particular, the critical -+ * situation is when requests from different processes happen -+ * to be present, at the same time, in the internal queue(s) -+ * of the drive. In such a situation, the drive, by deciding -+ * the service order of the internally-queued requests, does -+ * determine also the actual throughput distribution among -+ * these processes. But the drive typically has no notion or -+ * concern about per-process throughput distribution, and -+ * makes its decisions only on a per-request basis. Therefore, -+ * the service distribution enforced by the drive's internal -+ * scheduler is likely to coincide with the desired -+ * device-throughput distribution only in a completely -+ * symmetric scenario where: -+ * (i) each of these processes must get the same throughput as -+ * the others; -+ * (ii) the I/O of each process has the same properties, in -+ * terms of locality (sequential or random), direction -+ * (reads or writes), request sizes, greediness -+ * (from I/O-bound to sporadic), and so on. -+ * In fact, in such a scenario, the drive tends to treat -+ * the requests of each of these processes in about the same -+ * way as the requests of the others, and thus to provide -+ * each of these processes with about the same throughput -+ * (which is exactly the desired throughput distribution). 
In -+ * contrast, in any asymmetric scenario, device idling is -+ * certainly needed to guarantee that bfqq receives its -+ * assigned fraction of the device throughput (see [1] for -+ * details). -+ * The problem is that idling may significantly reduce -+ * throughput with certain combinations of types of I/O and -+ * devices. An important example is sync random I/O, on flash -+ * storage with command queueing. So, unless bfqq falls in the -+ * above cases where idling also boosts throughput, it would -+ * be important to check conditions (i) and (ii) accurately, -+ * so as to avoid idling when not strictly needed for service -+ * guarantees. -+ * -+ * Unfortunately, it is extremely difficult to thoroughly -+ * check condition (ii). And, in case there are active groups, -+ * it becomes very difficult to check condition (i) too. In -+ * fact, if there are active groups, then, for condition (i) -+ * to become false, it is enough that an active group contains -+ * more active processes or sub-groups than some other active -+ * group. More precisely, for condition (i) to become false because of -+ * such a group, it is not even necessary that the group is -+ * (still) active: it is sufficient that, even if the group -+ * has become inactive, some of its descendant processes still -+ * have some request already dispatched but still waiting for -+ * completion. In fact, requests have still to be guaranteed -+ * their share of the throughput even after being -+ * dispatched. In this respect, it is easy to show that, if a -+ * group frequently becomes inactive while still having -+ * in-flight requests, and if, when this happens, the group is -+ * not considered in the calculation of whether the scenario -+ * is asymmetric, then the group may fail to be guaranteed its -+ * fair share of the throughput (basically because idling may -+ * not be performed for the descendant processes of the group, -+ * but it had to be).
We address this issue with the -+ * following bi-modal behavior, implemented in the function -+ * bfq_symmetric_scenario(). -+ * -+ * If there are groups with requests waiting for completion -+ * (as commented above, some of these groups may even be -+ * already inactive), then the scenario is tagged as -+ * asymmetric, conservatively, without checking any of the -+ * conditions (i) and (ii). So the device is idled for bfqq. -+ * This behavior matches also the fact that groups are created -+ * exactly if controlling I/O is a primary concern (to -+ * preserve bandwidth and latency guarantees). -+ * -+ * On the opposite end, if there are no groups with requests -+ * waiting for completion, then only condition (i) is actually -+ * checked, i.e., provided that condition (i) holds, idling -+ * is not performed, regardless of whether condition (ii) -+ * holds. In other words, only if condition (i) does not hold, -+ * then idling is allowed, and the device tends to be -+ * prevented from queueing many requests, possibly of several -+ * processes. Since there are no groups with requests waiting -+ * for completion, then, to check condition (i) it is enough -+ * to check just whether all the queues with requests waiting -+ * for completion also have the same weight. -+ * -+ * Not checking condition (ii) evidently exposes bfqq to the -+ * risk of getting less throughput than its fair share. -+ * However, for queues with the same weight, a further -+ * mechanism, preemption, mitigates or even eliminates this -+ * problem. And it does so without consequences on overall -+ * throughput. This mechanism and its benefits are explained -+ * in the next three paragraphs. -+ * -+ * Even if a queue, say Q, is expired when it remains idle, Q -+ * can still preempt the new in-service queue if the next -+ * request of Q arrives soon (see the comments on -+ * bfq_bfqq_update_budg_for_activation).
If all queues and -+ * groups have the same weight, this form of preemption, -+ * combined with the hole-recovery heuristic described in the -+ * comments on function bfq_bfqq_update_budg_for_activation, -+ * is enough to preserve a correct bandwidth distribution in -+ * the mid term, even without idling. In fact, even if not -+ * idling allows the internal queues of the device to contain -+ * many requests, and thus to reorder requests, we can rather -+ * safely assume that the internal scheduler still preserves a -+ * minimum of mid-term fairness. -+ * -+ * More precisely, this preemption-based, idleless approach -+ * provides fairness in terms of IOPS, and not sectors per -+ * second. This can be seen with a simple example. Suppose -+ * that there are two queues with the same weight, but that -+ * the first queue receives requests of 8 sectors, while the -+ * second queue receives requests of 1024 sectors. In -+ * addition, suppose that each of the two queues contains at -+ * most one request at a time, which implies that each queue -+ * always remains idle after it is served. Finally, after -+ * remaining idle, each queue receives very quickly a new -+ * request. It follows that the two queues are served -+ * alternately, preempting each other if needed. This -+ * implies that, although both queues have the same weight, -+ * the queue with large requests receives a service that is -+ * 1024/8 times as high as the service received by the other -+ * queue. -+ * -+ * The motivation for using preemption instead of idling (for -+ * queues with the same weight) is that, by not idling, -+ * service guarantees are preserved (completely or at least in -+ * part) without sacrificing throughput at all. And, if -+ * there is no active group, then the primary expectation for -+ * this device is probably a high throughput.
-+ * -+ * We are now left only with explaining the additional -+ * compound condition that is checked below for deciding -+ * whether the scenario is asymmetric. To explain this -+ * compound condition, we need to add that the function -+ * bfq_symmetric_scenario checks the weights of only -+ * non-weight-raised queues, for efficiency reasons (see -+ * comments on bfq_weights_tree_add()). Then the fact that -+ * bfqq is weight-raised is checked explicitly here. More -+ * precisely, the compound condition below takes into account -+ * also the fact that, even if bfqq is being weight-raised, -+ * the scenario is still symmetric if all queues with requests -+ * waiting for completion happen to be -+ * weight-raised. Actually, we should be even more precise -+ * here, and differentiate between interactive weight raising -+ * and soft real-time weight raising. -+ * -+ * As a side note, it is worth considering that the above -+ * device-idling countermeasures may however fail in the -+ * following unlucky scenario: if idling is (correctly) -+ * disabled in a time period during which all symmetry -+ * sub-conditions hold, and hence the device is allowed to -+ * enqueue many requests, but at some later point in time some -+ * sub-condition stops holding, then it may become impossible -+ * to let requests be served in the desired order until all -+ * the requests already queued in the device have been served.
-+ */ -+static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ bool asymmetric_scenario = (bfqq->wr_coeff > 1 && -+ bfqd->wr_busy_queues < -+ bfq_tot_busy_queues(bfqd)) || -+ !bfq_symmetric_scenario(bfqd); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "wr_coeff %d wr_busy %d busy %d asymmetric %d", -+ bfqq->wr_coeff, -+ bfqd->wr_busy_queues, -+ bfq_tot_busy_queues(bfqd), -+ asymmetric_scenario); -+ -+ return asymmetric_scenario; -+} -+ -+/* -+ * For a queue that becomes empty, device idling is allowed only if -+ * this function returns true for that queue. As a consequence, since -+ * device idling plays a critical role for both throughput boosting -+ * and service guarantees, the return value of this function plays a -+ * critical role as well. -+ * -+ * In a nutshell, this function returns true only if idling is -+ * beneficial for throughput or, even if detrimental for throughput, -+ * idling is however necessary to preserve service guarantees (low -+ * latency, desired throughput distribution, ...). In particular, on -+ * NCQ-capable devices, this function tries to return false, so as to -+ * help keep the drives' internal queues full, whenever this helps the -+ * device boost the throughput without causing any service-guarantee -+ * issue. -+ * -+ * Most of the issues taken into account to get the return value of -+ * this function are not trivial. We discuss these issues in the two -+ * functions providing the main pieces of information needed by this -+ * function. -+ */ -+static bool bfq_better_to_idle(struct bfq_queue *bfqq) -+{ -+ struct bfq_data *bfqd = bfqq->bfqd; -+ bool idling_boosts_thr_with_no_issue, idling_needed_for_service_guar; -+ -+ if (unlikely(bfqd->strict_guarantees)) -+ return true; -+ -+ /* -+ * Idling is performed only if slice_idle > 0. 
In addition, we -+ * do not idle if -+ * (a) bfqq is async -+ * (b) bfqq is in the idle io prio class: in this case we do -+ * not idle because we want to minimize the bandwidth that -+ * queues in this class can steal to higher-priority queues -+ */ -+ if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) || -+ bfq_class_idle(bfqq)) -+ return false; -+ -+ idling_boosts_thr_with_no_issue = -+ idling_boosts_thr_without_issues(bfqd, bfqq); -+ -+ idling_needed_for_service_guar = -+ idling_needed_for_service_guarantees(bfqd, bfqq); -+ -+ /* -+ * We have now the two components we need to compute the -+ * return value of the function, which is true only if idling -+ * either boosts the throughput (without issues), or is -+ * necessary to preserve service guarantees. -+ */ -+ bfq_log_bfqq(bfqd, bfqq, -+ "wr_busy %d boosts %d IO-bound %d guar %d", -+ bfqd->wr_busy_queues, -+ idling_boosts_thr_with_no_issue, -+ bfq_bfqq_IO_bound(bfqq), -+ idling_needed_for_service_guar); -+ -+ return idling_boosts_thr_with_no_issue || -+ idling_needed_for_service_guar; -+} -+ -+/* -+ * If the in-service queue is empty but the function bfq_better_to_idle -+ * returns true, then: -+ * 1) the queue must remain in service and cannot be expired, and -+ * 2) the device must be idled to wait for the possible arrival of a new -+ * request for the queue. -+ * See the comments on the function bfq_better_to_idle for the reasons -+ * why performing device idling is the best choice to boost the throughput -+ * and preserve service guarantees when bfq_better_to_idle itself -+ * returns true. 
-+ */ -+static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) -+{ -+ return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_better_to_idle(bfqq); -+} -+ -+static struct bfq_queue *bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq; -+ -+ /* -+ * A linear search; but, with a high probability, very few -+ * steps are needed to find a candidate queue, i.e., a queue -+ * with enough budget left for its next request. In fact: -+ * - BFQ dynamically updates the budget of every queue so as -+ * to accomodate the expected backlog of the queue; -+ * - if a queue gets all its requests dispatched as injected -+ * service, then the queue is removed from the active list -+ * (and re-added only if it gets new requests, but with -+ * enough budget for its new backlog). -+ */ -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) -+ if (!RB_EMPTY_ROOT(&bfqq->sort_list) && -+ bfq_serv_to_charge(bfqq->next_rq, bfqq) <= -+ bfq_bfqq_budget_left(bfqq)) { -+ bfq_log_bfqq(bfqd, bfqq, "returned this queue"); -+ return bfqq; -+ } -+ -+ bfq_log(bfqd, "no queue found"); -+ return NULL; -+} -+ -+/* -+ * Select a queue for service. If we have a current queue in service, -+ * check whether to continue servicing it, or retrieve and set a new one. -+ */ -+static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq; -+ struct request *next_rq; -+ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT; -+ -+ bfqq = bfqd->in_service_queue; -+ if (!bfqq) -+ goto new_queue; -+ -+ bfq_log_bfqq(bfqd, bfqq, "already in-service queue"); -+ -+ /* -+ * Do not expire bfqq for budget timeout if bfqq may be about -+ * to enjoy device idling. The reason why, in this case, we -+ * prevent bfqq from expiring is the same as in the comments -+ * on the case where bfq_bfqq_must_idle() returns true, in -+ * bfq_completed_request(). 
-+ */ -+ if (bfq_may_expire_for_budg_timeout(bfqq) && -+ !bfq_bfqq_must_idle(bfqq)) -+ goto expire; -+ -+check_queue: -+ /* -+ * This loop is rarely executed more than once. Even when it -+ * happens, it is much more convenient to re-execute this loop -+ * than to return NULL and trigger a new dispatch to get a -+ * request served. -+ */ -+ next_rq = bfqq->next_rq; -+ /* -+ * If bfqq has requests queued and it has enough budget left to -+ * serve them, keep the queue, otherwise expire it. -+ */ -+ if (next_rq) { -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ -+ if (bfq_serv_to_charge(next_rq, bfqq) > -+ bfq_bfqq_budget_left(bfqq)) { -+ /* -+ * Expire the queue for budget exhaustion, -+ * which makes sure that the next budget is -+ * enough to serve the next request, even if -+ * it comes from the fifo expired path. -+ */ -+ reason = BFQ_BFQQ_BUDGET_EXHAUSTED; -+ goto expire; -+ } else { -+ /* -+ * The idle timer may be pending because we may -+ * not disable disk idling even when a new request -+ * arrives. -+ */ -+ if (bfq_bfqq_wait_request(bfqq)) { -+ /* -+ * If we get here: 1) at least a new request -+ * has arrived but we have not disabled the -+ * timer because the request was too small, -+ * 2) then the block layer has unplugged -+ * the device, causing the dispatch to be -+ * invoked. -+ * -+ * Since the device is unplugged, now the -+ * requests are probably large enough to -+ * provide a reasonable throughput. -+ * So we disable idling. -+ */ -+ bfq_clear_bfqq_wait_request(bfqq); -+ hrtimer_try_to_cancel(&bfqd->idle_slice_timer); -+ } -+ goto keep_queue; -+ } -+ } -+ -+ /* -+ * No requests pending. However, if the in-service queue is idling -+ * for a new request, or has requests waiting for a completion and -+ * may idle after their completion, then keep it anyway. -+ * -+ * Yet, to boost throughput, inject service from other queues if -+ * possible. 
-+ */ -+ if (bfq_bfqq_wait_request(bfqq) || -+ (bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) { -+ if (bfq_bfqq_injectable(bfqq) && -+ bfqq->injected_service * bfqq->inject_coeff < -+ bfqq->entity.service * 10) { -+ bfq_log_bfqq(bfqd, bfqq, "looking for queue for injection"); -+ bfqq = bfq_choose_bfqq_for_injection(bfqd); -+ } else { -+ if (BFQQ_SEEKY(bfqq)) -+ bfq_log_bfqq(bfqd, bfqq, -+ "injection saturated %d * %d >= %d * 10", -+ bfqq->injected_service, bfqq->inject_coeff, -+ bfqq->entity.service); -+ bfqq = NULL; -+ } -+ goto keep_queue; -+ } -+ -+ reason = BFQ_BFQQ_NO_MORE_REQUESTS; -+expire: -+ bfq_bfqq_expire(bfqd, bfqq, false, reason); -+new_queue: -+ bfqq = bfq_set_in_service_queue(bfqd); -+ if (bfqq) { -+ bfq_log_bfqq(bfqd, bfqq, "checking new queue"); -+ goto check_queue; -+ } -+keep_queue: -+ if (bfqq) -+ bfq_log_bfqq(bfqd, bfqq, "returned this queue"); -+ else -+ bfq_log(bfqd, "no queue returned"); -+ -+ return bfqq; -+} -+ -+static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */ -+ BUG_ON(bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && -+ time_is_after_jiffies(bfqq->last_wr_start_finish)); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "raising period dur %u/%u msec, old coeff %u, w %d(%d)", -+ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), -+ jiffies_to_msecs(bfqq->wr_cur_max_time), -+ bfqq->wr_coeff, -+ bfqq->entity.weight, bfqq->entity.orig_weight); -+ -+ BUG_ON(bfqq != bfqd->in_service_queue && entity->weight != -+ entity->orig_weight * bfqq->wr_coeff); -+ if (entity->prio_changed) -+ bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change"); -+ -+ /* -+ * If the queue was activated in a burst, or too much -+ * time has elapsed from the beginning of this -+ * weight-raising period, then end weight raising. 
-+ */ -+ if (bfq_bfqq_in_large_burst(bfqq)) -+ bfq_bfqq_end_wr(bfqq); -+ else if (time_is_before_jiffies(bfqq->last_wr_start_finish + -+ bfqq->wr_cur_max_time)) { -+ if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time || -+ time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt + -+ bfq_wr_duration(bfqd))) -+ bfq_bfqq_end_wr(bfqq); -+ else { -+ switch_back_to_interactive_wr(bfqq, bfqd); -+ BUG_ON(time_is_after_jiffies( -+ bfqq->last_wr_start_finish)); -+ bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqd, bfqq, -+ "back to interactive wr"); -+ } -+ } -+ if (bfqq->wr_coeff > 1 && -+ bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time && -+ bfqq->service_from_wr > max_service_from_wr) { -+ /* see comments on max_service_from_wr */ -+ bfq_bfqq_end_wr(bfqq); -+ bfq_log_bfqq(bfqd, bfqq, -+ "too much service"); -+ } -+ } -+ /* -+ * To improve latency (for this or other queues), immediately -+ * update weight both if it must be raised and if it must be -+ * lowered. Since, entity may be on some active tree here, and -+ * might have a pending change of its ioprio class, invoke -+ * next function with the last parameter unset (see the -+ * comments on the function). -+ */ -+ if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1)) -+ __bfq_entity_update_weight_prio(bfq_entity_service_tree(entity), -+ entity, false); -+} -+ -+/* -+ * Dispatch next request from bfqq. 
-+ */ -+static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ struct request *rq = bfqq->next_rq; -+ unsigned long service_to_charge; -+ -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ BUG_ON(!rq); -+ service_to_charge = bfq_serv_to_charge(rq, bfqq); -+ -+ BUG_ON(service_to_charge > bfq_bfqq_budget_left(bfqq)); -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ -+ bfq_bfqq_served(bfqq, service_to_charge); -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ -+ bfq_dispatch_remove(bfqd->queue, rq); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "dispatched %u sec req (%llu), budg left %d, new disp_nr %d", -+ blk_rq_sectors(rq), -+ (unsigned long long) blk_rq_pos(rq), -+ bfq_bfqq_budget_left(bfqq), -+ bfqq->dispatched); -+ -+ if (bfqq != bfqd->in_service_queue) { -+ if (likely(bfqd->in_service_queue)) { -+ bfqd->in_service_queue->injected_service += -+ bfq_serv_to_charge(rq, bfqq); -+ bfq_log_bfqq(bfqd, bfqd->in_service_queue, -+ "injected_service increased to %d", -+ bfqd->in_service_queue->injected_service); -+ } -+ goto return_rq; -+ } -+ -+ /* -+ * If weight raising has to terminate for bfqq, then next -+ * function causes an immediate update of bfqq's weight, -+ * without waiting for next activation. As a consequence, on -+ * expiration, bfqq will be timestamped as if has never been -+ * weight-raised during this service slot, even if it has -+ * received part or even most of the service as a -+ * weight-raised queue. This inflates bfqq's timestamps, which -+ * is beneficial, as bfqq is then more willing to leave the -+ * device immediately to possible other weight-raised queues. -+ */ -+ bfq_update_wr_data(bfqd, bfqq); -+ -+ /* -+ * Expire bfqq, pretending that its budget expired, if bfqq -+ * belongs to CLASS_IDLE and other queues are waiting for -+ * service. 
-+ */ -+ if (!(bfq_tot_busy_queues(bfqd) > 1 && bfq_class_idle(bfqq))) -+ goto return_rq; -+ -+ bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_BUDGET_EXHAUSTED); -+ -+return_rq: -+ return rq; -+} -+ -+static bool bfq_has_work(struct blk_mq_hw_ctx *hctx) -+{ -+ struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; -+ -+ bfq_log(bfqd, "dispatch_non_empty %d busy_queues %d", -+ !list_empty_careful(&bfqd->dispatch), bfq_tot_busy_queues(bfqd) > 0); -+ -+ /* -+ * Avoiding lock: a race on bfqd->busy_queues should cause at -+ * most a call to dispatch for nothing -+ */ -+ return !list_empty_careful(&bfqd->dispatch) || -+ bfq_tot_busy_queues(bfqd) > 0; -+} -+ -+static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) -+{ -+ struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; -+ struct request *rq = NULL; -+ struct bfq_queue *bfqq = NULL; -+ -+ if (!list_empty(&bfqd->dispatch)) { -+ rq = list_first_entry(&bfqd->dispatch, struct request, -+ queuelist); -+ list_del_init(&rq->queuelist); -+ rq->rq_flags &= ~RQF_DISP_LIST; -+ -+ bfq_log(bfqd, -+ "picked %p from dispatch list", rq); -+ bfqq = RQ_BFQQ(rq); -+ -+ if (bfqq) { -+ /* -+ * Increment counters here, because this -+ * dispatch does not follow the standard -+ * dispatch flow (where counters are -+ * incremented) -+ */ -+ bfqq->dispatched++; -+ -+ /* -+ * TESTING: reset DISP_LIST flag, because: 1) -+ * this rq this request has passed through -+ * bfq_prepare_request, 2) then it will have -+ * bfq_finish_requeue_request invoked on it, and 3) in -+ * bfq_finish_requeue_request we use this flag to check -+ * that bfq_finish_requeue_request is not invoked on -+ * requests for which bfq_prepare_request has -+ * been invoked. -+ */ -+ rq->rq_flags &= ~RQF_DISP_LIST; -+ goto inc_in_driver_start_rq; -+ } -+ -+ /* -+ * We exploit the bfq_finish_requeue_request hook to decrement -+ * rq_in_driver, but bfq_finish_requeue_request will not be -+ * invoked on this request. 
So, to avoid unbalance, -+ * just start this request, without incrementing -+ * rq_in_driver. As a negative consequence, -+ * rq_in_driver is deceptively lower than it should be -+ * while this request is in service. This may cause -+ * bfq_schedule_dispatch to be invoked uselessly. -+ * -+ * As for implementing an exact solution, the -+ * bfq_finish_requeue_request hook, if defined, is probably -+ * invoked also on this request. So, by exploiting -+ * this hook, we could 1) increment rq_in_driver here, -+ * and 2) decrement it in bfq_finish_requeue_request. Such a -+ * solution would let the value of the counter be -+ * always accurate, but it would entail using an extra -+ * interface function. This cost seems higher than the -+ * benefit, being the frequency of non-elevator-private -+ * requests very low. -+ */ -+ goto start_rq; -+ } -+ -+ bfq_log(bfqd, "%d busy queues", bfq_tot_busy_queues(bfqd)); -+ -+ if (bfq_tot_busy_queues(bfqd) == 0) -+ goto exit; -+ -+ /* -+ * Force device to serve one request at a time if -+ * strict_guarantees is true. Forcing this service scheme is -+ * currently the ONLY way to guarantee that the request -+ * service order enforced by the scheduler is respected by a -+ * queueing device. Otherwise the device is free even to make -+ * some unlucky request wait for as long as the device -+ * wishes. -+ * -+ * Of course, serving one request at at time may cause loss of -+ * throughput. 
-+ */ -+ if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0) -+ goto exit; -+ -+ bfqq = bfq_select_queue(bfqd); -+ if (!bfqq) -+ goto exit; -+ -+ BUG_ON(bfqq == bfqd->in_service_queue && -+ bfqq->entity.budget < bfqq->entity.service); -+ -+ BUG_ON(bfqq == bfqd->in_service_queue && -+ bfq_bfqq_wait_request(bfqq)); -+ -+ rq = bfq_dispatch_rq_from_bfqq(bfqd, bfqq); -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ -+ if (rq) { -+ inc_in_driver_start_rq: -+ bfqd->rq_in_driver++; -+ start_rq: -+ rq->rq_flags |= RQF_STARTED; -+ if (bfqq) -+ bfq_log_bfqq(bfqd, bfqq, -+ "%s request %p, rq_in_driver %d", -+ bfq_bfqq_sync(bfqq) ? "sync" : "async", -+ rq, -+ bfqd->rq_in_driver); -+ else -+ bfq_log(bfqd, -+ "request %p from dispatch list, rq_in_driver %d", -+ rq, bfqd->rq_in_driver); -+ } else -+ bfq_log(bfqd, -+ "returned NULL request, rq_in_driver %d", -+ bfqd->rq_in_driver); -+ -+exit: -+ return rq; -+} -+ -+ -+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) -+static void bfq_update_dispatch_stats(struct request_queue *q, -+ struct request *rq, -+ struct bfq_queue *in_serv_queue, -+ bool idle_timer_disabled) -+{ -+ struct bfq_queue *bfqq = rq ? RQ_BFQQ(rq) : NULL; -+ -+ if (!idle_timer_disabled && !bfqq) -+ return; -+ -+ /* -+ * rq and bfqq are guaranteed to exist until this function -+ * ends, for the following reasons. First, rq can be -+ * dispatched to the device, and then can be completed and -+ * freed, only after this function ends. Second, rq cannot be -+ * merged (and thus freed because of a merge) any longer, -+ * because it has already started. Thus rq cannot be freed -+ * before this function ends, and, since rq has a reference to -+ * bfqq, the same guarantee holds for bfqq too. -+ * -+ * In addition, the following queue lock guarantees that -+ * bfqq_group(bfqq) exists as well. 
-+ */ -+ spin_lock_irq(q->queue_lock); -+ if (idle_timer_disabled) -+ /* -+ * Since the idle timer has been disabled, -+ * in_serv_queue contained some request when -+ * __bfq_dispatch_request was invoked above, which -+ * implies that rq was picked exactly from -+ * in_serv_queue. Thus in_serv_queue == bfqq, and is -+ * therefore guaranteed to exist because of the above -+ * arguments. -+ */ -+ bfqg_stats_update_idle_time(bfqq_group(in_serv_queue)); -+ if (bfqq) { -+ struct bfq_group *bfqg = bfqq_group(bfqq); -+ -+ bfqg_stats_update_avg_queue_size(bfqg); -+ bfqg_stats_set_start_empty_time(bfqg); -+ bfqg_stats_update_io_remove(bfqg, rq->cmd_flags); -+ } -+ spin_unlock_irq(q->queue_lock); -+} -+#else -+static inline void bfq_update_dispatch_stats(struct request_queue *q, -+ struct request *rq, -+ struct bfq_queue *in_serv_queue, -+ bool idle_timer_disabled) {} -+#endif -+static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) -+{ -+ struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; -+ struct request *rq; -+ struct bfq_queue *in_serv_queue; -+ bool waiting_rq, idle_timer_disabled; -+ -+ spin_lock_irq(&bfqd->lock); -+ -+ in_serv_queue = bfqd->in_service_queue; -+ waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); -+ -+ rq = __bfq_dispatch_request(hctx); -+ -+ idle_timer_disabled = -+ waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); -+ -+ spin_unlock_irq(&bfqd->lock); -+ -+ bfq_update_dispatch_stats(hctx->queue, rq, in_serv_queue, -+ idle_timer_disabled); -+ -+ return rq; -+} -+ -+/* -+ * Task holds one reference to the queue, dropped when task exits. Each rq -+ * in-flight on this queue also holds a reference, dropped when rq is freed. -+ * -+ * Scheduler lock must be held here. Recall not to use bfqq after calling -+ * this function on it. 
-+ */ -+static void bfq_put_queue(struct bfq_queue *bfqq) -+{ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ struct bfq_group *bfqg = bfqq_group(bfqq); -+#endif -+ -+ assert_spin_locked(&bfqq->bfqd->lock); -+ -+ BUG_ON(bfqq->ref <= 0); -+ -+ if (bfqq->bfqd) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "%p %d", bfqq, bfqq->ref); -+ -+ bfqq->ref--; -+ if (bfqq->ref) -+ return; -+ -+ BUG_ON(rb_first(&bfqq->sort_list)); -+ BUG_ON(bfqq->allocated != 0); -+ BUG_ON(bfqq->entity.tree); -+ BUG_ON(bfq_bfqq_busy(bfqq)); -+ -+ if (!hlist_unhashed(&bfqq->burst_list_node)) { -+ hlist_del_init(&bfqq->burst_list_node); -+ /* -+ * Decrement also burst size after the removal, if the -+ * process associated with bfqq is exiting, and thus -+ * does not contribute to the burst any longer. This -+ * decrement helps filter out false positives of large -+ * bursts, when some short-lived process (often due to -+ * the execution of commands by some service) happens -+ * to start and exit while a complex application is -+ * starting, and thus spawning several processes that -+ * do I/O (and that *must not* be treated as a large -+ * burst, see comments on bfq_handle_burst). -+ * -+ * In particular, the decrement is performed only if: -+ * 1) bfqq is not a merged queue, because, if it is, -+ * then this free of bfqq is not triggered by the exit -+ * of the process bfqq is associated with, but exactly -+ * by the fact that bfqq has just been merged. -+ * 2) burst_size is greater than 0, to handle -+ * unbalanced decrements. Unbalanced decrements may -+ * happen in te following case: bfqq is inserted into -+ * the current burst list--without incrementing -+ * bust_size--because of a split, but the current -+ * burst list is not the burst list bfqq belonged to -+ * (see comments on the case of a split in -+ * bfq_set_request). 
-+ */ -+ if (bfqq->bic && bfqq->bfqd->burst_size > 0) -+ bfqq->bfqd->burst_size--; -+ } -+ -+ if (bfqq->bfqd) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "%p freed", bfqq); -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "putting blkg and bfqg %p\n", bfqg); -+ bfqg_and_blkg_put(bfqg); -+#endif -+ kmem_cache_free(bfq_pool, bfqq); -+} -+ -+static void bfq_put_cooperator(struct bfq_queue *bfqq) -+{ -+ struct bfq_queue *__bfqq, *next; -+ -+ /* -+ * If this queue was scheduled to merge with another queue, be -+ * sure to drop the reference taken on that queue (and others in -+ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs. -+ */ -+ __bfqq = bfqq->new_bfqq; -+ while (__bfqq) { -+ if (__bfqq == bfqq) -+ break; -+ next = __bfqq->new_bfqq; -+ bfq_put_queue(__bfqq); -+ __bfqq = next; -+ } -+} -+ -+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ if (bfqq == bfqd->in_service_queue) { -+ __bfq_bfqq_expire(bfqd, bfqq); -+ bfq_schedule_dispatch(bfqd); -+ } -+ -+ bfq_log_bfqq(bfqd, bfqq, "%p, %d", bfqq, bfqq->ref); -+ -+ bfq_put_cooperator(bfqq); -+ -+ bfq_put_queue(bfqq); /* release process reference */ -+} -+ -+static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) -+{ -+ struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync); -+ struct bfq_data *bfqd; -+ -+ if (bfqq) -+ bfqd = bfqq->bfqd; /* NULL if scheduler already exited */ -+ -+ if (bfqq && bfqd) { -+ unsigned long flags; -+ -+ spin_lock_irqsave(&bfqd->lock, flags); -+ bfq_exit_bfqq(bfqd, bfqq); -+ bic_set_bfqq(bic, NULL, is_sync); -+ spin_unlock_irqrestore(&bfqd->lock, flags); -+ } -+} -+ -+static void bfq_exit_icq(struct io_cq *icq) -+{ -+ struct bfq_io_cq *bic = icq_to_bic(icq); -+ -+ BUG_ON(!bic); -+ bfq_exit_icq_bfqq(bic, true); -+ bfq_exit_icq_bfqq(bic, false); -+} -+ -+/* -+ * Update the entity prio values; note that the new values will not -+ * be used until the next (re)activation. 
-+ */ -+static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, -+ struct bfq_io_cq *bic) -+{ -+ struct task_struct *tsk = current; -+ int ioprio_class; -+ struct bfq_data *bfqd = bfqq->bfqd; -+ -+ WARN_ON(!bfqd); -+ if (!bfqd) -+ return; -+ -+ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); -+ switch (ioprio_class) { -+ default: -+ dev_err(bfqq->bfqd->queue->backing_dev_info->dev, -+ "bfq: bad prio class %d\n", ioprio_class); -+ case IOPRIO_CLASS_NONE: -+ /* -+ * No prio set, inherit CPU scheduling settings. -+ */ -+ bfqq->new_ioprio = task_nice_ioprio(tsk); -+ bfqq->new_ioprio_class = task_nice_ioclass(tsk); -+ break; -+ case IOPRIO_CLASS_RT: -+ bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); -+ bfqq->new_ioprio_class = IOPRIO_CLASS_RT; -+ break; -+ case IOPRIO_CLASS_BE: -+ bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); -+ bfqq->new_ioprio_class = IOPRIO_CLASS_BE; -+ break; -+ case IOPRIO_CLASS_IDLE: -+ bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE; -+ bfqq->new_ioprio = 7; -+ break; -+ } -+ -+ if (bfqq->new_ioprio >= IOPRIO_BE_NR) { -+ pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n", -+ bfqq->new_ioprio); -+ BUG(); -+ } -+ -+ bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); -+ bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "bic_class %d prio %d class %d", -+ ioprio_class, bfqq->new_ioprio, bfqq->new_ioprio_class); -+} -+ -+static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) -+{ -+ struct bfq_data *bfqd = bic_to_bfqd(bic); -+ struct bfq_queue *bfqq; -+ unsigned long uninitialized_var(flags); -+ int ioprio = bic->icq.ioc->ioprio; -+ -+ /* -+ * This condition may trigger on a newly created bic, be sure to -+ * drop the lock before returning. 
-+ */ -+ if (unlikely(!bfqd) || likely(bic->ioprio == ioprio)) -+ return; -+ -+ bic->ioprio = ioprio; -+ -+ bfqq = bic_to_bfqq(bic, false); -+ if (bfqq) { -+ /* release process reference on this queue */ -+ bfq_put_queue(bfqq); -+ bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); -+ bic_set_bfqq(bic, bfqq, false); -+ bfq_log_bfqq(bfqd, bfqq, -+ "bfqq %p %d", -+ bfqq, bfqq->ref); -+ } -+ -+ bfqq = bic_to_bfqq(bic, true); -+ if (bfqq) -+ bfq_set_next_ioprio_data(bfqq, bic); -+} -+ -+static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct bfq_io_cq *bic, pid_t pid, int is_sync) -+{ -+ RB_CLEAR_NODE(&bfqq->entity.rb_node); -+ INIT_LIST_HEAD(&bfqq->fifo); -+ INIT_HLIST_NODE(&bfqq->burst_list_node); -+ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); -+ -+ bfqq->ref = 0; -+ bfqq->bfqd = bfqd; -+ -+ if (bic) -+ bfq_set_next_ioprio_data(bfqq, bic); -+ -+ if (is_sync) { -+ /* -+ * No need to mark as has_short_ttime if in -+ * idle_class, because no device idling is performed -+ * for queues in idle class -+ */ -+ if (!bfq_class_idle(bfqq)) -+ /* tentatively mark as has_short_ttime */ -+ bfq_mark_bfqq_has_short_ttime(bfqq); -+ bfq_mark_bfqq_sync(bfqq); -+ bfq_mark_bfqq_just_created(bfqq); -+ /* -+ * Aggressively inject a lot of service: up to 90%. -+ * This coefficient remains constant during bfqq life, -+ * but this behavior might be changed, after enough -+ * testing and tuning. 
-+ */ -+ bfqq->inject_coeff = 1; -+ } else -+ bfq_clear_bfqq_sync(bfqq); -+ -+ bfqq->ttime.last_end_request = ktime_get_ns() - (1ULL<<32); -+ -+ bfq_mark_bfqq_IO_bound(bfqq); -+ -+ /* Tentative initial value to trade off between thr and lat */ -+ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3; -+ bfqq->pid = pid; -+ -+ bfqq->wr_coeff = 1; -+ bfqq->last_wr_start_finish = jiffies; -+ bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now(); -+ bfqq->budget_timeout = bfq_smallest_from_now(); -+ bfqq->split_time = bfq_smallest_from_now(); -+ -+ /* -+ * To not forget the possibly high bandwidth consumed by a -+ * process/queue in the recent past, -+ * bfq_bfqq_softrt_next_start() returns a value at least equal -+ * to the current value of bfqq->soft_rt_next_start (see -+ * comments on bfq_bfqq_softrt_next_start). Set -+ * soft_rt_next_start to now, to mean that bfqq has consumed -+ * no bandwidth so far. -+ */ -+ bfqq->soft_rt_next_start = jiffies; -+ -+ /* first request is almost certainly seeky */ -+ bfqq->seek_history = 1; -+} -+ -+static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, -+ struct bfq_group *bfqg, -+ int ioprio_class, int ioprio) -+{ -+ switch (ioprio_class) { -+ case IOPRIO_CLASS_RT: -+ return &bfqg->async_bfqq[0][ioprio]; -+ case IOPRIO_CLASS_NONE: -+ ioprio = IOPRIO_NORM; -+ /* fall through */ -+ case IOPRIO_CLASS_BE: -+ return &bfqg->async_bfqq[1][ioprio]; -+ case IOPRIO_CLASS_IDLE: -+ return &bfqg->async_idle_bfqq; -+ default: -+ BUG(); -+ } -+} -+ -+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, -+ struct bio *bio, bool is_sync, -+ struct bfq_io_cq *bic) -+{ -+ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); -+ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); -+ struct bfq_queue **async_bfqq = NULL; -+ struct bfq_queue *bfqq; -+ struct bfq_group *bfqg; -+ -+ rcu_read_lock(); -+ -+ bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio)); -+ if (!bfqg) { -+ bfqq = &bfqd->oom_bfqq; -+ goto out; -+ } -+ -+ if 
(!is_sync) { -+ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, -+ ioprio); -+ bfqq = *async_bfqq; -+ if (bfqq) -+ goto out; -+ } -+ -+ bfqq = kmem_cache_alloc_node(bfq_pool, -+ GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN, -+ bfqd->queue->node); -+ -+ if (bfqq) { -+ bfq_init_bfqq(bfqd, bfqq, bic, current->pid, -+ is_sync); -+ bfq_init_entity(&bfqq->entity, bfqg); -+ bfq_log_bfqq(bfqd, bfqq, "allocated"); -+ } else { -+ bfqq = &bfqd->oom_bfqq; -+ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); -+ goto out; -+ } -+ -+ /* -+ * Pin the queue now that it's allocated, scheduler exit will -+ * prune it. -+ */ -+ if (async_bfqq) { -+ bfqq->ref++; /* -+ * Extra group reference, w.r.t. sync -+ * queue. This extra reference is removed -+ * only if bfqq->bfqg disappears, to -+ * guarantee that this queue is not freed -+ * until its group goes away. -+ */ -+ bfq_log_bfqq(bfqd, bfqq, "bfqq not in async: %p, %d", -+ bfqq, bfqq->ref); -+ *async_bfqq = bfqq; -+ } -+ -+out: -+ bfqq->ref++; /* get a process reference to this queue */ -+ bfq_log_bfqq(bfqd, bfqq, "at end: %p, %d", bfqq, bfqq->ref); -+ rcu_read_unlock(); -+ return bfqq; -+} -+ -+static void bfq_update_io_thinktime(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ struct bfq_ttime *ttime = &bfqq->ttime; -+ u64 elapsed = ktime_get_ns() - bfqq->ttime.last_end_request; -+ -+ elapsed = min_t(u64, elapsed, 2 * bfqd->bfq_slice_idle); -+ -+ ttime->ttime_samples = (7*bfqq->ttime.ttime_samples + 256) / 8; -+ ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8); -+ ttime->ttime_mean = div64_ul(ttime->ttime_total + 128, -+ ttime->ttime_samples); -+} -+ -+static void -+bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct request *rq) -+{ -+ bfqq->seek_history <<= 1; -+ bfqq->seek_history |= BFQ_RQ_SEEKY(bfqd, bfqq->last_request_pos, rq); -+} -+ -+static void bfq_update_has_short_ttime(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ struct bfq_io_cq *bic) -+{ -+ bool 
has_short_ttime = true; -+ -+ /* -+ * No need to update has_short_ttime if bfqq is async or in -+ * idle io prio class, or if bfq_slice_idle is zero, because -+ * no device idling is performed for bfqq in this case. -+ */ -+ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq) || -+ bfqd->bfq_slice_idle == 0) -+ return; -+ -+ /* Idle window just restored, statistics are meaningless. */ -+ if (time_is_after_eq_jiffies(bfqq->split_time + -+ bfqd->bfq_wr_min_idle_time)) -+ return; -+ -+ /* Think time is infinite if no process is linked to -+ * bfqq. Otherwise check average think time to -+ * decide whether to mark as has_short_ttime -+ */ -+ if (atomic_read(&bic->icq.ioc->active_ref) == 0 || -+ (bfq_sample_valid(bfqq->ttime.ttime_samples) && -+ bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle)) -+ has_short_ttime = false; -+ -+ bfq_log_bfqq(bfqd, bfqq, "has_short_ttime %d", -+ has_short_ttime); -+ -+ if (has_short_ttime) -+ bfq_mark_bfqq_has_short_ttime(bfqq); -+ else -+ bfq_clear_bfqq_has_short_ttime(bfqq); -+} -+ -+/* -+ * Called when a new fs request (rq) is added to bfqq. Check if there's -+ * something we should do about it. 
-+ */ -+static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct request *rq) -+{ -+ struct bfq_io_cq *bic = RQ_BIC(rq); -+ -+ if (rq->cmd_flags & REQ_META) -+ bfqq->meta_pending++; -+ -+ bfq_update_io_thinktime(bfqd, bfqq); -+ bfq_update_has_short_ttime(bfqd, bfqq, bic); -+ bfq_update_io_seektime(bfqd, bfqq, rq); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "has_short_ttime=%d (seeky %d)", -+ bfq_bfqq_has_short_ttime(bfqq), BFQQ_SEEKY(bfqq)); -+ -+ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); -+ -+ if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) { -+ bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 && -+ blk_rq_sectors(rq) < 32; -+ bool budget_timeout = bfq_bfqq_budget_timeout(bfqq); -+ -+ /* -+ * There is just this request queued: if -+ * - the request is small, and -+ * - we are idling to boost throughput, and -+ * - the queue is not to be expired, -+ * then just exit. -+ * -+ * In this way, if the device is being idled to wait -+ * for a new request from the in-service queue, we -+ * avoid unplugging the device and committing the -+ * device to serve just a small request. In contrast -+ * we wait for the block layer to decide when to -+ * unplug the device: hopefully, new requests will be -+ * merged to this one quickly, then the device will be -+ * unplugged and larger requests will be dispatched. -+ */ -+ if (small_req && idling_boosts_thr_without_issues(bfqd, bfqq) && -+ !budget_timeout) -+ return; -+ -+ /* -+ * A large enough request arrived, or idling is being -+ * performed to preserve service guarantees, or -+ * finally the queue is to be expired: in all these -+ * cases disk idling is to be stopped, so clear -+ * wait_request flag and reset timer. -+ */ -+ bfq_clear_bfqq_wait_request(bfqq); -+ hrtimer_try_to_cancel(&bfqd->idle_slice_timer); -+ -+ /* -+ * The queue is not empty, because a new request just -+ * arrived. 
Hence we can safely expire the queue, in -+ * case of budget timeout, without risking that the -+ * timestamps of the queue are not updated correctly. -+ * See [1] for more details. -+ */ -+ if (budget_timeout) -+ bfq_bfqq_expire(bfqd, bfqq, false, -+ BFQ_BFQQ_BUDGET_TIMEOUT); -+ } -+} -+ -+/* returns true if it causes the idle timer to be disabled */ -+static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; -+ bool waiting, idle_timer_disabled = false; -+ BUG_ON(!bfqq); -+ -+ assert_spin_locked(&bfqd->lock); -+ -+ bfq_log_bfqq(bfqd, bfqq, "rq %p bfqq %p", rq, bfqq); -+ -+ /* -+ * An unplug may trigger a requeue of a request from the device -+ * driver: make sure we are in process context while trying to -+ * merge two bfq_queues. -+ */ -+ if (!in_interrupt()) { -+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true); -+ if (new_bfqq) { -+ BUG_ON(bic_to_bfqq(RQ_BIC(rq), 1) != bfqq); -+ /* -+ * Release the request's reference to the old bfqq -+ * and make sure one is taken to the shared queue. -+ */ -+ new_bfqq->allocated++; -+ bfqq->allocated--; -+ bfq_log_bfqq(bfqd, bfqq, -+ "new allocated %d", bfqq->allocated); -+ bfq_log_bfqq(bfqd, new_bfqq, -+ "new_bfqq new allocated %d", -+ bfqq->allocated); -+ -+ new_bfqq->ref++; -+ /* -+ * If the bic associated with the process -+ * issuing this request still points to bfqq -+ * (and thus has not been already redirected -+ * to new_bfqq or even some other bfq_queue), -+ * then complete the merge and redirect it to -+ * new_bfqq. 
-+ */ -+ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) -+ bfq_merge_bfqqs(bfqd, RQ_BIC(rq), -+ bfqq, new_bfqq); -+ -+ bfq_clear_bfqq_just_created(bfqq); -+ /* -+ * rq is about to be enqueued into new_bfqq, -+ * release rq reference on bfqq -+ */ -+ bfq_put_queue(bfqq); -+ rq->elv.priv[1] = new_bfqq; -+ bfqq = new_bfqq; -+ } -+ } -+ -+ waiting = bfqq && bfq_bfqq_wait_request(bfqq); -+ bfq_add_request(rq); -+ idle_timer_disabled = waiting && !bfq_bfqq_wait_request(bfqq); -+ -+ rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; -+ list_add_tail(&rq->queuelist, &bfqq->fifo); -+ -+ bfq_rq_enqueued(bfqd, bfqq, rq); -+ -+ return idle_timer_disabled; -+} -+ -+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) -+static void bfq_update_insert_stats(struct request_queue *q, -+ struct bfq_queue *bfqq, -+ bool idle_timer_disabled, -+ unsigned int cmd_flags) -+{ -+ if (!bfqq) -+ return; -+ -+ /* -+ * bfqq still exists, because it can disappear only after -+ * either it is merged with another queue, or the process it -+ * is associated with exits. But both actions must be taken by -+ * the same process currently executing this flow of -+ * instructions. -+ * -+ * In addition, the following queue lock guarantees that -+ * bfqq_group(bfqq) exists as well. 
-+ */ -+ spin_lock_irq(q->queue_lock); -+ bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags); -+ if (idle_timer_disabled) -+ bfqg_stats_update_idle_time(bfqq_group(bfqq)); -+ spin_unlock_irq(q->queue_lock); -+} -+#else -+static inline void bfq_update_insert_stats(struct request_queue *q, -+ struct bfq_queue *bfqq, -+ bool idle_timer_disabled, -+ unsigned int cmd_flags) {} -+#endif -+ -+static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, -+ bool at_head) -+{ -+ struct request_queue *q = hctx->queue; -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct bfq_queue *bfqq; -+ bool idle_timer_disabled = false; -+ unsigned int cmd_flags; -+ -+ spin_lock_irq(&bfqd->lock); -+ if (blk_mq_sched_try_insert_merge(q, rq)) { -+ spin_unlock_irq(&bfqd->lock); -+ return; -+ } -+ -+ spin_unlock_irq(&bfqd->lock); -+ -+ blk_mq_sched_request_inserted(rq); -+ -+ spin_lock_irq(&bfqd->lock); -+ -+ bfqq = bfq_init_rq(rq); -+ BUG_ON(!bfqq && !(at_head || blk_rq_is_passthrough(rq))); -+ BUG_ON(bfqq && bic_to_bfqq(RQ_BIC(rq), rq_is_sync(rq)) != bfqq); -+ -+ if (at_head || blk_rq_is_passthrough(rq)) { -+ if (at_head) -+ list_add(&rq->queuelist, &bfqd->dispatch); -+ else -+ list_add_tail(&rq->queuelist, &bfqd->dispatch); -+ -+ rq->rq_flags |= RQF_DISP_LIST; -+ if (bfqq) -+ bfq_log_bfqq(bfqd, bfqq, -+ "%p in disp: at_head %d", -+ rq, at_head); -+ else -+ bfq_log(bfqd, -+ "%p in disp: at_head %d", -+ rq, at_head); -+ } else { /* bfqq is assumed to be non null here */ -+ BUG_ON(!bfqq); -+ BUG_ON(!(rq->rq_flags & RQF_GOT)); -+ rq->rq_flags &= ~RQF_GOT; -+ -+ idle_timer_disabled = __bfq_insert_request(bfqd, rq); -+ /* -+ * Update bfqq, because, if a queue merge has occurred -+ * in __bfq_insert_request, then rq has been -+ * redirected into a new queue. 
-+ */ -+ bfqq = RQ_BFQQ(rq); -+ -+ if (rq_mergeable(rq)) { -+ elv_rqhash_add(q, rq); -+ if (!q->last_merge) -+ q->last_merge = rq; -+ } -+ } -+ -+ /* -+ * Cache cmd_flags before releasing scheduler lock, because rq -+ * may disappear afterwards (for example, because of a request -+ * merge). -+ */ -+ cmd_flags = rq->cmd_flags; -+ -+ spin_unlock_irq(&bfqd->lock); -+ bfq_update_insert_stats(q, bfqq, idle_timer_disabled, -+ cmd_flags); -+} -+ -+static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, -+ struct list_head *list, bool at_head) -+{ -+ while (!list_empty(list)) { -+ struct request *rq; -+ -+ rq = list_first_entry(list, struct request, queuelist); -+ list_del_init(&rq->queuelist); -+ bfq_insert_request(hctx, rq, at_head); -+ } -+} -+ -+static void bfq_update_hw_tag(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq = bfqd->in_service_queue; -+ -+ bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver, -+ bfqd->rq_in_driver); -+ -+ if (bfqd->hw_tag == 1) -+ return; -+ -+ /* -+ * This sample is valid if the number of outstanding requests -+ * is large enough to allow a queueing behavior. Note that the -+ * sum is not exact, as it's not taking into account deactivated -+ * requests. -+ */ -+ if (bfqd->rq_in_driver + bfqd->queued <= BFQ_HW_QUEUE_THRESHOLD) -+ return; -+ -+ /* -+ * If active queue hasn't enough requests and can idle, bfq might not -+ * dispatch sufficient requests to hardware. 
Don't zero hw_tag in this -+ * case -+ */ -+ if (bfqq && bfq_bfqq_has_short_ttime(bfqq) && -+ bfqq->dispatched + bfqq->queued[0] + bfqq->queued[1] < -+ BFQ_HW_QUEUE_THRESHOLD && bfqd->rq_in_driver < BFQ_HW_QUEUE_THRESHOLD) -+ return; -+ -+ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) -+ return; -+ -+ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD; -+ bfqd->max_rq_in_driver = 0; -+ bfqd->hw_tag_samples = 0; -+} -+ -+static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) -+{ -+ u64 now_ns; -+ u32 delta_us; -+ -+ bfq_update_hw_tag(bfqd); -+ -+ BUG_ON(!bfqd->rq_in_driver); -+ BUG_ON(!bfqq->dispatched); -+ bfqd->rq_in_driver--; -+ bfqq->dispatched--; -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "new disp %d, new rq_in_driver %d", -+ bfqq->dispatched, bfqd->rq_in_driver); -+ -+ if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); -+ /* -+ * Set budget_timeout (which we overload to store the -+ * time at which the queue remains with no backlog and -+ * no outstanding request; used by the weight-raising -+ * mechanism). -+ */ -+ bfqq->budget_timeout = jiffies; -+ -+ bfq_weights_tree_remove(bfqd, bfqq); -+ } -+ -+ now_ns = ktime_get_ns(); -+ -+ bfqq->ttime.last_end_request = now_ns; -+ -+ /* -+ * Using us instead of ns, to get a reasonable precision in -+ * computing rate in next check. -+ */ -+ delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "delta %uus/%luus max_size %u rate %llu/%llu", -+ delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size, -+ delta_us > 0 ? 
-+ (USEC_PER_SEC* -+ (u64)((bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us)) -+ >>BFQ_RATE_SHIFT : -+ (USEC_PER_SEC* -+ (u64)(bfqd->last_rq_max_size<<BFQ_RATE_SHIFT))>>BFQ_RATE_SHIFT, -+ (USEC_PER_SEC*(u64)(1UL<<(BFQ_RATE_SHIFT-10)))>>BFQ_RATE_SHIFT); -+ -+ /* -+ * If the request took rather long to complete, and, according -+ * to the maximum request size recorded, this completion latency -+ * implies that the request was certainly served at a very low -+ * rate (less than 1M sectors/sec), then the whole observation -+ * interval that lasts up to this time instant cannot be a -+ * valid time interval for computing a new peak rate. Invoke -+ * bfq_update_rate_reset to have the following three steps -+ * taken: -+ * - close the observation interval at the last (previous) -+ * request dispatch or completion -+ * - compute rate, if possible, for that observation interval -+ * - reset to zero samples, which will trigger a proper -+ * re-initialization of the observation interval on next -+ * dispatch -+ */ -+ if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC && -+ (bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us < -+ 1UL<<(BFQ_RATE_SHIFT - 10)) -+ bfq_update_rate_reset(bfqd, NULL); -+ bfqd->last_completion = now_ns; -+ -+ /* -+ * If we are waiting to discover whether the request pattern -+ * of the task associated with the queue is actually -+ * isochronous, and both requisites for this condition to hold -+ * are now satisfied, then compute soft_rt_next_start (see the -+ * comments on the function bfq_bfqq_softrt_next_start()). We -+ * do not compute soft_rt_next_start if bfqq is in interactive -+ * weight raising (see the comments in bfq_bfqq_expire() for -+ * an explanation). We schedule this delayed update when bfqq -+ * expires, if it still has in-flight requests. 
-+ */ -+ if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 && -+ RB_EMPTY_ROOT(&bfqq->sort_list) && -+ bfqq->wr_coeff != bfqd->bfq_wr_coeff) -+ bfqq->soft_rt_next_start = -+ bfq_bfqq_softrt_next_start(bfqd, bfqq); -+ -+ /* -+ * If this is the in-service queue, check if it needs to be expired, -+ * or if we want to idle in case it has no pending requests. -+ */ -+ if (bfqd->in_service_queue == bfqq) { -+ if (bfq_bfqq_must_idle(bfqq)) { -+ if (bfqq->dispatched == 0) -+ bfq_arm_slice_timer(bfqd); -+ /* -+ * If we get here, we do not expire bfqq, even -+ * if bfqq was in budget timeout or had no -+ * more requests (as controlled in the next -+ * conditional instructions). The reason for -+ * not expiring bfqq is as follows. -+ * -+ * Here bfqq->dispatched > 0 holds, but -+ * bfq_bfqq_must_idle() returned true. This -+ * implies that, even if no request arrives -+ * for bfqq before bfqq->dispatched reaches 0, -+ * bfqq will, however, not be expired on the -+ * completion event that causes bfqq->dispatch -+ * to reach zero. In contrast, on this event, -+ * bfqq will start enjoying device idling -+ * (I/O-dispatch plugging). -+ * -+ * But, if we expired bfqq here, bfqq would -+ * not have the chance to enjoy device idling -+ * when bfqq->dispatched finally reaches -+ * zero. This would expose bfqq to violation -+ * of its reserved service guarantees. -+ */ -+ return; -+ } else if (bfq_may_expire_for_budg_timeout(bfqq)) -+ bfq_bfqq_expire(bfqd, bfqq, false, -+ BFQ_BFQQ_BUDGET_TIMEOUT); -+ else if (RB_EMPTY_ROOT(&bfqq->sort_list) && -+ (bfqq->dispatched == 0 || -+ !bfq_better_to_idle(bfqq))) -+ bfq_bfqq_expire(bfqd, bfqq, false, -+ BFQ_BFQQ_NO_MORE_REQUESTS); -+ } -+} -+ -+static void bfq_finish_requeue_request_body(struct bfq_queue *bfqq) -+{ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "allocated %d", bfqq->allocated); -+ BUG_ON(!bfqq->allocated); -+ bfqq->allocated--; -+ -+ bfq_put_queue(bfqq); -+} -+ -+/* -+ * Handle either a requeue or a finish for rq. 
The things to do are -+ * the same in both cases: all references to rq are to be dropped. In -+ * particular, rq is considered completed from the point of view of -+ * the scheduler. -+ */ -+static void bfq_finish_requeue_request(struct request *rq) -+{ -+ struct bfq_queue *bfqq; -+ struct bfq_data *bfqd; -+ struct bfq_io_cq *bic; -+ -+ BUG_ON(!rq); -+ -+ bfqq = RQ_BFQQ(rq); -+ -+ /* -+ * Requeue and finish hooks are invoked in blk-mq without -+ * checking whether the involved request is actually still -+ * referenced in the scheduler. To handle this fact, the -+ * following two checks make this function exit in case of -+ * spurious invocations, for which there is nothing to do. -+ * -+ * First, check whether rq has nothing to do with an elevator. -+ */ -+ if (unlikely(!(rq->rq_flags & RQF_ELVPRIV))) -+ return; -+ -+ /* -+ * rq either is not associated with any icq, or is an already -+ * requeued request that has not (yet) been re-inserted into -+ * a bfq_queue. -+ */ -+ if (!rq->elv.icq || !bfqq) -+ return; -+ -+ bic = RQ_BIC(rq); -+ BUG_ON(!bic); -+ -+ bfqd = bfqq->bfqd; -+ BUG_ON(!bfqd); -+ -+ if (rq->rq_flags & RQF_DISP_LIST) { -+ pr_crit("putting disp rq %p for %d", rq, bfqq->pid); -+ BUG(); -+ } -+ BUG_ON(rq->rq_flags & RQF_QUEUED); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "putting rq %p with %u sects left, STARTED %d", -+ rq, blk_rq_sectors(rq), -+ rq->rq_flags & RQF_STARTED); -+ -+ if (rq->rq_flags & RQF_STARTED) -+ bfqg_stats_update_completion(bfqq_group(bfqq), -+ rq->start_time_ns, -+ rq->io_start_time_ns, -+ rq->cmd_flags); -+ -+ WARN_ON(blk_rq_sectors(rq) == 0 && !(rq->rq_flags & RQF_STARTED)); -+ -+ if (likely(rq->rq_flags & RQF_STARTED)) { -+ unsigned long flags; -+ -+ spin_lock_irqsave(&bfqd->lock, flags); -+ -+ bfq_completed_request(bfqq, bfqd); -+ bfq_finish_requeue_request_body(bfqq); -+ -+ spin_unlock_irqrestore(&bfqd->lock, flags); -+ } else { -+ /* -+ * Request rq may be still/already in the scheduler, -+ * in which case we need to remove it (this 
should -+ * never happen in case of requeue). And we cannot -+ * defer such a check and removal, to avoid -+ * inconsistencies in the time interval from the end -+ * of this function to the start of the deferred work. -+ * This situation seems to occur only in process -+ * context, as a consequence of a merge. In the -+ * current version of the code, this implies that the -+ * lock is held. -+ */ -+ BUG_ON(in_interrupt()); -+ -+ assert_spin_locked(&bfqd->lock); -+ if (!RB_EMPTY_NODE(&rq->rb_node)) { -+ bfq_remove_request(rq->q, rq); -+ bfqg_stats_update_io_remove(bfqq_group(bfqq), -+ rq->cmd_flags); -+ } -+ bfq_finish_requeue_request_body(bfqq); -+ } -+ -+ /* -+ * Reset private fields. In case of a requeue, this allows -+ * this function to correctly do nothing if it is spuriously -+ * invoked again on this same request (see the check at the -+ * beginning of the function). Probably, a better general -+ * design would be to prevent blk-mq from invoking the requeue -+ * or finish hooks of an elevator, for a request that is not -+ * referred by that elevator. -+ * -+ * Resetting the following fields would break the -+ * request-insertion logic if rq is re-inserted into a bfq -+ * internal queue, without a re-preparation. Here we assume -+ * that re-insertions of requeued requests, without -+ * re-preparation, can happen only for pass_through or at_head -+ * requests (which are not re-inserted into bfq internal -+ * queues). -+ */ -+ rq->elv.priv[0] = NULL; -+ rq->elv.priv[1] = NULL; -+} -+ -+/* -+ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this -+ * was the last process referring to that bfqq. 
-+ */ -+static struct bfq_queue * -+bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) -+{ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); -+ -+ if (bfqq_process_refs(bfqq) == 1) { -+ bfqq->pid = current->pid; -+ bfq_clear_bfqq_coop(bfqq); -+ bfq_clear_bfqq_split_coop(bfqq); -+ return bfqq; -+ } -+ -+ bic_set_bfqq(bic, NULL, 1); -+ -+ bfq_put_cooperator(bfqq); -+ -+ bfq_put_queue(bfqq); -+ return NULL; -+} -+ -+static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, -+ struct bfq_io_cq *bic, -+ struct bio *bio, -+ bool split, bool is_sync, -+ bool *new_queue) -+{ -+ struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync); -+ -+ if (likely(bfqq && bfqq != &bfqd->oom_bfqq)) -+ return bfqq; -+ -+ if (new_queue) -+ *new_queue = true; -+ -+ if (bfqq) -+ bfq_put_queue(bfqq); -+ bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); -+ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); -+ -+ bic_set_bfqq(bic, bfqq, is_sync); -+ if (split && is_sync) { -+ bfq_log_bfqq(bfqd, bfqq, -+ "get_request: was_in_list %d " -+ "was_in_large_burst %d " -+ "large burst in progress %d", -+ bic->was_in_burst_list, -+ bic->saved_in_large_burst, -+ bfqd->large_burst); -+ -+ if ((bic->was_in_burst_list && bfqd->large_burst) || -+ bic->saved_in_large_burst) { -+ bfq_log_bfqq(bfqd, bfqq, -+ "get_request: marking in " -+ "large burst"); -+ bfq_mark_bfqq_in_large_burst(bfqq); -+ } else { -+ bfq_log_bfqq(bfqd, bfqq, -+ "get_request: clearing in " -+ "large burst"); -+ bfq_clear_bfqq_in_large_burst(bfqq); -+ if (bic->was_in_burst_list) -+ /* -+ * If bfqq was in the current -+ * burst list before being -+ * merged, then we have to add -+ * it back. And we do not need -+ * to increase burst_size, as -+ * we did not decrement -+ * burst_size when we removed -+ * bfqq from the burst list as -+ * a consequence of a merge -+ * (see comments in -+ * bfq_put_queue). 
In this -+ * respect, it would be rather -+ * costly to know whether the -+ * current burst list is still -+ * the same burst list from -+ * which bfqq was removed on -+ * the merge. To avoid this -+ * cost, if bfqq was in a -+ * burst list, then we add -+ * bfqq to the current burst -+ * list without any further -+ * check. This can cause -+ * inappropriate insertions, -+ * but rarely enough to not -+ * harm the detection of large -+ * bursts significantly. -+ */ -+ hlist_add_head(&bfqq->burst_list_node, -+ &bfqd->burst_list); -+ } -+ bfqq->split_time = jiffies; -+ } -+ -+ return bfqq; -+} -+ -+/* -+ * Only reset private fields. The actual request preparation will be -+ * performed by bfq_init_rq, when rq is either inserted or merged. See -+ * comments on bfq_init_rq for the reason behind this delayed -+ * preparation. -+*/ -+static void bfq_prepare_request(struct request *rq, struct bio *bio) -+{ -+ /* -+ * Regardless of whether we have an icq attached, we have to -+ * clear the scheduler pointers, as they might point to -+ * previously allocated bic/bfqq structs. -+ */ -+ rq->elv.priv[0] = rq->elv.priv[1] = NULL; -+} -+ -+/* -+ * If needed, init rq, allocate bfq data structures associated with -+ * rq, and increment reference counters in the destination bfq_queue -+ * for rq. Return the destination bfq_queue for rq, or NULL if rq is -+ * not associated with any bfq_queue. -+ * -+ * This function is invoked by the functions that perform rq insertion -+ * or merging. One may have expected the above preparation operations -+ * to be performed in bfq_prepare_request, and not delayed to when rq -+ * is inserted or merged. The rationale behind this delayed -+ * preparation is that, after the prepare_request hook is invoked for -+ * rq, rq may still be transformed into a request with no icq, i.e., a -+ * request not associated with any queue. No bfq hook is invoked to -+ * signal this transformation. 
As a consequence, should these -+ * preparation operations be performed when the prepare_request hook -+ * is invoked, and should rq be transformed one moment later, bfq -+ * would end up in an inconsistent state, because it would have -+ * incremented some queue counters for an rq destined to -+ * transformation, without any chance to correctly lower these -+ * counters back. In contrast, no transformation can still happen for -+ * rq after rq has been inserted or merged. So, it is safe to execute -+ * these preparation operations when rq is finally inserted or merged. -+ */ -+static struct bfq_queue *bfq_init_rq(struct request *rq) -+{ -+ struct request_queue *q = rq->q; -+ struct bio *bio = rq->bio; -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct bfq_io_cq *bic; -+ const int is_sync = rq_is_sync(rq); -+ struct bfq_queue *bfqq; -+ bool bfqq_already_existing = false, split = false; -+ bool new_queue = false; -+ -+ if (unlikely(!rq->elv.icq)) -+ return NULL; -+ -+ /* -+ * Assuming that elv.priv[1] is set only if everything is set -+ * for this rq. This holds true, because this function is -+ * invoked only for insertion or merging, and, after such -+ * events, a request cannot be manipulated any longer before -+ * being removed from bfq. -+ */ -+ if (rq->elv.priv[1]) { -+ BUG_ON(!(rq->rq_flags & RQF_ELVPRIV)); -+ return rq->elv.priv[1]; -+ } -+ -+ bic = icq_to_bic(rq->elv.icq); -+ -+ bfq_check_ioprio_change(bic, bio); -+ -+ bfq_bic_update_cgroup(bic, bio); -+ -+ bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync, -+ &new_queue); -+ -+ if (likely(!new_queue)) { -+ /* If the queue was seeky for too long, break it apart. 
*/ -+ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { -+ BUG_ON(!is_sync); -+ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); -+ -+ /* Update bic before losing reference to bfqq */ -+ if (bfq_bfqq_in_large_burst(bfqq)) -+ bic->saved_in_large_burst = true; -+ -+ bfqq = bfq_split_bfqq(bic, bfqq); -+ split = true; -+ -+ if (!bfqq) -+ bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, -+ true, is_sync, -+ NULL); -+ else -+ bfqq_already_existing = true; -+ -+ BUG_ON(!bfqq); -+ BUG_ON(bfqq == &bfqd->oom_bfqq); -+ } -+ } -+ -+ bfqq->allocated++; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "new allocated %d", bfqq->allocated); -+ -+ bfqq->ref++; -+ bfq_log_bfqq(bfqd, bfqq, "%p: bfqq %p, %d", rq, bfqq, bfqq->ref); -+ -+ rq->elv.priv[0] = bic; -+ rq->elv.priv[1] = bfqq; -+ rq->rq_flags &= ~RQF_DISP_LIST; -+ -+ /* -+ * If a bfq_queue has only one process reference, it is owned -+ * by only this bic: we can then set bfqq->bic = bic. in -+ * addition, if the queue has also just been split, we have to -+ * resume its state. -+ */ -+ if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { -+ bfqq->bic = bic; -+ if (split) { -+ /* -+ * The queue has just been split from a shared -+ * queue: restore the idle window and the -+ * possible weight raising period. 
-+ */ -+ bfq_bfqq_resume_state(bfqq, bfqd, bic, -+ bfqq_already_existing); -+ } -+ } -+ -+ if (unlikely(bfq_bfqq_just_created(bfqq))) -+ bfq_handle_burst(bfqd, bfqq); -+ -+ rq->rq_flags |= RQF_GOT; -+ -+ return bfqq; -+} -+ -+static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) -+{ -+ struct bfq_data *bfqd = bfqq->bfqd; -+ enum bfqq_expiration reason; -+ unsigned long flags; -+ -+ BUG_ON(!bfqd); -+ spin_lock_irqsave(&bfqd->lock, flags); -+ -+ bfq_log_bfqq(bfqd, bfqq, "handling slice_timer expiration"); -+ bfq_clear_bfqq_wait_request(bfqq); -+ -+ if (bfqq != bfqd->in_service_queue) { -+ spin_unlock_irqrestore(&bfqd->lock, flags); -+ return; -+ } -+ -+ if (bfq_bfqq_budget_timeout(bfqq)) -+ /* -+ * Also here the queue can be safely expired -+ * for budget timeout without wasting -+ * guarantees -+ */ -+ reason = BFQ_BFQQ_BUDGET_TIMEOUT; -+ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) -+ /* -+ * The queue may not be empty upon timer expiration, -+ * because we may not disable the timer when the -+ * first request of the in-service queue arrives -+ * during disk idling. -+ */ -+ reason = BFQ_BFQQ_TOO_IDLE; -+ else -+ goto schedule_dispatch; -+ -+ bfq_bfqq_expire(bfqd, bfqq, true, reason); -+ -+schedule_dispatch: -+ spin_unlock_irqrestore(&bfqd->lock, flags); -+ bfq_schedule_dispatch(bfqd); -+} -+ -+/* -+ * Handler of the expiration of the timer running if the in-service queue -+ * is idling inside its time slice. -+ */ -+static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) -+{ -+ struct bfq_data *bfqd = container_of(timer, struct bfq_data, -+ idle_slice_timer); -+ struct bfq_queue *bfqq = bfqd->in_service_queue; -+ -+ bfq_log(bfqd, "expired"); -+ -+ /* -+ * Theoretical race here: the in-service queue can be NULL or -+ * different from the queue that was idling if a new request -+ * arrives for the current queue and there is a full dispatch -+ * cycle that changes the in-service queue. 
This can hardly -+ * happen, but in the worst case we just expire a queue too -+ * early. -+ */ -+ if (bfqq) -+ bfq_idle_slice_timer_body(bfqq); -+ -+ return HRTIMER_NORESTART; -+} -+ -+static void __bfq_put_async_bfqq(struct bfq_data *bfqd, -+ struct bfq_queue **bfqq_ptr) -+{ -+ struct bfq_group *root_group = bfqd->root_group; -+ struct bfq_queue *bfqq = *bfqq_ptr; -+ -+ bfq_log(bfqd, "%p", bfqq); -+ if (bfqq) { -+ bfq_bfqq_move(bfqd, bfqq, root_group); -+ bfq_log_bfqq(bfqd, bfqq, "putting %p, %d", -+ bfqq, bfqq->ref); -+ bfq_put_queue(bfqq); -+ *bfqq_ptr = NULL; -+ } -+} -+ -+/* -+ * Release all the bfqg references to its async queues. If we are -+ * deallocating the group these queues may still contain requests, so -+ * we reparent them to the root cgroup (i.e., the only one that will -+ * exist for sure until all the requests on a device are gone). -+ */ -+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) -+{ -+ int i, j; -+ -+ for (i = 0; i < 2; i++) -+ for (j = 0; j < IOPRIO_BE_NR; j++) -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); -+ -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); -+} -+ -+/* -+ * See the comments on bfq_limit_depth for the purpose of -+ * the depths set in the function. Return minimum shallow depth we'll use. -+ */ -+static unsigned int bfq_update_depths(struct bfq_data *bfqd, -+ struct sbitmap_queue *bt) -+{ -+ unsigned int i, j, min_shallow = UINT_MAX; -+ -+ /* -+ * In-word depths if no bfq_queue is being weight-raised: -+ * leaving 25% of tags only for sync reads. -+ * -+ * In next formulas, right-shift the value -+ * (1U<<bt->sb.shift), instead of computing directly -+ * (1U<<(bt->sb.shift - something)), to be robust against -+ * any possible value of bt->sb.shift, without having to -+ * limit 'something'. 
-+ */ -+ /* no more than 50% of tags for async I/O */ -+ bfqd->word_depths[0][0] = max((1U<<bt->sb.shift)>>1, 1U); -+ /* -+ * no more than 75% of tags for sync writes (25% extra tags -+ * w.r.t. async I/O, to prevent async I/O from starving sync -+ * writes) -+ */ -+ bfqd->word_depths[0][1] = max(((1U<<bt->sb.shift) * 3)>>2, 1U); -+ -+ /* -+ * In-word depths in case some bfq_queue is being weight- -+ * raised: leaving ~63% of tags for sync reads. This is the -+ * highest percentage for which, in our tests, application -+ * start-up times didn't suffer from any regression due to tag -+ * shortage. -+ */ -+ /* no more than ~18% of tags for async I/O */ -+ bfqd->word_depths[1][0] = max(((1U<<bt->sb.shift) * 3)>>4, 1U); -+ /* no more than ~37% of tags for sync writes (~20% extra tags) */ -+ bfqd->word_depths[1][1] = max(((1U<<bt->sb.shift) * 6)>>4, 1U); -+ -+ for (i = 0; i < 2; i++) -+ for (j = 0; j < 2; j++) -+ min_shallow = min(min_shallow, bfqd->word_depths[i][j]); -+ -+ return min_shallow; -+} -+ -+static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx) -+{ -+ struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; -+ struct blk_mq_tags *tags = hctx->sched_tags; -+ unsigned int min_shallow; -+ -+ min_shallow = bfq_update_depths(bfqd, &tags->bitmap_tags); -+ sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, min_shallow); -+} -+ -+static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index) -+{ -+ bfq_depth_updated(hctx); -+ return 0; -+} -+ -+static void bfq_exit_queue(struct elevator_queue *e) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ struct bfq_queue *bfqq, *n; -+ -+ bfq_log(bfqd, "starting ..."); -+ -+ hrtimer_cancel(&bfqd->idle_slice_timer); -+ -+ BUG_ON(bfqd->in_service_queue); -+ BUG_ON(!list_empty(&bfqd->active_list)); -+ -+ spin_lock_irq(&bfqd->lock); -+ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) -+ bfq_deactivate_bfqq(bfqd, bfqq, false, false); -+ spin_unlock_irq(&bfqd->lock); -+ -+ 
hrtimer_cancel(&bfqd->idle_slice_timer); -+ -+ BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ /* release oom-queue reference to root group */ -+ bfqg_and_blkg_put(bfqd->root_group); -+ -+ blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq); -+#else -+ spin_lock_irq(&bfqd->lock); -+ bfq_put_async_queues(bfqd, bfqd->root_group); -+ kfree(bfqd->root_group); -+ spin_unlock_irq(&bfqd->lock); -+#endif -+ -+ bfq_log(bfqd, "finished ..."); -+ kfree(bfqd); -+} -+ -+static void bfq_init_root_group(struct bfq_group *root_group, -+ struct bfq_data *bfqd) -+{ -+ int i; -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ root_group->entity.parent = NULL; -+ root_group->my_entity = NULL; -+ root_group->bfqd = bfqd; -+#endif -+ root_group->rq_pos_tree = RB_ROOT; -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) -+ root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; -+ root_group->sched_data.bfq_class_idle_last_service = jiffies; -+} -+ -+static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) -+{ -+ struct bfq_data *bfqd; -+ struct elevator_queue *eq; -+ -+ eq = elevator_alloc(q, e); -+ if (!eq) -+ return -ENOMEM; -+ -+ bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node); -+ if (!bfqd) { -+ kobject_put(&eq->kobj); -+ return -ENOMEM; -+ } -+ eq->elevator_data = bfqd; -+ -+ spin_lock_irq(q->queue_lock); -+ q->elevator = eq; -+ spin_unlock_irq(q->queue_lock); -+ -+ /* -+ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. -+ * Grab a permanent reference to it, so that the normal code flow -+ * will not attempt to free it. 
-+ */ -+ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); -+ bfqd->oom_bfqq.ref++; -+ bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; -+ bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE; -+ bfqd->oom_bfqq.entity.new_weight = -+ bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio); -+ -+ /* oom_bfqq does not participate to bursts */ -+ bfq_clear_bfqq_just_created(&bfqd->oom_bfqq); -+ /* -+ * Trigger weight initialization, according to ioprio, at the -+ * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio -+ * class won't be changed any more. -+ */ -+ bfqd->oom_bfqq.entity.prio_changed = 1; -+ -+ bfqd->queue = q; -+ INIT_LIST_HEAD(&bfqd->dispatch); -+ -+ hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC, -+ HRTIMER_MODE_REL); -+ bfqd->idle_slice_timer.function = bfq_idle_slice_timer; -+ -+ bfqd->queue_weights_tree = RB_ROOT; -+ bfqd->num_groups_with_pending_reqs = 0; -+ -+ INIT_LIST_HEAD(&bfqd->active_list); -+ INIT_LIST_HEAD(&bfqd->idle_list); -+ INIT_HLIST_HEAD(&bfqd->burst_list); -+ -+ bfqd->hw_tag = -1; -+ -+ bfqd->bfq_max_budget = bfq_default_max_budget; -+ -+ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0]; -+ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1]; -+ bfqd->bfq_back_max = bfq_back_max; -+ bfqd->bfq_back_penalty = bfq_back_penalty; -+ bfqd->bfq_slice_idle = bfq_slice_idle; -+ bfqd->bfq_timeout = bfq_timeout; -+ -+ bfqd->bfq_requests_within_timer = 120; -+ -+ bfqd->bfq_large_burst_thresh = 8; -+ bfqd->bfq_burst_interval = msecs_to_jiffies(180); -+ -+ bfqd->low_latency = true; -+ -+ /* -+ * Trade-off between responsiveness and fairness. -+ */ -+ bfqd->bfq_wr_coeff = 30; -+ bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300); -+ bfqd->bfq_wr_max_time = 0; -+ bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000); -+ bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500); -+ bfqd->bfq_wr_max_softrt_rate = 7000; /* -+ * Approximate rate required -+ * to playback or record a -+ * high-definition compressed -+ * video. 
-+ */ -+ bfqd->wr_busy_queues = 0; -+ -+ /* -+ * Begin by assuming, optimistically, that the device peak -+ * rate is equal to 2/3 of the highest reference rate. -+ */ -+ bfqd->rate_dur_prod = ref_rate[blk_queue_nonrot(bfqd->queue)] * -+ ref_wr_duration[blk_queue_nonrot(bfqd->queue)]; -+ bfqd->peak_rate = ref_rate[blk_queue_nonrot(bfqd->queue)] * 2 / 3; -+ -+ spin_lock_init(&bfqd->lock); -+ -+ /* -+ * The invocation of the next bfq_create_group_hierarchy -+ * function is the head of a chain of function calls -+ * (bfq_create_group_hierarchy->blkcg_activate_policy-> -+ * blk_mq_freeze_queue) that may lead to the invocation of the -+ * has_work hook function. For this reason, -+ * bfq_create_group_hierarchy is invoked only after all -+ * scheduler data has been initialized, apart from the fields -+ * that can be initialized only after invoking -+ * bfq_create_group_hierarchy. This, in particular, enables -+ * has_work to correctly return false. Of course, to avoid -+ * other inconsistencies, the blk-mq stack must then refrain -+ * from invoking further scheduler hooks before this init -+ * function is finished. 
-+ */ -+ bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node); -+ if (!bfqd->root_group) -+ goto out_free; -+ bfq_init_root_group(bfqd->root_group, bfqd); -+ bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); -+ -+ wbt_disable_default(q); -+ return 0; -+ -+out_free: -+ kfree(bfqd); -+ kobject_put(&eq->kobj); -+ return -ENOMEM; -+} -+ -+static void bfq_slab_kill(void) -+{ -+ kmem_cache_destroy(bfq_pool); -+} -+ -+static int __init bfq_slab_setup(void) -+{ -+ bfq_pool = KMEM_CACHE(bfq_queue, 0); -+ if (!bfq_pool) -+ return -ENOMEM; -+ return 0; -+} -+ -+static ssize_t bfq_var_show(unsigned int var, char *page) -+{ -+ return sprintf(page, "%u\n", var); -+} -+ -+static ssize_t bfq_var_store(unsigned long *var, const char *page, -+ size_t count) -+{ -+ unsigned long new_val; -+ int ret = kstrtoul(page, 10, &new_val); -+ -+ if (ret == 0) -+ *var = new_val; -+ -+ return count; -+} -+ -+static ssize_t bfq_wr_max_time_show(struct elevator_queue *e, char *page) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ -+ return sprintf(page, "%d\n", bfqd->bfq_wr_max_time > 0 ? 
-+ jiffies_to_msecs(bfqd->bfq_wr_max_time) : -+ jiffies_to_msecs(bfq_wr_duration(bfqd))); -+} -+ -+static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) -+{ -+ struct bfq_queue *bfqq; -+ struct bfq_data *bfqd = e->elevator_data; -+ ssize_t num_char = 0; -+ -+ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n", -+ bfqd->queued); -+ -+ spin_lock_irq(&bfqd->lock); -+ -+ num_char += sprintf(page + num_char, "Active:\n"); -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) { -+ num_char += sprintf(page + num_char, -+ "pid%d: weight %hu, nr_queued %d %d, ", -+ bfqq->pid, -+ bfqq->entity.weight, -+ bfqq->queued[0], -+ bfqq->queued[1]); -+ num_char += sprintf(page + num_char, -+ "dur %d/%u\n", -+ jiffies_to_msecs( -+ jiffies - -+ bfqq->last_wr_start_finish), -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } -+ -+ num_char += sprintf(page + num_char, "Idle:\n"); -+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) { -+ num_char += sprintf(page + num_char, -+ "pid%d: weight %hu, dur %d/%u\n", -+ bfqq->pid, -+ bfqq->entity.weight, -+ jiffies_to_msecs(jiffies - -+ bfqq->last_wr_start_finish), -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } -+ -+ spin_unlock_irq(&bfqd->lock); -+ -+ return num_char; -+} -+ -+#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ -+static ssize_t __FUNC(struct elevator_queue *e, char *page) \ -+{ \ -+ struct bfq_data *bfqd = e->elevator_data; \ -+ u64 __data = __VAR; \ -+ if (__CONV == 1) \ -+ __data = jiffies_to_msecs(__data); \ -+ else if (__CONV == 2) \ -+ __data = div_u64(__data, NSEC_PER_MSEC); \ -+ return bfq_var_show(__data, (page)); \ -+} -+SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2); -+SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2); -+SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); -+SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); -+SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2); 
-+SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); -+SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1); -+SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0); -+SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); -+SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0); -+SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1); -+SHOW_FUNCTION(bfq_wr_min_idle_time_show, bfqd->bfq_wr_min_idle_time, 1); -+SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async, -+ 1); -+SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0); -+#undef SHOW_FUNCTION -+ -+#define USEC_SHOW_FUNCTION(__FUNC, __VAR) \ -+static ssize_t __FUNC(struct elevator_queue *e, char *page) \ -+{ \ -+ struct bfq_data *bfqd = e->elevator_data; \ -+ u64 __data = __VAR; \ -+ __data = div_u64(__data, NSEC_PER_USEC); \ -+ return bfq_var_show(__data, (page)); \ -+} -+USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle); -+#undef USEC_SHOW_FUNCTION -+ -+#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ -+static ssize_t \ -+__FUNC(struct elevator_queue *e, const char *page, size_t count) \ -+{ \ -+ struct bfq_data *bfqd = e->elevator_data; \ -+ unsigned long uninitialized_var(__data); \ -+ int ret = bfq_var_store(&__data, (page), count); \ -+ if (__data < (MIN)) \ -+ __data = (MIN); \ -+ else if (__data > (MAX)) \ -+ __data = (MAX); \ -+ if (__CONV == 1) \ -+ *(__PTR) = msecs_to_jiffies(__data); \ -+ else if (__CONV == 2) \ -+ *(__PTR) = (u64)__data * NSEC_PER_MSEC; \ -+ else \ -+ *(__PTR) = __data; \ -+ return ret; \ -+} -+STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, -+ INT_MAX, 2); -+STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, -+ INT_MAX, 2); -+STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); -+STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, -+ INT_MAX, 0); 
-+STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2); -+STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0); -+STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1); -+STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX, -+ 1); -+STORE_FUNCTION(bfq_wr_min_idle_time_store, &bfqd->bfq_wr_min_idle_time, 0, -+ INT_MAX, 1); -+STORE_FUNCTION(bfq_wr_min_inter_arr_async_store, -+ &bfqd->bfq_wr_min_inter_arr_async, 0, INT_MAX, 1); -+STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0, -+ INT_MAX, 0); -+#undef STORE_FUNCTION -+ -+#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ -+static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\ -+{ \ -+ struct bfq_data *bfqd = e->elevator_data; \ -+ unsigned long uninitialized_var(__data); \ -+ int ret = bfq_var_store(&__data, (page), count); \ -+ if (__data < (MIN)) \ -+ __data = (MIN); \ -+ else if (__data > (MAX)) \ -+ __data = (MAX); \ -+ *(__PTR) = (u64)__data * NSEC_PER_USEC; \ -+ return ret; \ -+} -+USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0, -+ UINT_MAX); -+#undef USEC_STORE_FUNCTION -+ -+/* do nothing for the moment */ -+static ssize_t bfq_weights_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ return count; -+} -+ -+static ssize_t bfq_max_budget_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ unsigned long uninitialized_var(__data); -+ int ret = bfq_var_store(&__data, (page), count); -+ -+ if (__data == 0) -+ bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); -+ else { -+ if (__data > INT_MAX) -+ __data = INT_MAX; -+ bfqd->bfq_max_budget = __data; -+ } -+ -+ bfqd->bfq_user_max_budget = __data; -+ -+ return ret; -+} -+ -+/* -+ * Leaving this name to preserve name compatibility with cfq -+ * parameters, but this timeout is used for both sync and async. 
-+ */ -+static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ unsigned long uninitialized_var(__data); -+ int ret = bfq_var_store(&__data, (page), count); -+ -+ if (__data < 1) -+ __data = 1; -+ else if (__data > INT_MAX) -+ __data = INT_MAX; -+ -+ bfqd->bfq_timeout = msecs_to_jiffies(__data); -+ if (bfqd->bfq_user_max_budget == 0) -+ bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); -+ -+ return ret; -+} -+ -+static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ unsigned long uninitialized_var(__data); -+ int ret = bfq_var_store(&__data, (page), count); -+ -+ if (__data > 1) -+ __data = 1; -+ if (!bfqd->strict_guarantees && __data == 1 -+ && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC) -+ bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC; -+ -+ bfqd->strict_guarantees = __data; -+ -+ return ret; -+} -+ -+static ssize_t bfq_low_latency_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ unsigned long uninitialized_var(__data); -+ int ret = bfq_var_store(&__data, (page), count); -+ -+ if (__data > 1) -+ __data = 1; -+ if (__data == 0 && bfqd->low_latency != 0) -+ bfq_end_wr(bfqd); -+ bfqd->low_latency = __data; -+ -+ return ret; -+} -+ -+#define BFQ_ATTR(name) \ -+ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store) -+ -+static struct elv_fs_entry bfq_attrs[] = { -+ BFQ_ATTR(fifo_expire_sync), -+ BFQ_ATTR(fifo_expire_async), -+ BFQ_ATTR(back_seek_max), -+ BFQ_ATTR(back_seek_penalty), -+ BFQ_ATTR(slice_idle), -+ BFQ_ATTR(slice_idle_us), -+ BFQ_ATTR(max_budget), -+ BFQ_ATTR(timeout_sync), -+ BFQ_ATTR(strict_guarantees), -+ BFQ_ATTR(low_latency), -+ BFQ_ATTR(wr_coeff), -+ BFQ_ATTR(wr_max_time), -+ BFQ_ATTR(wr_rt_max_time), -+ BFQ_ATTR(wr_min_idle_time), -+ BFQ_ATTR(wr_min_inter_arr_async), -+ 
BFQ_ATTR(wr_max_softrt_rate), -+ BFQ_ATTR(weights), -+ __ATTR_NULL -+}; -+ -+static struct elevator_type iosched_bfq_mq = { -+ .ops.mq = { -+ .limit_depth = bfq_limit_depth, -+ .prepare_request = bfq_prepare_request, -+ .requeue_request = bfq_finish_requeue_request, -+ .finish_request = bfq_finish_requeue_request, -+ .exit_icq = bfq_exit_icq, -+ .insert_requests = bfq_insert_requests, -+ .dispatch_request = bfq_dispatch_request, -+ .next_request = elv_rb_latter_request, -+ .former_request = elv_rb_former_request, -+ .allow_merge = bfq_allow_bio_merge, -+ .bio_merge = bfq_bio_merge, -+ .request_merge = bfq_request_merge, -+ .requests_merged = bfq_requests_merged, -+ .request_merged = bfq_request_merged, -+ .has_work = bfq_has_work, -+ .depth_updated = bfq_depth_updated, -+ .init_hctx = bfq_init_hctx, -+ .init_sched = bfq_init_queue, -+ .exit_sched = bfq_exit_queue, -+ }, -+ -+ .uses_mq = true, -+ .icq_size = sizeof(struct bfq_io_cq), -+ .icq_align = __alignof__(struct bfq_io_cq), -+ .elevator_attrs = bfq_attrs, -+ .elevator_name = "bfq-mq", -+ .elevator_owner = THIS_MODULE, -+}; -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+static struct blkcg_policy blkcg_policy_bfq = { -+ .dfl_cftypes = bfq_blkg_files, -+ .legacy_cftypes = bfq_blkcg_legacy_files, -+ -+ .cpd_alloc_fn = bfq_cpd_alloc, -+ .cpd_init_fn = bfq_cpd_init, -+ .cpd_bind_fn = bfq_cpd_init, -+ .cpd_free_fn = bfq_cpd_free, -+ -+ .pd_alloc_fn = bfq_pd_alloc, -+ .pd_init_fn = bfq_pd_init, -+ .pd_offline_fn = bfq_pd_offline, -+ .pd_free_fn = bfq_pd_free, -+ .pd_reset_stats_fn = bfq_pd_reset_stats, -+}; -+#endif -+ -+static int __init bfq_init(void) -+{ -+ int ret; -+ char msg[60] = "BFQ I/O-scheduler: v9"; -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ ret = blkcg_policy_register(&blkcg_policy_bfq); -+ if (ret) -+ return ret; -+#endif -+ -+ ret = -ENOMEM; -+ if (bfq_slab_setup()) -+ goto err_pol_unreg; -+ -+ /* -+ * Times to load large popular applications for the typical -+ * systems installed on the reference devices (see 
the -+ * comments before the definition of the next -+ * array). Actually, we use slightly lower values, as the -+ * estimated peak rate tends to be smaller than the actual -+ * peak rate. The reason for this last fact is that estimates -+ * are computed over much shorter time intervals than the long -+ * intervals typically used for benchmarking. Why? First, to -+ * adapt more quickly to variations. Second, because an I/O -+ * scheduler cannot rely on a peak-rate-evaluation workload to -+ * be run for a long time. -+ */ -+ ref_wr_duration[0] = msecs_to_jiffies(7000); /* actually 8 sec */ -+ ref_wr_duration[1] = msecs_to_jiffies(2500); /* actually 3 sec */ -+ -+ ret = elv_register(&iosched_bfq_mq); -+ if (ret) -+ goto slab_kill; -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ strcat(msg, " (with cgroups support)"); -+#endif -+ pr_info("%s", msg); -+ -+ return 0; -+ -+slab_kill: -+ bfq_slab_kill(); -+err_pol_unreg: -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ blkcg_policy_unregister(&blkcg_policy_bfq); -+#endif -+ return ret; -+} -+ -+static void __exit bfq_exit(void) -+{ -+ elv_unregister(&iosched_bfq_mq); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ blkcg_policy_unregister(&blkcg_policy_bfq); -+#endif -+ bfq_slab_kill(); -+} -+ -+module_init(bfq_init); -+module_exit(bfq_exit); -+ -+MODULE_AUTHOR("Paolo Valente"); -+MODULE_LICENSE("GPL"); -+MODULE_DESCRIPTION("MQ Budget Fair Queueing I/O Scheduler"); -diff --git a/block/bfq-mq.h b/block/bfq-mq.h -new file mode 100644 -index 000000000000..ceb291132a1a ---- /dev/null -+++ b/block/bfq-mq.h -@@ -0,0 +1,1077 @@ -+/* -+ * BFQ v9: data structures and common functions prototypes. 
-+ * -+ * Based on ideas and code from CFQ: -+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> -+ * -+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> -+ * Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org> -+ */ -+ -+#ifndef _BFQ_H -+#define _BFQ_H -+ -+#include <linux/hrtimer.h> -+#include <linux/blk-cgroup.h> -+ -+/* see comments on CONFIG_BFQ_GROUP_IOSCHED in bfq.h */ -+#ifdef CONFIG_MQ_BFQ_GROUP_IOSCHED -+#define BFQ_GROUP_IOSCHED_ENABLED -+#endif -+ -+#define BFQ_IOPRIO_CLASSES 3 -+#define BFQ_CL_IDLE_TIMEOUT (HZ/5) -+ -+#define BFQ_MIN_WEIGHT 1 -+#define BFQ_MAX_WEIGHT 1000 -+#define BFQ_WEIGHT_CONVERSION_COEFF 10 -+ -+#define BFQ_DEFAULT_QUEUE_IOPRIO 4 -+ -+#define BFQ_WEIGHT_LEGACY_DFL 100 -+#define BFQ_DEFAULT_GRP_IOPRIO 0 -+#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE -+ -+/* -+ * Soft real-time applications are extremely more latency sensitive -+ * than interactive ones. Over-raise the weight of the former to -+ * privilege them against the latter. -+ */ -+#define BFQ_SOFTRT_WEIGHT_FACTOR 100 -+ -+struct bfq_entity; -+ -+/** -+ * struct bfq_service_tree - per ioprio_class service tree. -+ * -+ * Each service tree represents a B-WF2Q+ scheduler on its own. Each -+ * ioprio_class has its own independent scheduler, and so its own -+ * bfq_service_tree. All the fields are protected by the queue lock -+ * of the containing bfqd. 
-+ */ -+struct bfq_service_tree { -+ /* tree for active entities (i.e., those backlogged) */ -+ struct rb_root active; -+ /* tree for idle entities (i.e., not backlogged, with V <= F_i)*/ -+ struct rb_root idle; -+ -+ struct bfq_entity *first_idle; /* idle entity with minimum F_i */ -+ struct bfq_entity *last_idle; /* idle entity with maximum F_i */ -+ -+ u64 vtime; /* scheduler virtual time */ -+ /* scheduler weight sum; active and idle entities contribute to it */ -+ unsigned long wsum; -+}; -+ -+/** -+ * struct bfq_sched_data - multi-class scheduler. -+ * -+ * bfq_sched_data is the basic scheduler queue. It supports three -+ * ioprio_classes, and can be used either as a toplevel queue or as an -+ * intermediate queue in a hierarchical setup. -+ * -+ * The supported ioprio_classes are the same as in CFQ, in descending -+ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. -+ * Requests from higher priority queues are served before all the -+ * requests from lower priority queues; among requests of the same -+ * queue requests are served according to B-WF2Q+. -+ * -+ * The schedule is implemented by the service trees, plus the field -+ * @next_in_service, which points to the entity on the active trees -+ * that will be served next, if 1) no changes in the schedule occurs -+ * before the current in-service entity is expired, 2) the in-service -+ * queue becomes idle when it expires, and 3) if the entity pointed by -+ * in_service_entity is not a queue, then the in-service child entity -+ * of the entity pointed by in_service_entity becomes idle on -+ * expiration. This peculiar definition allows for the following -+ * optimization, not yet exploited: while a given entity is still in -+ * service, we already know which is the best candidate for next -+ * service among the other active entitities in the same parent -+ * entity. We can then quickly compare the timestamps of the -+ * in-service entity with those of such best candidate. 
-+ * -+ * All the fields are protected by the queue lock of the containing -+ * bfqd. -+ */ -+struct bfq_sched_data { -+ struct bfq_entity *in_service_entity; /* entity in service */ -+ /* head-of-the-line entity in the scheduler (see comments above) */ -+ struct bfq_entity *next_in_service; -+ /* array of service trees, one per ioprio_class */ -+ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; -+ /* last time CLASS_IDLE was served */ -+ unsigned long bfq_class_idle_last_service; -+ -+}; -+ -+/** -+ * struct bfq_weight_counter - counter of the number of all active queues -+ * with a given weight. -+ */ -+struct bfq_weight_counter { -+ unsigned int weight; /* weight of the queues this counter refers to */ -+ unsigned int num_active; /* nr of active queues with this weight */ -+ /* -+ * Weights tree member (see bfq_data's @queue_weights_tree) -+ */ -+ struct rb_node weights_node; -+}; -+ -+/** -+ * struct bfq_entity - schedulable entity. -+ * -+ * A bfq_entity is used to represent either a bfq_queue (leaf node in the -+ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each -+ * entity belongs to the sched_data of the parent group in the cgroup -+ * hierarchy. Non-leaf entities have also their own sched_data, stored -+ * in @my_sched_data. -+ * -+ * Each entity stores independently its priority values; this would -+ * allow different weights on different devices, but this -+ * functionality is not exported to userspace by now. Priorities and -+ * weights are updated lazily, first storing the new values into the -+ * new_* fields, then setting the @prio_changed flag. As soon as -+ * there is a transition in the entity state that allows the priority -+ * update to take place the effective and the requested priority -+ * values are synchronized. -+ * -+ * Unless cgroups are used, the weight value is calculated from the -+ * ioprio to export the same interface as CFQ. 
When dealing with -+ * ``well-behaved'' queues (i.e., queues that do not spend too much -+ * time to consume their budget and have true sequential behavior, and -+ * when there are no external factors breaking anticipation) the -+ * relative weights at each level of the cgroups hierarchy should be -+ * guaranteed. All the fields are protected by the queue lock of the -+ * containing bfqd. -+ */ -+struct bfq_entity { -+ struct rb_node rb_node; /* service_tree member */ -+ -+ /* -+ * Flag, true if the entity is on a tree (either the active or -+ * the idle one of its service_tree) or is in service. -+ */ -+ bool on_st; -+ -+ u64 finish; /* B-WF2Q+ finish timestamp (aka F_i) */ -+ u64 start; /* B-WF2Q+ start timestamp (aka S_i) */ -+ -+ /* tree the entity is enqueued into; %NULL if not on a tree */ -+ struct rb_root *tree; -+ -+ /* -+ * minimum start time of the (active) subtree rooted at this -+ * entity; used for O(log N) lookups into active trees -+ */ -+ u64 min_start; -+ -+ /* amount of service received during the last service slot */ -+ int service; -+ -+ /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */ -+ int budget; -+ -+ unsigned int weight; /* weight of the queue */ -+ unsigned int new_weight; /* next weight if a change is in progress */ -+ -+ /* original weight, used to implement weight boosting */ -+ unsigned int orig_weight; -+ -+ /* parent entity, for hierarchical scheduling */ -+ struct bfq_entity *parent; -+ -+ /* -+ * For non-leaf nodes in the hierarchy, the associated -+ * scheduler queue, %NULL on leaf nodes. 
-+ */ -+ struct bfq_sched_data *my_sched_data; -+ /* the scheduler queue this entity belongs to */ -+ struct bfq_sched_data *sched_data; -+ -+ /* flag, set to request a weight, ioprio or ioprio_class change */ -+ int prio_changed; -+ -+ /* flag, set if the entity is counted in groups_with_pending_reqs */ -+ bool in_groups_with_pending_reqs; -+}; -+ -+struct bfq_group; -+ -+/** -+ * struct bfq_ttime - per process thinktime stats. -+ */ -+struct bfq_ttime { -+ u64 last_end_request; /* completion time of last request */ -+ -+ u64 ttime_total; /* total process thinktime */ -+ unsigned long ttime_samples; /* number of thinktime samples */ -+ u64 ttime_mean; /* average process thinktime */ -+ -+}; -+ -+/** -+ * struct bfq_queue - leaf schedulable entity. -+ * -+ * A bfq_queue is a leaf request queue; it can be associated with an -+ * io_context or more, if it is async or shared between cooperating -+ * processes. @cgroup holds a reference to the cgroup, to be sure that it -+ * does not disappear while a bfqq still references it (mostly to avoid -+ * races between request issuing and task migration followed by cgroup -+ * destruction). -+ * All the fields are protected by the queue lock of the containing bfqd. -+ */ -+struct bfq_queue { -+ /* reference counter */ -+ int ref; -+ /* parent bfq_data */ -+ struct bfq_data *bfqd; -+ -+ /* current ioprio and ioprio class */ -+ unsigned short ioprio, ioprio_class; -+ /* next ioprio and ioprio class if a change is in progress */ -+ unsigned short new_ioprio, new_ioprio_class; -+ -+ /* -+ * Shared bfq_queue if queue is cooperating with one or more -+ * other queues. 
-+ */ -+ struct bfq_queue *new_bfqq; -+ /* request-position tree member (see bfq_group's @rq_pos_tree) */ -+ struct rb_node pos_node; -+ /* request-position tree root (see bfq_group's @rq_pos_tree) */ -+ struct rb_root *pos_root; -+ -+ /* sorted list of pending requests */ -+ struct rb_root sort_list; -+ /* if fifo isn't expired, next request to serve */ -+ struct request *next_rq; -+ /* number of sync and async requests queued */ -+ int queued[2]; -+ /* number of requests currently allocated */ -+ int allocated; -+ /* number of pending metadata requests */ -+ int meta_pending; -+ /* fifo list of requests in sort_list */ -+ struct list_head fifo; -+ -+ /* entity representing this queue in the scheduler */ -+ struct bfq_entity entity; -+ -+ /* pointer to the weight counter associated with this queue */ -+ struct bfq_weight_counter *weight_counter; -+ -+ /* maximum budget allowed from the feedback mechanism */ -+ int max_budget; -+ /* budget expiration (in jiffies) */ -+ unsigned long budget_timeout; -+ -+ /* number of requests on the dispatch list or inside driver */ -+ int dispatched; -+ -+ unsigned int flags; /* status flags.*/ -+ -+ /* node for active/idle bfqq list inside parent bfqd */ -+ struct list_head bfqq_list; -+ -+ /* associated @bfq_ttime struct */ -+ struct bfq_ttime ttime; -+ -+ /* bit vector: a 1 for each seeky requests in history */ -+ u32 seek_history; -+ -+ /* node for the device's burst list */ -+ struct hlist_node burst_list_node; -+ -+ /* position of the last request enqueued */ -+ sector_t last_request_pos; -+ -+ /* Number of consecutive pairs of request completion and -+ * arrival, such that the queue becomes idle after the -+ * completion, but the next request arrives within an idle -+ * time slice; used only if the queue's IO_bound flag has been -+ * cleared. 
-+ */ -+ unsigned int requests_within_timer; -+ -+ /* pid of the process owning the queue, used for logging purposes */ -+ pid_t pid; -+ -+ /* -+ * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL -+ * if the queue is shared. -+ */ -+ struct bfq_io_cq *bic; -+ -+ /* current maximum weight-raising time for this queue */ -+ unsigned long wr_cur_max_time; -+ /* -+ * Minimum time instant such that, only if a new request is -+ * enqueued after this time instant in an idle @bfq_queue with -+ * no outstanding requests, then the task associated with the -+ * queue it is deemed as soft real-time (see the comments on -+ * the function bfq_bfqq_softrt_next_start()) -+ */ -+ unsigned long soft_rt_next_start; -+ /* -+ * Start time of the current weight-raising period if -+ * the @bfq-queue is being weight-raised, otherwise -+ * finish time of the last weight-raising period. -+ */ -+ unsigned long last_wr_start_finish; -+ /* factor by which the weight of this queue is multiplied */ -+ unsigned int wr_coeff; -+ /* -+ * Time of the last transition of the @bfq_queue from idle to -+ * backlogged. -+ */ -+ unsigned long last_idle_bklogged; -+ /* -+ * Cumulative service received from the @bfq_queue since the -+ * last transition from idle to backlogged. -+ */ -+ unsigned long service_from_backlogged; -+ /* -+ * Cumulative service received from the @bfq_queue since its -+ * last transition to weight-raised state. -+ */ -+ unsigned long service_from_wr; -+ /* -+ * Value of wr start time when switching to soft rt -+ */ -+ unsigned long wr_start_at_switch_to_srt; -+ -+ unsigned long split_time; /* time of last split */ -+ unsigned long first_IO_time; /* time of first I/O for this queue */ -+ -+ /* max service rate measured so far */ -+ u32 max_service_rate; -+ /* -+ * Ratio between the service received by bfqq while it is in -+ * service, and the cumulative service (of requests of other -+ * queues) that may be injected while bfqq is empty but still -+ * in service. 
To increase precision, the coefficient is -+ * measured in tenths of unit. Here are some example of (1) -+ * ratios, (2) resulting percentages of service injected -+ * w.r.t. to the total service dispatched while bfqq is in -+ * service, and (3) corresponding values of the coefficient: -+ * 1 (50%) -> 10 -+ * 2 (33%) -> 20 -+ * 10 (9%) -> 100 -+ * 9.9 (9%) -> 99 -+ * 1.5 (40%) -> 15 -+ * 0.5 (66%) -> 5 -+ * 0.1 (90%) -> 1 -+ * -+ * So, if the coefficient is lower than 10, then -+ * injected service is more than bfqq service. -+ */ -+ unsigned int inject_coeff; -+ /* amount of service injected in current service slot */ -+ unsigned int injected_service; -+}; -+ -+/** -+ * struct bfq_io_cq - per (request_queue, io_context) structure. -+ */ -+struct bfq_io_cq { -+ /* associated io_cq structure */ -+ struct io_cq icq; /* must be the first member */ -+ /* array of two process queues, the sync and the async */ -+ struct bfq_queue *bfqq[2]; -+ /* per (request_queue, blkcg) ioprio */ -+ int ioprio; -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ uint64_t blkcg_serial_nr; /* the current blkcg serial */ -+#endif -+ -+ /* -+ * Snapshot of the has_short_time flag before merging; taken -+ * to remember its value while the queue is merged, so as to -+ * be able to restore it in case of split. -+ */ -+ bool saved_has_short_ttime; -+ /* -+ * Same purpose as the previous two fields for the I/O bound -+ * classification of a queue. -+ */ -+ bool saved_IO_bound; -+ -+ /* -+ * Same purpose as the previous fields for the value of the -+ * field keeping the queue's belonging to a large burst -+ */ -+ bool saved_in_large_burst; -+ /* -+ * True if the queue belonged to a burst list before its merge -+ * with another cooperating queue. -+ */ -+ bool was_in_burst_list; -+ -+ /* -+ * Similar to previous fields: save wr information. 
-+ */ -+ unsigned long saved_wr_coeff; -+ unsigned long saved_last_wr_start_finish; -+ unsigned long saved_wr_start_at_switch_to_srt; -+ unsigned int saved_wr_cur_max_time; -+ struct bfq_ttime saved_ttime; -+}; -+ -+/** -+ * struct bfq_data - per-device data structure. -+ * -+ * All the fields are protected by @lock. -+ */ -+struct bfq_data { -+ /* device request queue */ -+ struct request_queue *queue; -+ /* dispatch queue */ -+ struct list_head dispatch; -+ -+ /* root bfq_group for the device */ -+ struct bfq_group *root_group; -+ -+ /* -+ * rbtree of weight counters of @bfq_queues, sorted by -+ * weight. Used to keep track of whether all @bfq_queues have -+ * the same weight. The tree contains one counter for each -+ * distinct weight associated to some active and not -+ * weight-raised @bfq_queue (see the comments to the functions -+ * bfq_weights_tree_[add|remove] for further details). -+ */ -+ struct rb_root queue_weights_tree; -+ -+ /* -+ * Number of groups with at least one descendant process that -+ * has at least one request waiting for completion. Note that -+ * this accounts for also requests already dispatched, but not -+ * yet completed. Therefore this number of groups may differ -+ * (be larger) than the number of active groups, as a group is -+ * considered active only if its corresponding entity has -+ * descendant queues with at least one request queued. This -+ * number is used to decide whether a scenario is symmetric. -+ * For a detailed explanation see comments on the computation -+ * of the variable asymmetric_scenario in the function -+ * bfq_better_to_idle(). -+ * -+ * However, it is hard to compute this number exactly, for -+ * groups with multiple descendant processes. Consider a group -+ * that is inactive, i.e., that has no descendant process with -+ * pending I/O inside BFQ queues. 
Then suppose that -+ * num_groups_with_pending_reqs is still accounting for this -+ * group, because the group has descendant processes with some -+ * I/O request still in flight. num_groups_with_pending_reqs -+ * should be decremented when the in-flight request of the -+ * last descendant process is finally completed (assuming that -+ * nothing else has changed for the group in the meantime, in -+ * terms of composition of the group and active/inactive state of child -+ * groups and processes). To accomplish this, an additional -+ * pending-request counter must be added to entities, and must -+ * be updated correctly. To avoid this additional field and operations, -+ * we resort to the following tradeoff between simplicity and -+ * accuracy: for an inactive group that is still counted in -+ * num_groups_with_pending_reqs, we decrement -+ * num_groups_with_pending_reqs when the first descendant -+ * process of the group remains with no request waiting for -+ * completion. -+ * -+ * Even this simpler decrement strategy requires a little -+ * carefulness: to avoid multiple decrements, we flag a group, -+ * more precisely an entity representing a group, as still -+ * counted in num_groups_with_pending_reqs when it becomes -+ * inactive. Then, when the first descendant queue of the -+ * entity remains with no request waiting for completion, -+ * num_groups_with_pending_reqs is decremented, and this flag -+ * is reset. After this flag is reset for the entity, -+ * num_groups_with_pending_reqs won't be decremented any -+ * longer in case a new descendant queue of the entity remains -+ * with no request waiting for completion. -+ */ -+ unsigned int num_groups_with_pending_reqs; -+ -+ /* -+ * Per-class (RT, BE, IDLE) number of bfq_queues containing -+ * requests (including the queue in service, even if it is -+ * idling). 
-+ */ -+ unsigned int busy_queues[3]; -+ /* number of weight-raised busy @bfq_queues */ -+ int wr_busy_queues; -+ /* number of queued requests */ -+ int queued; -+ /* number of requests dispatched and waiting for completion */ -+ int rq_in_driver; -+ -+ /* -+ * Maximum number of requests in driver in the last -+ * @hw_tag_samples completed requests. -+ */ -+ int max_rq_in_driver; -+ /* number of samples used to calculate hw_tag */ -+ int hw_tag_samples; -+ /* flag set to one if the driver is showing a queueing behavior */ -+ int hw_tag; -+ -+ /* number of budgets assigned */ -+ int budgets_assigned; -+ -+ /* -+ * Timer set when idling (waiting) for the next request from -+ * the queue in service. -+ */ -+ struct hrtimer idle_slice_timer; -+ -+ /* bfq_queue in service */ -+ struct bfq_queue *in_service_queue; -+ -+ /* on-disk position of the last served request */ -+ sector_t last_position; -+ -+ /* position of the last served request for the in-service queue */ -+ sector_t in_serv_last_pos; -+ -+ /* time of last request completion (ns) */ -+ u64 last_completion; -+ -+ /* time of first rq dispatch in current observation interval (ns) */ -+ u64 first_dispatch; -+ /* time of last rq dispatch in current observation interval (ns) */ -+ u64 last_dispatch; -+ -+ /* beginning of the last budget */ -+ ktime_t last_budget_start; -+ /* beginning of the last idle slice */ -+ ktime_t last_idling_start; -+ -+ /* number of samples in current observation interval */ -+ int peak_rate_samples; -+ /* num of samples of seq dispatches in current observation interval */ -+ u32 sequential_samples; -+ /* total num of sectors transferred in current observation interval */ -+ u64 tot_sectors_dispatched; -+ /* max rq size seen during current observation interval (sectors) */ -+ u32 last_rq_max_size; -+ /* time elapsed from first dispatch in current observ. 
interval (us) */ -+ u64 delta_from_first; -+ /* -+ * Current estimate of the device peak rate, measured in -+ * [(sectors/usec) / 2^BFQ_RATE_SHIFT]. The left-shift by -+ * BFQ_RATE_SHIFT is performed to increase precision in -+ * fixed-point calculations. -+ */ -+ u32 peak_rate; -+ -+ /* maximum budget allotted to a bfq_queue before rescheduling */ -+ int bfq_max_budget; -+ -+ /* list of all the bfq_queues active on the device */ -+ struct list_head active_list; -+ /* list of all the bfq_queues idle on the device */ -+ struct list_head idle_list; -+ -+ /* -+ * Timeout for async/sync requests; when it fires, requests -+ * are served in fifo order. -+ */ -+ u64 bfq_fifo_expire[2]; -+ /* weight of backward seeks wrt forward ones */ -+ unsigned int bfq_back_penalty; -+ /* maximum allowed backward seek */ -+ unsigned int bfq_back_max; -+ /* maximum idling time */ -+ u32 bfq_slice_idle; -+ -+ /* user-configured max budget value (0 for auto-tuning) */ -+ int bfq_user_max_budget; -+ /* -+ * Timeout for bfq_queues to consume their budget; used to -+ * prevent seeky queues from imposing long latencies to -+ * sequential or quasi-sequential ones (this also implies that -+ * seeky queues cannot receive guarantees in the service -+ * domain; after a timeout they are charged for the time they -+ * have been in service, to preserve fairness among them, but -+ * without service-domain guarantees). -+ */ -+ unsigned int bfq_timeout; -+ -+ /* -+ * Number of consecutive requests that must be issued within -+ * the idle time slice to set again idling to a queue which -+ * was marked as non-I/O-bound (see the definition of the -+ * IO_bound flag for further details). -+ */ -+ unsigned int bfq_requests_within_timer; -+ -+ /* -+ * Force device idling whenever needed to provide accurate -+ * service guarantees, without caring about throughput -+ * issues. CAVEAT: this may even increase latencies, in case -+ * of useless idling for processes that did stop doing I/O. 
-+ */ -+ bool strict_guarantees; -+ -+ /* -+ * Last time at which a queue entered the current burst of -+ * queues being activated shortly after each other; for more -+ * details about this and the following parameters related to -+ * a burst of activations, see the comments on the function -+ * bfq_handle_burst. -+ */ -+ unsigned long last_ins_in_burst; -+ /* -+ * Reference time interval used to decide whether a queue has -+ * been activated shortly after @last_ins_in_burst. -+ */ -+ unsigned long bfq_burst_interval; -+ /* number of queues in the current burst of queue activations */ -+ int burst_size; -+ -+ /* common parent entity for the queues in the burst */ -+ struct bfq_entity *burst_parent_entity; -+ /* Maximum burst size above which the current queue-activation -+ * burst is deemed as 'large'. -+ */ -+ unsigned long bfq_large_burst_thresh; -+ /* true if a large queue-activation burst is in progress */ -+ bool large_burst; -+ /* -+ * Head of the burst list (as for the above fields, more -+ * details in the comments on the function bfq_handle_burst). -+ */ -+ struct hlist_head burst_list; -+ -+ /* if set to true, low-latency heuristics are enabled */ -+ bool low_latency; -+ /* -+ * Maximum factor by which the weight of a weight-raised queue -+ * is multiplied. -+ */ -+ unsigned int bfq_wr_coeff; -+ /* maximum duration of a weight-raising period (jiffies) */ -+ unsigned int bfq_wr_max_time; -+ -+ /* Maximum weight-raising duration for soft real-time processes */ -+ unsigned int bfq_wr_rt_max_time; -+ /* -+ * Minimum idle period after which weight-raising may be -+ * reactivated for a queue (in jiffies). -+ */ -+ unsigned int bfq_wr_min_idle_time; -+ /* -+ * Minimum period between request arrivals after which -+ * weight-raising may be reactivated for an already busy async -+ * queue (in jiffies). 
-+ */ -+ unsigned long bfq_wr_min_inter_arr_async; -+ -+ /* Max service-rate for a soft real-time queue, in sectors/sec */ -+ unsigned int bfq_wr_max_softrt_rate; -+ /* -+ * Cached value of the product ref_rate*ref_wr_duration, used -+ * for computing the maximum duration of weight raising -+ * automatically. -+ */ -+ u64 rate_dur_prod; -+ -+ /* fallback dummy bfqq for extreme OOM conditions */ -+ struct bfq_queue oom_bfqq; -+ -+ spinlock_t lock; -+ -+ /* -+ * bic associated with the task issuing current bio for -+ * merging. This and the next field are used as a support to -+ * be able to perform the bic lookup, needed by bio-merge -+ * functions, before the scheduler lock is taken, and thus -+ * avoid taking the request-queue lock while the scheduler -+ * lock is being held. -+ */ -+ struct bfq_io_cq *bio_bic; -+ /* bfqq associated with the task issuing current bio for merging */ -+ struct bfq_queue *bio_bfqq; -+ /* Extra flag used only for TESTING */ -+ bool bio_bfqq_set; -+ -+ /* -+ * Depth limits used in bfq_limit_depth (see comments on the -+ * function) -+ */ -+ unsigned int word_depths[2][2]; -+}; -+ -+enum bfqq_state_flags { -+ BFQ_BFQQ_FLAG_just_created = 0, /* queue just allocated */ -+ BFQ_BFQQ_FLAG_busy, /* has requests or is in service */ -+ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ -+ BFQ_BFQQ_FLAG_non_blocking_wait_rq, /* -+ * waiting for a request -+ * without idling the device -+ */ -+ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ -+ BFQ_BFQQ_FLAG_has_short_ttime, /* queue has a short think time */ -+ BFQ_BFQQ_FLAG_sync, /* synchronous queue */ -+ BFQ_BFQQ_FLAG_IO_bound, /* -+ * bfqq has timed-out at least once -+ * having consumed at most 2/10 of -+ * its budget -+ */ -+ BFQ_BFQQ_FLAG_in_large_burst, /* -+ * bfqq activated in a large burst, -+ * see comments to bfq_handle_burst. 
-+ */ -+ BFQ_BFQQ_FLAG_softrt_update, /* -+ * may need softrt-next-start -+ * update -+ */ -+ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ -+ BFQ_BFQQ_FLAG_split_coop /* shared bfqq will be split */ -+}; -+ -+#define BFQ_BFQQ_FNS(name) \ -+static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ -+{ \ -+ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \ -+} \ -+static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ -+{ \ -+ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \ -+} \ -+static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ -+{ \ -+ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ -+} -+ -+BFQ_BFQQ_FNS(just_created); -+BFQ_BFQQ_FNS(busy); -+BFQ_BFQQ_FNS(wait_request); -+BFQ_BFQQ_FNS(non_blocking_wait_rq); -+BFQ_BFQQ_FNS(fifo_expire); -+BFQ_BFQQ_FNS(has_short_ttime); -+BFQ_BFQQ_FNS(sync); -+BFQ_BFQQ_FNS(IO_bound); -+BFQ_BFQQ_FNS(in_large_burst); -+BFQ_BFQQ_FNS(coop); -+BFQ_BFQQ_FNS(split_coop); -+BFQ_BFQQ_FNS(softrt_update); -+#undef BFQ_BFQQ_FNS -+ -+/* Logging facilities. */ -+#ifdef CONFIG_BFQ_REDIRECT_TO_CONSOLE -+ -+static const char *checked_dev_name(const struct device *dev) -+{ -+ static const char nodev[] = "nodev"; -+ -+ if (dev) -+ return dev_name(dev); -+ -+ return nodev; -+} -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); -+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ -+ pr_crit("%s bfq%d%c %s [%s] " fmt "\n", \ -+ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -+ (bfqq)->pid, \ -+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ bfqq_group(bfqq)->blkg_path, __func__, ##args); \ -+} while (0) -+ -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ -+ pr_crit("%s %s [%s] " fmt "\n", \ -+ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -+ bfqg->blkg_path, __func__, ##args); \ -+} while (0) -+ -+#else /* BFQ_GROUP_IOSCHED_ENABLED */ -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) 
\ -+ pr_crit("%s bfq%d%c [%s] " fmt "\n", \ -+ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -+ (bfqq)->pid, bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ __func__, ##args) -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) -+ -+#endif /* BFQ_GROUP_IOSCHED_ENABLED */ -+ -+#define bfq_log(bfqd, fmt, args...) \ -+ pr_crit("%s bfq [%s] " fmt "\n", \ -+ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -+ __func__, ##args) -+ -+#else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ -+ -+#if !defined(CONFIG_BLK_DEV_IO_TRACE) -+ -+/* Avoid possible "unused-variable" warning. See commit message. */ -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) ((void) (bfqq)) -+ -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) ((void) (bfqg)) -+ -+#define bfq_log(bfqd, fmt, args...) do {} while (0) -+ -+#else /* CONFIG_BLK_DEV_IO_TRACE */ -+ -+#include <linux/blktrace_api.h> -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); -+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ -+ blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s [%s] " fmt, \ -+ (bfqq)->pid, \ -+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ bfqq_group(bfqq)->blkg_path, __func__, ##args); \ -+} while (0) -+ -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ -+ blk_add_trace_msg((bfqd)->queue, "%s [%s] " fmt, bfqg->blkg_path, \ -+ __func__, ##args);\ -+} while (0) -+ -+#else /* BFQ_GROUP_IOSCHED_ENABLED */ -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ -+ blk_add_trace_msg((bfqd)->queue, "bfq%d%c [%s] " fmt, (bfqq)->pid, \ -+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ __func__, ##args) -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) -+ -+#endif /* BFQ_GROUP_IOSCHED_ENABLED */ -+ -+#define bfq_log(bfqd, fmt, args...) 
\ -+ blk_add_trace_msg((bfqd)->queue, "bfq [%s] " fmt, __func__, ##args) -+ -+#endif /* CONFIG_BLK_DEV_IO_TRACE */ -+#endif /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ -+ -+/* Expiration reasons. */ -+enum bfqq_expiration { -+ BFQ_BFQQ_TOO_IDLE = 0, /* -+ * queue has been idling for -+ * too long -+ */ -+ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ -+ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ -+ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ -+ BFQ_BFQQ_PREEMPTED /* preemption in progress */ -+}; -+ -+ -+struct bfqg_stats { -+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) -+ /* number of ios merged */ -+ struct blkg_rwstat merged; -+ /* total time spent on device in ns, may not be accurate w/ queueing */ -+ struct blkg_rwstat service_time; -+ /* total time spent waiting in scheduler queue in ns */ -+ struct blkg_rwstat wait_time; -+ /* number of IOs queued up */ -+ struct blkg_rwstat queued; -+ /* total disk time and nr sectors dispatched by this group */ -+ struct blkg_stat time; -+ /* sum of number of ios queued across all samples */ -+ struct blkg_stat avg_queue_size_sum; -+ /* count of samples taken for average */ -+ struct blkg_stat avg_queue_size_samples; -+ /* how many times this group has been removed from service tree */ -+ struct blkg_stat dequeue; -+ /* total time spent waiting for it to be assigned a timeslice. */ -+ struct blkg_stat group_wait_time; -+ /* time spent idling for this blkcg_gq */ -+ struct blkg_stat idle_time; -+ /* total time with empty current active q with other requests queued */ -+ struct blkg_stat empty_time; -+ /* fields after this shouldn't be cleared on stat reset */ -+ u64 start_group_wait_time; -+ u64 start_idle_time; -+ u64 start_empty_time; -+ uint16_t flags; -+#endif /* BFQ_GROUP_IOSCHED_ENABLED && CONFIG_DEBUG_BLK_CGROUP */ -+}; -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+/* -+ * struct bfq_group_data - per-blkcg storage for the blkio subsystem. 
-+ * -+ * @pd: @blkcg_policy_data that this structure inherits -+ * @weight: weight of the bfq_group -+ */ -+struct bfq_group_data { -+ /* must be the first member */ -+ struct blkcg_policy_data pd; -+ -+ unsigned int weight; -+}; -+ -+/** -+ * struct bfq_group - per (device, cgroup) data structure. -+ * @entity: schedulable entity to insert into the parent group sched_data. -+ * @sched_data: own sched_data, to contain child entities (they may be -+ * both bfq_queues and bfq_groups). -+ * @bfqd: the bfq_data for the device this group acts upon. -+ * @async_bfqq: array of async queues for all the tasks belonging to -+ * the group, one queue per ioprio value per ioprio_class, -+ * except for the idle class that has only one queue. -+ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). -+ * @my_entity: pointer to @entity, %NULL for the toplevel group; used -+ * to avoid too many special cases during group creation/ -+ * migration. -+ * @active_entities: number of active entities belonging to the group; -+ * unused for the root group. Used to know whether there -+ * are groups with more than one active @bfq_entity -+ * (see the comments to the function -+ * bfq_bfqq_may_idle()). -+ * @rq_pos_tree: rbtree sorted by next_request position, used when -+ * determining if two or more queues have interleaving -+ * requests (see bfq_find_close_cooperator()). -+ * -+ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup -+ * there is a set of bfq_groups, each one collecting the lower-level -+ * entities belonging to the group that are acting on the same device. -+ * -+ * Locking works as follows: -+ * o @bfqd is protected by the queue lock, RCU is used to access it -+ * from the readers. -+ * o All the other fields are protected by the @bfqd queue lock.
-+ */ -+struct bfq_group { -+ /* must be the first member */ -+ struct blkg_policy_data pd; -+ -+ /* cached path for this blkg (see comments in bfq_bic_update_cgroup) */ -+ char blkg_path[128]; -+ -+ /* reference counter (see comments in bfq_bic_update_cgroup) */ -+ int ref; -+ -+ struct bfq_entity entity; -+ struct bfq_sched_data sched_data; -+ -+ void *bfqd; -+ -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; -+ struct bfq_queue *async_idle_bfqq; -+ -+ struct bfq_entity *my_entity; -+ -+ int active_entities; -+ -+ struct rb_root rq_pos_tree; -+ -+ struct bfqg_stats stats; -+}; -+ -+#else -+struct bfq_group { -+ struct bfq_sched_data sched_data; -+ -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; -+ struct bfq_queue *async_idle_bfqq; -+ -+ struct rb_root rq_pos_tree; -+}; -+#endif -+ -+static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); -+ -+static unsigned int bfq_class_idx(struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ return bfqq ? 
bfqq->ioprio_class - 1 : -+ BFQ_DEFAULT_GRP_CLASS - 1; -+} -+ -+static unsigned int bfq_tot_busy_queues(struct bfq_data *bfqd) -+{ -+ return bfqd->busy_queues[0] + bfqd->busy_queues[1] + -+ bfqd->busy_queues[2]; -+} -+ -+static struct bfq_service_tree * -+bfq_entity_service_tree(struct bfq_entity *entity) -+{ -+ struct bfq_sched_data *sched_data = entity->sched_data; -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ unsigned int idx = bfq_class_idx(entity); -+ -+ BUG_ON(idx >= BFQ_IOPRIO_CLASSES); -+ BUG_ON(sched_data == NULL); -+ -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "%p %d", -+ sched_data->service_tree + idx, idx); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "%p %d", -+ sched_data->service_tree + idx, idx); -+ } -+#endif -+ return sched_data->service_tree + idx; -+} -+ -+static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) -+{ -+ return bic->bfqq[is_sync]; -+} -+ -+static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, -+ bool is_sync) -+{ -+ bic->bfqq[is_sync] = bfqq; -+} -+ -+static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) -+{ -+ return bic->icq.q->elevator->elevator_data; -+} -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ -+static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *group_entity = bfqq->entity.parent; -+ -+ if (!group_entity) -+ group_entity = &bfqq->bfqd->root_group->entity; -+ -+ return container_of(group_entity, struct bfq_group, entity); -+} -+ -+#else -+ -+static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) -+{ -+ return bfqq->bfqd->root_group; -+} -+ -+#endif -+ -+static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); -+static void bfq_put_queue(struct bfq_queue *bfqq); -+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, -+ struct bio *bio, bool is_sync, -+ 
struct bfq_io_cq *bic); -+static void bfq_end_wr_async_queues(struct bfq_data *bfqd, -+ struct bfq_group *bfqg); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); -+#endif -+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); -+ -+#endif /* _BFQ_H */ -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -new file mode 100644 -index 000000000000..7a4923231106 ---- /dev/null -+++ b/block/bfq-sched.c -@@ -0,0 +1,2077 @@ -+/* -+ * BFQ: Hierarchical B-WF2Q+ scheduler. -+ * -+ * Based on ideas and code from CFQ: -+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> -+ * -+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> -+ * Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2016 Paolo Valente <paolo.valente@linaro.org> -+ */ -+ -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); -+ -+/** -+ * bfq_gt - compare two timestamps. -+ * @a: first ts. -+ * @b: second ts. -+ * -+ * Return @a > @b, dealing with wrapping correctly. -+ */ -+static int bfq_gt(u64 a, u64 b) -+{ -+ return (s64)(a - b) > 0; -+} -+ -+static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree) -+{ -+ struct rb_node *node = tree->rb_node; -+ -+ return rb_entry(node, struct bfq_entity, rb_node); -+} -+ -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, -+ bool expiration); -+ -+static bool bfq_update_parent_budget(struct bfq_entity *next_in_service); -+ -+/** -+ * bfq_update_next_in_service - update sd->next_in_service -+ * @sd: sched_data for which to perform the update. -+ * @new_entity: if not NULL, pointer to the entity whose activation, -+ * requeueing or repositioning triggered the invocation of -+ * this function.
-+ * @expiration: if true, this function is being invoked after the -+ * expiration of the in-service entity -+ * -+ * This function is called to update sd->next_in_service, which, in -+ * its turn, may change as a consequence of the insertion or -+ * extraction of an entity into/from one of the active trees of -+ * sd. These insertions/extractions occur as a consequence of -+ * activations/deactivations of entities, with some activations being -+ * 'true' activations, and other activations being requeueings (i.e., -+ * implementing the second, requeueing phase of the mechanism used to -+ * reposition an entity in its active tree; see comments on -+ * __bfq_activate_entity and __bfq_requeue_entity for details). In -+ * both the last two activation sub-cases, new_entity points to the -+ * just activated or requeued entity. -+ * -+ * Returns true if sd->next_in_service changes in such a way that -+ * entity->parent may become the next_in_service for its parent -+ * entity. -+ */ -+static bool bfq_update_next_in_service(struct bfq_sched_data *sd, -+ struct bfq_entity *new_entity, -+ bool expiration) -+{ -+ struct bfq_entity *next_in_service = sd->next_in_service; -+ struct bfq_queue *bfqq; -+ bool parent_sched_may_change = false; -+ bool change_without_lookup = false; -+ -+ /* -+ * If this update is triggered by the activation, requeueing -+ * or repositioning of an entity that does not coincide with -+ * sd->next_in_service, then a full lookup in the active tree -+ * can be avoided. In fact, it is enough to check whether the -+ * just-modified entity has the same priority as -+ * sd->next_in_service, is eligible and has a lower virtual -+ * finish time than sd->next_in_service. If this compound -+ * condition holds, then the new entity becomes the new -+ * next_in_service. Otherwise no change is needed. -+ */ -+ if (new_entity && new_entity != sd->next_in_service) { -+ /* -+ * Flag used to decide whether to replace -+ * sd->next_in_service with new_entity.
Tentatively -+ * set to true, and left as true if -+ * sd->next_in_service is NULL. -+ */ -+ change_without_lookup = true; -+ -+ /* -+ * If there is already a next_in_service candidate -+ * entity, then compare timestamps to decide whether -+ * to replace sd->next_in_service with new_entity. -+ */ -+ if (next_in_service) { -+ unsigned int new_entity_class_idx = -+ bfq_class_idx(new_entity); -+ struct bfq_service_tree *st = -+ sd->service_tree + new_entity_class_idx; -+ -+ change_without_lookup = -+ (new_entity_class_idx == -+ bfq_class_idx(next_in_service) -+ && -+ !bfq_gt(new_entity->start, st->vtime) -+ && -+ bfq_gt(next_in_service->finish, -+ new_entity->finish)); -+ } -+ -+ if (change_without_lookup) { -+ next_in_service = new_entity; -+ bfqq = bfq_entity_to_bfqq(next_in_service); -+ -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "chose without lookup"); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ else { -+ struct bfq_group *bfqg = -+ container_of(next_in_service, -+ struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data*)bfqg->bfqd, bfqg, -+ "chose without lookup"); -+ } -+#endif -+ } -+ } -+ -+ if (!change_without_lookup) /* lookup needed */ -+ next_in_service = bfq_lookup_next_entity(sd, expiration); -+ -+ if (next_in_service) { -+ bool new_budget_triggers_change = -+ bfq_update_parent_budget(next_in_service); -+ -+ parent_sched_may_change = !sd->next_in_service || -+ new_budget_triggers_change; -+ } -+ -+ sd->next_in_service = next_in_service; -+ -+ if (!next_in_service) -+ return parent_sched_may_change; -+ -+ bfqq = bfq_entity_to_bfqq(next_in_service); -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "chosen this queue"); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ else { -+ struct bfq_group *bfqg = -+ container_of(next_in_service, -+ struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "chosen this entity"); -+ } -+#endif -+ return parent_sched_may_change; -+} -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+/* both next loops stop
at one of the child entities of the root group */ -+#define for_each_entity(entity) \ -+ for (; entity ; entity = entity->parent) -+ -+/* -+ * For each iteration, compute parent in advance, so as to be safe if -+ * entity is deallocated during the iteration. Such a deallocation may -+ * happen as a consequence of a bfq_put_queue that frees the bfq_queue -+ * containing entity. -+ */ -+#define for_each_entity_safe(entity, parent) \ -+ for (; entity && ({ parent = entity->parent; 1; }); entity = parent) -+ -+/* -+ * Returns true if this budget change may let next_in_service->parent -+ * become the next_in_service entity for its parent entity. -+ */ -+static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) -+{ -+ struct bfq_entity *bfqg_entity; -+ struct bfq_group *bfqg; -+ struct bfq_sched_data *group_sd; -+ bool ret = false; -+ -+ BUG_ON(!next_in_service); -+ -+ group_sd = next_in_service->sched_data; -+ -+ bfqg = container_of(group_sd, struct bfq_group, sched_data); -+ /* -+ * bfq_group's my_entity field is not NULL only if the group -+ * is not the root group. We must not touch the root entity -+ * as it must never become an in-service entity. -+ */ -+ bfqg_entity = bfqg->my_entity; -+ if (bfqg_entity) { -+ if (bfqg_entity->budget > next_in_service->budget) -+ ret = true; -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "old budg: %d, new budg: %d", -+ bfqg_entity->budget, next_in_service->budget); -+ bfqg_entity->budget = next_in_service->budget; -+ } -+ -+ return ret; -+} -+ -+/* -+ * This function tells whether entity stops being a candidate for next -+ * service, according to the restrictive definition of the field -+ * next_in_service. In particular, this function is invoked for an -+ * entity that is about to be set in service. -+ * -+ * If entity is a queue, then the entity is no longer a candidate for -+ * next service according to that definition, because entity is -+ * about to become the in-service queue.
This function then returns -+ * true if entity is a queue. -+ * -+ * In contrast, entity could still be a candidate for next service if -+ * it is not a queue, and has more than one active child. In fact, -+ * even if one of its children is about to be set in service, other -+ * active children may still be the next to serve, for the parent -+ * entity, even according to the above definition. As a consequence, a -+ * non-queue entity is not a candidate for next-service only if it has -+ * only one active child. And only if this condition holds, then this -+ * function returns true for a non-queue entity. -+ */ -+static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) -+{ -+ struct bfq_group *bfqg; -+ -+ if (bfq_entity_to_bfqq(entity)) -+ return true; -+ -+ bfqg = container_of(entity, struct bfq_group, entity); -+ -+ BUG_ON(bfqg == ((struct bfq_data *)(bfqg->bfqd))->root_group); -+ BUG_ON(bfqg->active_entities == 0); -+ /* -+ * The field active_entities does not always contain the -+ * actual number of active children entities: it happens to -+ * not account for the in-service entity in case the latter is -+ * removed from its active tree (which may get done after -+ * invoking the function bfq_no_longer_next_in_service in -+ * bfq_get_next_queue). Fortunately, here, i.e., while -+ * bfq_no_longer_next_in_service is not yet completed in -+ * bfq_get_next_queue, bfq_active_extract has not yet been -+ * invoked, and thus active_entities still coincides with the -+ * actual number of active entities. 
-+ */ -+ if (bfqg->active_entities == 1) -+ return true; -+ -+ return false; -+} -+ -+#else /* BFQ_GROUP_IOSCHED_ENABLED */ -+#define for_each_entity(entity) \ -+ for (; entity ; entity = NULL) -+ -+#define for_each_entity_safe(entity, parent) \ -+ for (parent = NULL; entity ; entity = parent) -+ -+static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) -+{ -+ return false; -+} -+ -+static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) -+{ -+ return true; -+} -+ -+#endif /* BFQ_GROUP_IOSCHED_ENABLED */ -+ -+/* -+ * Shift for timestamp calculations. This actually limits the maximum -+ * service allowed in one timestamp delta (small shift values increase it), -+ * the maximum total weight that can be used for the queues in the system -+ * (big shift values increase it), and the period of virtual time -+ * wraparounds. -+ */ -+#define WFQ_SERVICE_SHIFT 22 -+ -+static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = NULL; -+ -+ BUG_ON(!entity); -+ -+ if (!entity->my_sched_data) -+ bfqq = container_of(entity, struct bfq_queue, entity); -+ -+ return bfqq; -+} -+ -+ -+/** -+ * bfq_delta - map service into the virtual time domain. -+ * @service: amount of service. -+ * @weight: scale factor (weight of an entity or weight sum). -+ */ -+static u64 bfq_delta(unsigned long service, unsigned long weight) -+{ -+ u64 d = (u64)service << WFQ_SERVICE_SHIFT; -+ -+ do_div(d, weight); -+ return d; -+} -+ -+/** -+ * bfq_calc_finish - assign the finish time to an entity. -+ * @entity: the entity to act upon. -+ * @service: the service to be charged to the entity. 
-+ */ -+static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ unsigned long long start, finish, delta; -+ -+ BUG_ON(entity->weight == 0); -+ -+ entity->finish = entity->start + -+ bfq_delta(service, entity->weight); -+ -+ start = ((entity->start>>10)*1000)>>12; -+ finish = ((entity->finish>>10)*1000)>>12; -+ delta = ((bfq_delta(service, entity->weight)>>10)*1000)>>12; -+ -+ if (bfqq) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "serv %lu, w %d", -+ service, entity->weight); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "start %llu, finish %llu, delta %llu", -+ start, finish, delta); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ } else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "group: serv %lu, w %d", -+ service, entity->weight); -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "group: start %llu, finish %llu, delta %llu", -+ start, finish, delta); -+#endif -+ } -+} -+ -+/** -+ * bfq_entity_of - get an entity from a node. -+ * @node: the node field of the entity. -+ * -+ * Convert a node pointer to the relative entity. This is used only -+ * to simplify the logic of some functions and not as the generic -+ * conversion mechanism because, e.g., in the tree walking functions, -+ * the check for a %NULL value would be redundant. -+ */ -+static struct bfq_entity *bfq_entity_of(struct rb_node *node) -+{ -+ struct bfq_entity *entity = NULL; -+ -+ if (node) -+ entity = rb_entry(node, struct bfq_entity, rb_node); -+ -+ return entity; -+} -+ -+/** -+ * bfq_extract - remove an entity from a tree. -+ * @root: the tree root. -+ * @entity: the entity to remove. -+ */ -+static void bfq_extract(struct rb_root *root, struct bfq_entity *entity) -+{ -+ BUG_ON(entity->tree != root); -+ -+ entity->tree = NULL; -+ rb_erase(&entity->rb_node, root); -+} -+ -+/** -+ * bfq_idle_extract - extract an entity from the idle tree. 
-+ * @st: the service tree of the owning @entity. -+ * @entity: the entity being removed. -+ */ -+static void bfq_idle_extract(struct bfq_service_tree *st, -+ struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ struct rb_node *next; -+ -+ BUG_ON(entity->tree != &st->idle); -+ -+ if (entity == st->first_idle) { -+ next = rb_next(&entity->rb_node); -+ st->first_idle = bfq_entity_of(next); -+ } -+ -+ if (entity == st->last_idle) { -+ next = rb_prev(&entity->rb_node); -+ st->last_idle = bfq_entity_of(next); -+ } -+ -+ bfq_extract(&st->idle, entity); -+ -+ if (bfqq) -+ list_del(&bfqq->bfqq_list); -+} -+ -+/** -+ * bfq_insert - generic tree insertion. -+ * @root: tree root. -+ * @entity: entity to insert. -+ * -+ * This is used for the idle and the active tree, since they are both -+ * ordered by finish time. -+ */ -+static void bfq_insert(struct rb_root *root, struct bfq_entity *entity) -+{ -+ struct bfq_entity *entry; -+ struct rb_node **node = &root->rb_node; -+ struct rb_node *parent = NULL; -+ -+ BUG_ON(entity->tree); -+ -+ while (*node) { -+ parent = *node; -+ entry = rb_entry(parent, struct bfq_entity, rb_node); -+ -+ if (bfq_gt(entry->finish, entity->finish)) -+ node = &parent->rb_left; -+ else -+ node = &parent->rb_right; -+ } -+ -+ rb_link_node(&entity->rb_node, parent, node); -+ rb_insert_color(&entity->rb_node, root); -+ -+ entity->tree = root; -+} -+ -+/** -+ * bfq_update_min - update the min_start field of a entity. -+ * @entity: the entity to update. -+ * @node: one of its children. -+ * -+ * This function is called when @entity may store an invalid value for -+ * min_start due to updates to the active tree. The function assumes -+ * that the subtree rooted at @node (which may be its left or its right -+ * child) has a valid min_start value. 
-+ */ -+static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node) -+{ -+ struct bfq_entity *child; -+ -+ if (node) { -+ child = rb_entry(node, struct bfq_entity, rb_node); -+ if (bfq_gt(entity->min_start, child->min_start)) -+ entity->min_start = child->min_start; -+ } -+} -+ -+/** -+ * bfq_update_active_node - recalculate min_start. -+ * @node: the node to update. -+ * -+ * @node may have changed position or one of its children may have moved, -+ * this function updates its min_start value. The left and right subtrees -+ * are assumed to hold a correct min_start value. -+ */ -+static void bfq_update_active_node(struct rb_node *node) -+{ -+ struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node); -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ entity->min_start = entity->start; -+ bfq_update_min(entity, node->rb_right); -+ bfq_update_min(entity, node->rb_left); -+ -+ if (bfqq) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "new min_start %llu", -+ ((entity->min_start>>10)*1000)>>12); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ } else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "new min_start %llu", -+ ((entity->min_start>>10)*1000)>>12); -+#endif -+ } -+} -+ -+/** -+ * bfq_update_active_tree - update min_start for the whole active tree. -+ * @node: the starting node. -+ * -+ * @node must be the deepest modified node after an update. This function -+ * updates its min_start using the values held by its children, assuming -+ * that they did not change, and then updates all the nodes that may have -+ * changed in the path to the root. The only nodes that may have changed -+ * are the ones in the path or their siblings. 
-+ */ -+static void bfq_update_active_tree(struct rb_node *node) -+{ -+ struct rb_node *parent; -+ -+up: -+ bfq_update_active_node(node); -+ -+ parent = rb_parent(node); -+ if (!parent) -+ return; -+ -+ if (node == parent->rb_left && parent->rb_right) -+ bfq_update_active_node(parent->rb_right); -+ else if (parent->rb_left) -+ bfq_update_active_node(parent->rb_left); -+ -+ node = parent; -+ goto up; -+} -+ -+static void bfq_weights_tree_add(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ struct rb_root *root); -+ -+static void __bfq_weights_tree_remove(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ struct rb_root *root); -+ -+static void bfq_weights_tree_remove(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq); -+ -+ -+/** -+ * bfq_active_insert - insert an entity in the active tree of its -+ * group/device. -+ * @st: the service tree of the entity. -+ * @entity: the entity being inserted. -+ * -+ * The active tree is ordered by finish time, but an extra key is kept -+ * per each node, containing the minimum value for the start times of -+ * its children (and the node itself), so it's possible to search for -+ * the eligible node with the lowest finish time in logarithmic time. 
-+ */ -+static void bfq_active_insert(struct bfq_service_tree *st, -+ struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ struct rb_node *node = &entity->rb_node; -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ struct bfq_sched_data *sd = NULL; -+ struct bfq_group *bfqg = NULL; -+ struct bfq_data *bfqd = NULL; -+#endif -+ -+ bfq_insert(&st->active, entity); -+ -+ if (node->rb_left) -+ node = node->rb_left; -+ else if (node->rb_right) -+ node = node->rb_right; -+ -+ bfq_update_active_tree(node); -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ sd = entity->sched_data; -+ bfqg = container_of(sd, struct bfq_group, sched_data); -+ BUG_ON(!bfqg); -+ bfqd = (struct bfq_data *)bfqg->bfqd; -+#endif -+ if (bfqq) -+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ if (bfqg != bfqd->root_group) { -+ BUG_ON(!bfqg); -+ BUG_ON(!bfqd); -+ bfqg->active_entities++; -+ } -+#endif -+} -+ -+/** -+ * bfq_ioprio_to_weight - calc a weight from an ioprio. -+ * @ioprio: the ioprio value to convert. -+ */ -+static unsigned short bfq_ioprio_to_weight(int ioprio) -+{ -+ BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); -+ return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF; -+} -+ -+/** -+ * bfq_weight_to_ioprio - calc an ioprio from a weight. -+ * @weight: the weight value to convert. -+ * -+ * To preserve as much as possible the old only-ioprio user interface, -+ * 0 is used as an escape ioprio value for weights (numerically) equal or -+ * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF. -+ */ -+static unsigned short bfq_weight_to_ioprio(int weight) -+{ -+ BUG_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT); -+ return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight < 0 ? 
-+ 0 : IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight; -+} -+ -+static void bfq_get_entity(struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ if (bfqq) { -+ bfqq->ref++; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "%p %d", -+ bfqq, bfqq->ref); -+ } -+} -+ -+/** -+ * bfq_find_deepest - find the deepest node that an extraction can modify. -+ * @node: the node being removed. -+ * -+ * Do the first step of an extraction in an rb tree, looking for the -+ * node that will replace @node, and returning the deepest node that -+ * the following modifications to the tree can touch. If @node is the -+ * last node in the tree return %NULL. -+ */ -+static struct rb_node *bfq_find_deepest(struct rb_node *node) -+{ -+ struct rb_node *deepest; -+ -+ if (!node->rb_right && !node->rb_left) -+ deepest = rb_parent(node); -+ else if (!node->rb_right) -+ deepest = node->rb_left; -+ else if (!node->rb_left) -+ deepest = node->rb_right; -+ else { -+ deepest = rb_next(node); -+ if (deepest->rb_right) -+ deepest = deepest->rb_right; -+ else if (rb_parent(deepest) != node) -+ deepest = rb_parent(deepest); -+ } -+ -+ return deepest; -+} -+ -+/** -+ * bfq_active_extract - remove an entity from the active tree. -+ * @st: the service_tree containing the tree. -+ * @entity: the entity being removed. 
-+ */ -+static void bfq_active_extract(struct bfq_service_tree *st, -+ struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ struct rb_node *node; -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ struct bfq_sched_data *sd = NULL; -+ struct bfq_group *bfqg = NULL; -+ struct bfq_data *bfqd = NULL; -+#endif -+ -+ node = bfq_find_deepest(&entity->rb_node); -+ bfq_extract(&st->active, entity); -+ -+ if (node) -+ bfq_update_active_tree(node); -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ sd = entity->sched_data; -+ bfqg = container_of(sd, struct bfq_group, sched_data); -+ BUG_ON(!bfqg); -+ bfqd = (struct bfq_data *)bfqg->bfqd; -+#endif -+ if (bfqq) -+ list_del(&bfqq->bfqq_list); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ if (bfqg != bfqd->root_group) { -+ BUG_ON(!bfqg); -+ BUG_ON(!bfqd); -+ BUG_ON(!bfqg->active_entities); -+ bfqg->active_entities--; -+ } -+#endif -+} -+ -+/** -+ * bfq_idle_insert - insert an entity into the idle tree. -+ * @st: the service tree containing the tree. -+ * @entity: the entity to insert. -+ */ -+static void bfq_idle_insert(struct bfq_service_tree *st, -+ struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ struct bfq_entity *first_idle = st->first_idle; -+ struct bfq_entity *last_idle = st->last_idle; -+ -+ if (!first_idle || bfq_gt(first_idle->finish, entity->finish)) -+ st->first_idle = entity; -+ if (!last_idle || bfq_gt(entity->finish, last_idle->finish)) -+ st->last_idle = entity; -+ -+ bfq_insert(&st->idle, entity); -+ -+ if (bfqq) -+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list); -+} -+ -+/** -+ * bfq_forget_entity - do not consider entity any longer for scheduling -+ * @st: the service tree. -+ * @entity: the entity being removed. -+ * @is_in_service: true if entity is currently the in-service entity. -+ * -+ * Forget everything about @entity. 
In addition, if entity represents -+ * a queue, and the latter is not in service, then release the service -+ * reference to the queue (the one taken through bfq_get_entity). In -+ * fact, in this case, there is really no more service reference to -+ * the queue, as the latter is also outside any service tree. If, -+ * instead, the queue is in service, then __bfq_bfqd_reset_in_service -+ * will take care of putting the reference when the queue finally -+ * stops being served. -+ */ -+static void bfq_forget_entity(struct bfq_service_tree *st, -+ struct bfq_entity *entity, -+ bool is_in_service) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ BUG_ON(!entity->on_st); -+ -+ entity->on_st = false; -+ st->wsum -= entity->weight; -+ if (bfqq && !is_in_service) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "(before): %p %d", -+ bfqq, bfqq->ref); -+ bfq_put_queue(bfqq); -+ } -+} -+ -+/** -+ * bfq_put_idle_entity - release the idle tree ref of an entity. -+ * @st: service tree for the entity. -+ * @entity: the entity being released. -+ */ -+static void bfq_put_idle_entity(struct bfq_service_tree *st, -+ struct bfq_entity *entity) -+{ -+ bfq_idle_extract(st, entity); -+ bfq_forget_entity(st, entity, -+ entity == entity->sched_data->in_service_entity); -+} -+ -+/** -+ * bfq_forget_idle - update the idle tree if necessary. -+ * @st: the service tree to act upon. -+ * -+ * To preserve the global O(log N) complexity we only remove one entry here; -+ * as the idle tree will not grow indefinitely this can be done safely. -+ */ -+static void bfq_forget_idle(struct bfq_service_tree *st) -+{ -+ struct bfq_entity *first_idle = st->first_idle; -+ struct bfq_entity *last_idle = st->last_idle; -+ -+ if (RB_EMPTY_ROOT(&st->active) && last_idle && -+ !bfq_gt(last_idle->finish, st->vtime)) { -+ /* -+ * Forget the whole idle tree, increasing the vtime past -+ * the last finish time of idle entities. 
-+ */ -+ st->vtime = last_idle->finish; -+ } -+ -+ if (first_idle && !bfq_gt(first_idle->finish, st->vtime)) -+ bfq_put_idle_entity(st, first_idle); -+} -+ -+/* -+ * Update weight and priority of entity. If update_class_too is true, -+ * then update the ioprio_class of entity too. -+ * -+ * The reason why the update of ioprio_class is controlled through the -+ * last parameter is as follows. Changing the ioprio class of an -+ * entity implies changing the destination service trees for that -+ * entity. If such a change occurred when the entity is already on one -+ * of the service trees for its previous class, then the state of the -+ * entity would become more complex: none of the new possible service -+ * trees for the entity, according to bfq_entity_service_tree(), would -+ * match any of the possible service trees on which the entity -+ * is. Complex operations involving these trees, such as entity -+ * activations and deactivations, should take into account this -+ * additional complexity. To avoid this issue, this function is -+ * invoked with update_class_too unset in the points in the code where -+ * entity may happen to be on some tree. 
-+ */ -+static struct bfq_service_tree * -+__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, -+ struct bfq_entity *entity, -+ bool update_class_too) -+{ -+ struct bfq_service_tree *new_st = old_st; -+ -+ if (entity->prio_changed) { -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ unsigned int prev_weight, new_weight; -+ struct bfq_data *bfqd = NULL; -+ struct rb_root *root; -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ struct bfq_sched_data *sd; -+ struct bfq_group *bfqg; -+#endif -+ -+ if (bfqq) -+ bfqd = bfqq->bfqd; -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ else { -+ sd = entity->my_sched_data; -+ bfqg = container_of(sd, struct bfq_group, sched_data); -+ BUG_ON(!bfqg); -+ bfqd = (struct bfq_data *)bfqg->bfqd; -+ BUG_ON(!bfqd); -+ } -+#endif -+ -+ BUG_ON(entity->tree && update_class_too); -+ BUG_ON(old_st->wsum < entity->weight); -+ old_st->wsum -= entity->weight; -+ -+ if (entity->new_weight != entity->orig_weight) { -+ if (entity->new_weight < BFQ_MIN_WEIGHT || -+ entity->new_weight > BFQ_MAX_WEIGHT) { -+ pr_crit("update_weight_prio: new_weight %d\n", -+ entity->new_weight); -+ if (entity->new_weight < BFQ_MIN_WEIGHT) -+ entity->new_weight = BFQ_MIN_WEIGHT; -+ else -+ entity->new_weight = BFQ_MAX_WEIGHT; -+ } -+ entity->orig_weight = entity->new_weight; -+ if (bfqq) -+ bfqq->ioprio = -+ bfq_weight_to_ioprio(entity->orig_weight); -+ } -+ -+ if (bfqq && update_class_too) -+ bfqq->ioprio_class = bfqq->new_ioprio_class; -+ -+ /* -+ * Reset prio_changed only if the ioprio_class change -+ * is not pending any longer. -+ */ -+ if (!bfqq || bfqq->ioprio_class == bfqq->new_ioprio_class) -+ entity->prio_changed = 0; -+ -+ /* -+ * NOTE: here we may be changing the weight too early, -+ * this will cause unfairness. The correct approach -+ * would have required additional complexity to defer -+ * weight changes to the proper time instants (i.e., -+ * when entity->finish <= old_st->vtime). 
-+ */ -+ new_st = bfq_entity_service_tree(entity); -+ -+ prev_weight = entity->weight; -+ new_weight = entity->orig_weight * -+ (bfqq ? bfqq->wr_coeff : 1); -+ /* -+ * If the weight of the entity changes and the entity is a -+ * queue, remove the entity from its old weight counter (if -+ * there is a counter associated with the entity). -+ */ -+ if (prev_weight != new_weight && bfqq) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "weight changed %d %d(%d %d)", -+ prev_weight, new_weight, -+ entity->orig_weight, -+ bfqq->wr_coeff); -+ -+ root = &bfqd->queue_weights_tree; -+ __bfq_weights_tree_remove(bfqd, bfqq, root); -+ } -+ entity->weight = new_weight; -+ /* -+ * Add the entity, if it is not a weight-raised queue, to the -+ * counter associated with its new weight. -+ */ -+ if (prev_weight != new_weight && bfqq && bfqq->wr_coeff == 1) { -+ /* If we get here, root has been initialized. */ -+ bfq_weights_tree_add(bfqd, bfqq, root); -+ } -+ -+ new_st->wsum += entity->weight; -+ -+ if (new_st != old_st) { -+ BUG_ON(!update_class_too); -+ entity->start = new_st->vtime; -+ } -+ } -+ -+ return new_st; -+} -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg); -+#endif -+ -+/** -+ * bfq_bfqq_served - update the scheduler status after selection for -+ * service. -+ * @bfqq: the queue being served. -+ * @served: bytes to transfer. -+ * -+ * NOTE: this can be optimized, as the timestamps of upper level entities -+ * are synchronized every time a new bfqq is selected for service. By now, -+ * we keep it to better check consistency. 
-+ */ -+static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ struct bfq_service_tree *st; -+ -+ if (!bfqq->service_from_backlogged) -+ bfqq->first_IO_time = jiffies; -+ -+ if (bfqq->wr_coeff > 1) -+ bfqq->service_from_wr += served; -+ -+ bfqq->service_from_backlogged += served; -+ for_each_entity(entity) { -+ st = bfq_entity_service_tree(entity); -+ -+ entity->service += served; -+ -+ BUG_ON(st->wsum == 0); -+ -+ st->vtime += bfq_delta(served, st->wsum); -+ bfq_forget_idle(st); -+ } -+#ifndef BFQ_MQ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ bfqg_stats_set_start_empty_time(bfqq_group(bfqq)); -+#endif -+#endif -+ st = bfq_entity_service_tree(&bfqq->entity); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs, vtime %llu on %p", -+ served, ((st->vtime>>10)*1000)>>12, st); -+} -+ -+/** -+ * bfq_bfqq_charge_time - charge an amount of service equivalent to the length -+ * of the time interval during which bfqq has been in -+ * service. -+ * @bfqd: the device -+ * @bfqq: the queue that needs a service update. -+ * @time_ms: the amount of time during which the queue has received service -+ * -+ * If a queue does not consume its budget fast enough, then providing -+ * the queue with service fairness may impair throughput, more or less -+ * severely. For this reason, queues that consume their budget slowly -+ * are provided with time fairness instead of service fairness. This -+ * goal is achieved through the BFQ scheduling engine, even if such an -+ * engine works in the service, and not in the time domain. The trick -+ * is charging these queues with an inflated amount of service, equal -+ * to the amount of service that they would have received during their -+ * service slot if they had been fast, i.e., if their requests had -+ * been dispatched at a rate equal to the estimated peak rate. 
-+ * -+ * It is worth noting that time fairness can cause important -+ * distortions in terms of bandwidth distribution, on devices with -+ * internal queueing. The reason is that I/O requests dispatched -+ * during the service slot of a queue may be served after that service -+ * slot is finished, and may have a total processing time loosely -+ * correlated with the duration of the service slot. This is -+ * especially true for short service slots. -+ */ -+static void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ unsigned long time_ms) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ unsigned long timeout_ms = jiffies_to_msecs(bfq_timeout); -+ unsigned long bounded_time_ms = min(time_ms, timeout_ms); -+ int serv_to_charge_for_time = -+ (bfqd->bfq_max_budget * bounded_time_ms) / timeout_ms; -+ int tot_serv_to_charge = max(serv_to_charge_for_time, entity->service); -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "%lu/%lu ms, %d/%d/%d/%d sectors", -+ time_ms, timeout_ms, -+ entity->service, -+ tot_serv_to_charge, -+ bfqd->bfq_max_budget, -+ entity->budget); -+ -+ /* Increase budget to avoid inconsistencies */ -+ if (tot_serv_to_charge > entity->budget) -+ entity->budget = tot_serv_to_charge; -+ -+ bfq_bfqq_served(bfqq, -+ max_t(int, 0, tot_serv_to_charge - entity->service)); -+} -+ -+static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, -+ struct bfq_service_tree *st, -+ bool backshifted) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ struct bfq_sched_data *sd = entity->sched_data; -+ -+ /* -+ * When this function is invoked, entity is not in any service -+ * tree, then it is safe to invoke next function with the last -+ * parameter set (see the comments on the function). 
-+ */ -+ BUG_ON(entity->tree); -+ st = __bfq_entity_update_weight_prio(st, entity, true); -+ bfq_calc_finish(entity, entity->budget); -+ -+ /* -+ * If some queues enjoy backshifting for a while, then their -+ * (virtual) finish timestamps may happen to become lower and -+ * lower than the system virtual time. In particular, if -+ * these queues often happen to be idle for short time -+ * periods, and during such time periods other queues with -+ * higher timestamps happen to be busy, then the backshifted -+ * timestamps of the former queues can become much lower than -+ * the system virtual time. In fact, to serve the queues with -+ * higher timestamps while the ones with lower timestamps are -+ * idle, the system virtual time may be pushed-up to much -+ * higher values than the finish timestamps of the idle -+ * queues. As a consequence, the finish timestamps of all new -+ * or newly activated queues may end up being much larger than -+ * those of lucky queues with backshifted timestamps. The -+ * latter queues may then monopolize the device for a lot of -+ * time. This would simply break service guarantees. -+ * -+ * To reduce this problem, push up a little bit the -+ * backshifted timestamps of the queue associated with this -+ * entity (only a queue can happen to have the backshifted -+ * flag set): just enough to let the finish timestamp of the -+ * queue be equal to the current value of the system virtual -+ * time. This may introduce a little unfairness among queues -+ * with backshifted timestamps, but it does not break -+ * worst-case fairness guarantees. -+ * -+ * As a special case, if bfqq is weight-raised, push up -+ * timestamps much less, to keep very low the probability that -+ * this push up causes the backshifted finish timestamps of -+ * weight-raised queues to become higher than the backshifted -+ * finish timestamps of non weight-raised queues. 
-+ */ -+ if (backshifted && bfq_gt(st->vtime, entity->finish)) { -+ unsigned long delta = st->vtime - entity->finish; -+ -+ if (bfqq) -+ delta /= bfqq->wr_coeff; -+ -+ entity->start += delta; -+ entity->finish += delta; -+ -+ if (bfqq) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "new queue finish %llu", -+ ((entity->finish>>10)*1000)>>12); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ } else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "new group finish %llu", -+ ((entity->finish>>10)*1000)>>12); -+#endif -+ } -+ } -+ -+ bfq_active_insert(st, entity); -+ -+ if (bfqq) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "queue %seligible in st %p", -+ entity->start <= st->vtime ? "" : "non ", st); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ } else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "group %seligible in st %p", -+ entity->start <= st->vtime ? "" : "non ", st); -+#endif -+ } -+ BUG_ON(RB_EMPTY_ROOT(&st->active)); -+ BUG_ON(&st->active != &sd->service_tree->active && -+ &st->active != &(sd->service_tree+1)->active && -+ &st->active != &(sd->service_tree+2)->active); -+} -+ -+/** -+ * __bfq_activate_entity - handle activation of entity. -+ * @entity: the entity being activated. -+ * @non_blocking_wait_rq: true if entity was waiting for a request -+ * -+ * Called for a 'true' activation, i.e., if entity is not active and -+ * one of its children receives a new request. -+ * -+ * Basically, this function updates the timestamps of entity and -+ * inserts entity into its active tree, after possibly extracting it -+ * from its idle tree. 
-+ */ -+static void __bfq_activate_entity(struct bfq_entity *entity, -+ bool non_blocking_wait_rq) -+{ -+ struct bfq_sched_data *sd = entity->sched_data; -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity); -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ bool backshifted = false; -+ unsigned long long min_vstart; -+ -+ BUG_ON(!sd); -+ BUG_ON(!st); -+ -+ /* See comments on bfq_fqq_update_budg_for_activation */ -+ if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) { -+ backshifted = true; -+ min_vstart = entity->finish; -+ } else -+ min_vstart = st->vtime; -+ -+ if (entity->tree == &st->idle) { -+ /* -+ * Must be on the idle tree, bfq_idle_extract() will -+ * check for that. -+ */ -+ bfq_idle_extract(st, entity); -+ BUG_ON(entity->tree); -+ entity->start = bfq_gt(min_vstart, entity->finish) ? -+ min_vstart : entity->finish; -+ } else { -+ BUG_ON(entity->tree); -+ /* -+ * The finish time of the entity may be invalid, and -+ * it is in the past for sure, otherwise the queue -+ * would have been on the idle tree. -+ */ -+ entity->start = min_vstart; -+ st->wsum += entity->weight; -+ /* -+ * entity is about to be inserted into a service tree, -+ * and then set in service: get a reference to make -+ * sure entity does not disappear until it is no -+ * longer in service or scheduled for service. 
-+ */ -+ bfq_get_entity(entity); -+ -+ BUG_ON(entity->on_st && bfqq); -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ if (entity->on_st && !bfqq) { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, -+ entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, -+ bfqg, -+ "activate bug, class %d in_service %p", -+ bfq_class_idx(entity), sd->in_service_entity); -+ } -+#endif -+ BUG_ON(entity->on_st && !bfqq); -+ entity->on_st = true; -+ } -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ if (!bfq_entity_to_bfqq(entity)) { /* bfq_group */ -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ struct bfq_data *bfqd = bfqg->bfqd; -+ -+ BUG_ON(!bfqd); -+ if (!entity->in_groups_with_pending_reqs) { -+ entity->in_groups_with_pending_reqs = true; -+ bfqd->num_groups_with_pending_reqs++; -+ } -+ bfq_log_bfqg(bfqd, bfqg, "num_groups_with_pending_reqs %u", -+ bfqd->num_groups_with_pending_reqs); -+ } -+#endif -+ -+ bfq_update_fin_time_enqueue(entity, st, backshifted); -+} -+ -+/** -+ * __bfq_requeue_entity - handle requeueing or repositioning of an entity. -+ * @entity: the entity being requeued or repositioned. -+ * -+ * Requeueing is needed if this entity stops being served, which -+ * happens if a leaf descendant entity has expired. On the other hand, -+ * repositioning is needed if the next_inservice_entity for the child -+ * entity has changed. See the comments inside the function for -+ * details. -+ * -+ * Basically, this function: 1) removes entity from its active tree if -+ * present there, 2) updates the timestamps of entity and 3) inserts -+ * entity back into its active tree (in the new, right position for -+ * the new values of the timestamps). 
-+ */ -+static void __bfq_requeue_entity(struct bfq_entity *entity) -+{ -+ struct bfq_sched_data *sd = entity->sched_data; -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity); -+ -+ BUG_ON(!sd); -+ BUG_ON(!st); -+ -+ BUG_ON(entity != sd->in_service_entity && -+ entity->tree != &st->active); -+ -+ if (entity == sd->in_service_entity) { -+ /* -+ * We are requeueing the current in-service entity, -+ * which may have to be done for one of the following -+ * reasons: -+ * - entity represents the in-service queue, and the -+ * in-service queue is being requeued after an -+ * expiration; -+ * - entity represents a group, and its budget has -+ * changed because one of its child entities has -+ * just been either activated or requeued for some -+ * reason; the timestamps of the entity need then to -+ * be updated, and the entity needs to be enqueued -+ * or repositioned accordingly. -+ * -+ * In particular, before requeueing, the start time of -+ * the entity must be moved forward to account for the -+ * service that the entity has received while in -+ * service. This is done by the next instructions. The -+ * finish time will then be updated according to this -+ * new value of the start time, and to the budget of -+ * the entity. -+ */ -+ bfq_calc_finish(entity, entity->service); -+ entity->start = entity->finish; -+ BUG_ON(entity->tree && entity->tree == &st->idle); -+ BUG_ON(entity->tree && entity->tree != &st->active); -+ /* -+ * In addition, if the entity had more than one child -+ * when set in service, then it was not extracted from -+ * the active tree. This implies that the position of -+ * the entity in the active tree may need to be -+ * changed now, because we have just updated the start -+ * time of the entity, and we will update its finish -+ * time in a moment (the requeueing is then, more -+ * precisely, a repositioning in this case). 
To -+ * implement this repositioning, we: 1) dequeue the -+ * entity here, 2) update the finish time and requeue -+ * the entity according to the new timestamps below. -+ */ -+ if (entity->tree) -+ bfq_active_extract(st, entity); -+ } else { /* The entity is already active, and not in service */ -+ /* -+ * In this case, this function gets called only if the -+ * next_in_service entity below this entity has -+ * changed, and this change has caused the budget of -+ * this entity to change, which, finally implies that -+ * the finish time of this entity must be -+ * updated. Such an update may cause the scheduling, -+ * i.e., the position in the active tree, of this -+ * entity to change. We handle this change by: 1) -+ * dequeueing the entity here, 2) updating the finish -+ * time and requeueing the entity according to the new -+ * timestamps below. This is the same approach as the -+ * non-extracted-entity sub-case above. -+ */ -+ bfq_active_extract(st, entity); -+ } -+ -+ bfq_update_fin_time_enqueue(entity, st, false); -+} -+ -+static void __bfq_activate_requeue_entity(struct bfq_entity *entity, -+ struct bfq_sched_data *sd, -+ bool non_blocking_wait_rq) -+{ -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity); -+ -+ if (sd->in_service_entity == entity || entity->tree == &st->active) -+ /* -+ * in service or already queued on the active tree, -+ * requeue or reposition -+ */ -+ __bfq_requeue_entity(entity); -+ else -+ /* -+ * Not in service and not queued on its active tree: -+ * the activity is idle and this is a true activation. -+ */ -+ __bfq_activate_entity(entity, non_blocking_wait_rq); -+} -+ -+ -+/** -+ * bfq_activate_requeue_entity - activate or requeue an entity representing a bfq_queue, -+ * and activate, requeue or reposition all ancestors -+ * for which such an update becomes necessary. -+ * @entity: the entity to activate. 
-+ * @non_blocking_wait_rq: true if this entity was waiting for a request -+ * @requeue: true if this is a requeue, which implies that bfqq is -+ * being expired; thus ALL its ancestors stop being served and must -+ * therefore be requeued -+ * @expiration: true if this function is being invoked in the expiration path -+ * of the in-service queue -+ */ -+static void bfq_activate_requeue_entity(struct bfq_entity *entity, -+ bool non_blocking_wait_rq, -+ bool requeue, bool expiration) -+{ -+ struct bfq_sched_data *sd; -+ -+ for_each_entity(entity) { -+ BUG_ON(!entity); -+ sd = entity->sched_data; -+ __bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq); -+ -+ BUG_ON(RB_EMPTY_ROOT(&sd->service_tree->active) && -+ RB_EMPTY_ROOT(&(sd->service_tree+1)->active) && -+ RB_EMPTY_ROOT(&(sd->service_tree+2)->active)); -+ -+ if (!bfq_update_next_in_service(sd, entity, expiration) && -+ !requeue) { -+ BUG_ON(!sd->next_in_service); -+ break; -+ } -+ BUG_ON(!sd->next_in_service); -+ } -+} -+ -+/** -+ * __bfq_deactivate_entity - update sched_data and service trees for -+ * entity, so as to represent entity as inactive -+ * @entity: the entity being deactivated. -+ * @ins_into_idle_tree: if false, the entity will not be put into the -+ * idle tree. -+ * -+ * If necessary and allowed, puts entity into the idle tree. NOTE: -+ * entity may be on no tree if in service. -+ */ -+static bool __bfq_deactivate_entity(struct bfq_entity *entity, -+ bool ins_into_idle_tree) -+{ -+ struct bfq_sched_data *sd = entity->sched_data; -+ struct bfq_service_tree *st; -+ bool is_in_service; -+ -+ if (!entity->on_st) { /* entity never activated, or already inactive */ -+ BUG_ON(sd && entity == sd->in_service_entity); -+ return false; -+ } -+ -+ /* -+ * If we get here, then entity is active, which implies that -+ * bfq_group_set_parent has already been invoked for the group -+ * represented by entity. Therefore, the field -+ * entity->sched_data has been set, and we can safely use it. 
-+ */ -+ st = bfq_entity_service_tree(entity); -+ is_in_service = entity == sd->in_service_entity; -+ -+ BUG_ON(is_in_service && entity->tree && entity->tree != &st->active); -+ -+ bfq_calc_finish(entity, entity->service); -+ -+ if (is_in_service) { -+ sd->in_service_entity = NULL; -+ } else -+ /* -+ * Non in-service entity: nobody will take care of -+ * resetting its service counter on expiration. Do it -+ * now. -+ */ -+ entity->service = 0; -+ -+ if (entity->tree == &st->active) -+ bfq_active_extract(st, entity); -+ else if (!is_in_service && entity->tree == &st->idle) -+ bfq_idle_extract(st, entity); -+ else if (entity->tree) -+ BUG(); -+ -+ if (!ins_into_idle_tree || !bfq_gt(entity->finish, st->vtime)) -+ bfq_forget_entity(st, entity, is_in_service); -+ else -+ bfq_idle_insert(st, entity); -+ -+ return true; -+} -+ -+/** -+ * bfq_deactivate_entity - deactivate an entity representing a bfq_queue. -+ * @entity: the entity to deactivate. -+ * @ins_into_idle_tree: true if the entity can be put into the idle tree -+ * @expiration: true if this function is being invoked in the expiration path -+ * of the in-service queue -+ */ -+static void bfq_deactivate_entity(struct bfq_entity *entity, -+ bool ins_into_idle_tree, -+ bool expiration) -+{ -+ struct bfq_sched_data *sd; -+ struct bfq_entity *parent = NULL; -+ -+ for_each_entity_safe(entity, parent) { -+ sd = entity->sched_data; -+ -+ BUG_ON(sd == NULL); /* -+ * It would mean that this is the -+ * root group. -+ */ -+ -+ BUG_ON(expiration && entity != sd->in_service_entity); -+ -+ BUG_ON(entity != sd->in_service_entity && -+ entity->tree == -+ &bfq_entity_service_tree(entity)->active && -+ !sd->next_in_service); -+ -+ if (!__bfq_deactivate_entity(entity, ins_into_idle_tree)) { -+ /* -+ * entity is not in any tree any more, so -+ * this deactivation is a no-op, and there is -+ * nothing to change for upper-level entities -+ * (in case of expiration, this can never -+ * happen). 
-+ */ -+ BUG_ON(expiration); /* -+ * entity cannot be already out of -+ * any tree -+ */ -+ return; -+ } -+ -+ if (sd->next_in_service == entity) -+ /* -+ * entity was the next_in_service entity, -+ * then, since entity has just been -+ * deactivated, a new one must be found. -+ */ -+ bfq_update_next_in_service(sd, NULL, expiration); -+ -+ if (sd->next_in_service || sd->in_service_entity) { -+ /* -+ * The parent entity is still active, because -+ * either next_in_service or in_service_entity -+ * is not NULL. So, no further upwards -+ * deactivation must be performed. Yet, -+ * next_in_service has changed. Then the -+ * schedule does need to be updated upwards. -+ * -+ * NOTE If in_service_entity is not NULL, then -+ * next_in_service may happen to be NULL, -+ * although the parent entity is evidently -+ * active. This happens if 1) the entity -+ * pointed by in_service_entity is the only -+ * active entity in the parent entity, and 2) -+ * according to the definition of -+ * next_in_service, the in_service_entity -+ * cannot be considered as -+ * next_in_service. See the comments on the -+ * definition of next_in_service for details. -+ */ -+ BUG_ON(sd->next_in_service == entity); -+ BUG_ON(sd->in_service_entity == entity); -+ break; -+ } -+ -+ /* -+ * If we get here, then the parent is no more -+ * backlogged and we need to propagate the -+ * deactivation upwards. Thus let the loop go on. -+ */ -+ -+ /* -+ * Also let parent be queued into the idle tree on -+ * deactivation, to preserve service guarantees, and -+ * assuming that who invoked this function does not -+ * need parent entities too to be removed completely. -+ */ -+ ins_into_idle_tree = true; -+ } -+ -+ /* -+ * If the deactivation loop is fully executed, then there are -+ * no more entities to touch and next loop is not executed at -+ * all. Otherwise, requeue remaining entities if they are -+ * about to stop receiving service, or reposition them if this -+ * is not the case. 
-+ */ -+ entity = parent; -+ for_each_entity(entity) { -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ /* -+ * Invoke __bfq_requeue_entity on entity, even if -+ * already active, to requeue/reposition it in the -+ * active tree (because sd->next_in_service has -+ * changed) -+ */ -+ __bfq_requeue_entity(entity); -+ -+ sd = entity->sched_data; -+ BUG_ON(expiration && sd->in_service_entity != entity); -+ -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "invoking udpdate_next for this queue"); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ else { -+ struct bfq_group *bfqg = -+ container_of(entity, -+ struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "invoking udpdate_next for this entity"); -+ } -+#endif -+ if (!bfq_update_next_in_service(sd, entity, expiration) && -+ !expiration) -+ /* -+ * next_in_service unchanged or not causing -+ * any change in entity->parent->sd, and no -+ * requeueing needed for expiration: stop -+ * here. -+ */ -+ break; -+ } -+} -+ -+/** -+ * bfq_calc_vtime_jump - compute the value to which the vtime should jump, -+ * if needed, to have at least one entity eligible. -+ * @st: the service tree to act upon. -+ * -+ * Assumes that st is not empty. 
-+ */ -+static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st) -+{ -+ struct bfq_entity *root_entity = bfq_root_active_entity(&st->active); -+ -+ if (bfq_gt(root_entity->min_start, st->vtime)) { -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(root_entity); -+ -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "new value %llu", -+ ((root_entity->min_start>>10)*1000)>>12); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ else { -+ struct bfq_group *bfqg = -+ container_of(root_entity, struct bfq_group, -+ entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "new value %llu", -+ ((root_entity->min_start>>10)*1000)>>12); -+ } -+#endif -+ return root_entity->min_start; -+ } -+ return st->vtime; -+} -+ -+static void bfq_update_vtime(struct bfq_service_tree *st, u64 new_value) -+{ -+ if (new_value > st->vtime) { -+ st->vtime = new_value; -+ bfq_forget_idle(st); -+ } -+} -+ -+/** -+ * bfq_first_active_entity - find the eligible entity with -+ * the smallest finish time -+ * @st: the service tree to select from. -+ * @vtime: the system virtual to use as a reference for eligibility -+ * -+ * This function searches the first schedulable entity, starting from the -+ * root of the tree and going on the left every time on this side there is -+ * a subtree with at least one eligible (start >= vtime) entity. The path on -+ * the right is followed only if a) the left subtree contains no eligible -+ * entities and b) no eligible entity has been found yet. 
-+ */ -+static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st, -+ u64 vtime) -+{ -+ struct bfq_entity *entry, *first = NULL; -+ struct rb_node *node = st->active.rb_node; -+ -+ while (node) { -+ entry = rb_entry(node, struct bfq_entity, rb_node); -+left: -+ if (!bfq_gt(entry->start, vtime)) -+ first = entry; -+ -+ BUG_ON(bfq_gt(entry->min_start, vtime)); -+ -+ if (node->rb_left) { -+ entry = rb_entry(node->rb_left, -+ struct bfq_entity, rb_node); -+ if (!bfq_gt(entry->min_start, vtime)) { -+ node = node->rb_left; -+ goto left; -+ } -+ } -+ if (first) -+ break; -+ node = node->rb_right; -+ } -+ -+ BUG_ON(!first && !RB_EMPTY_ROOT(&st->active)); -+ return first; -+} -+ -+/** -+ * __bfq_lookup_next_entity - return the first eligible entity in @st. -+ * @st: the service tree. -+ * -+ * If there is no in-service entity for the sched_data st belongs to, -+ * then return the entity that will be set in service if: -+ * 1) the parent entity this st belongs to is set in service; -+ * 2) no entity belonging to such parent entity undergoes a state change -+ * that would influence the timestamps of the entity (e.g., becomes idle, -+ * becomes backlogged, changes its budget, ...). -+ * -+ * In this first case, update the virtual time in @st too (see the -+ * comments on this update inside the function). -+ * -+ * In constrast, if there is an in-service entity, then return the -+ * entity that would be set in service if not only the above -+ * conditions, but also the next one held true: the currently -+ * in-service entity, on expiration, -+ * 1) gets a finish time equal to the current one, or -+ * 2) is not eligible any more, or -+ * 3) is idle. 
-+ */ -+static struct bfq_entity * -+__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service) -+{ -+ struct bfq_entity *entity; -+ u64 new_vtime; -+ struct bfq_queue *bfqq; -+ -+ if (RB_EMPTY_ROOT(&st->active)) -+ return NULL; -+ -+ /* -+ * Get the value of the system virtual time for which at -+ * least one entity is eligible. -+ */ -+ new_vtime = bfq_calc_vtime_jump(st); -+ -+ /* -+ * If there is no in-service entity for the sched_data this -+ * active tree belongs to, then push the system virtual time -+ * up to the value that guarantees that at least one entity is -+ * eligible. If, instead, there is an in-service entity, then -+ * do not make any such update, because there is already an -+ * eligible entity, namely the in-service one (even if the -+ * entity is not on st, because it was extracted when set in -+ * service). -+ */ -+ if (!in_service) -+ bfq_update_vtime(st, new_vtime); -+ -+ entity = bfq_first_active_entity(st, new_vtime); -+ BUG_ON(bfq_gt(entity->start, new_vtime)); -+ -+ /* Log some information */ -+ bfqq = bfq_entity_to_bfqq(entity); -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "start %llu vtime %llu st %p", -+ ((entity->start>>10)*1000)>>12, -+ ((new_vtime>>10)*1000)>>12, st); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "start %llu vtime %llu (%llu) st %p", -+ ((entity->start>>10)*1000)>>12, -+ ((st->vtime>>10)*1000)>>12, -+ ((new_vtime>>10)*1000)>>12, st); -+ } -+#endif -+ -+ BUG_ON(!entity); -+ -+ return entity; -+} -+ -+/** -+ * bfq_lookup_next_entity - return the first eligible entity in @sd. -+ * @sd: the sched_data. -+ * @expiration: true if we are on the expiration path of the in-service queue -+ * -+ * This function is invoked when there has been a change in the trees -+ * for sd, and we need to know what is the new next entity to serve -+ * after this change. 
-+ */ -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, -+ bool expiration) -+{ -+ struct bfq_service_tree *st = sd->service_tree; -+ struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1); -+ struct bfq_entity *entity = NULL; -+ struct bfq_queue *bfqq; -+ int class_idx = 0; -+ -+ BUG_ON(!sd); -+ BUG_ON(!st); -+ /* -+ * Choose from idle class, if needed to guarantee a minimum -+ * bandwidth to this class (and if there is some active entity -+ * in idle class). This should also mitigate -+ * priority-inversion problems in case a low priority task is -+ * holding file system resources. -+ */ -+ if (time_is_before_jiffies(sd->bfq_class_idle_last_service + -+ BFQ_CL_IDLE_TIMEOUT)) { -+ if (!RB_EMPTY_ROOT(&idle_class_st->active)) -+ class_idx = BFQ_IOPRIO_CLASSES - 1; -+ /* About to be served if backlogged, or not yet backlogged */ -+ sd->bfq_class_idle_last_service = jiffies; -+ } -+ -+ /* -+ * Find the next entity to serve for the highest-priority -+ * class, unless the idle class needs to be served. -+ */ -+ for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) { -+ /* -+ * If expiration is true, then bfq_lookup_next_entity -+ * is being invoked as a part of the expiration path -+ * of the in-service queue. In this case, even if -+ * sd->in_service_entity is not NULL, -+ * sd->in_service_entiy at this point is actually not -+ * in service any more, and, if needed, has already -+ * been properly queued or requeued into the right -+ * tree. The reason why sd->in_service_entity is still -+ * not NULL here, even if expiration is true, is that -+ * sd->in_service_entiy is reset as a last step in the -+ * expiration path. So, if expiration is true, tell -+ * __bfq_lookup_next_entity that there is no -+ * sd->in_service_entity. 
-+ */ -+ entity = __bfq_lookup_next_entity(st + class_idx, -+ sd->in_service_entity && -+ !expiration); -+ -+ if (entity) -+ break; -+ } -+ -+ BUG_ON(!entity && -+ (!RB_EMPTY_ROOT(&st->active) || !RB_EMPTY_ROOT(&(st+1)->active) || -+ !RB_EMPTY_ROOT(&(st+2)->active))); -+ -+ if (!entity) -+ return NULL; -+ -+ /* Log some information */ -+ bfqq = bfq_entity_to_bfqq(entity); -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "chosen from st %p %d", -+ st + class_idx, class_idx); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "chosen from st %p %d", -+ st + class_idx, class_idx); -+ } -+#endif -+ -+ return entity; -+} -+ -+static bool next_queue_may_preempt(struct bfq_data *bfqd) -+{ -+ struct bfq_sched_data *sd = &bfqd->root_group->sched_data; -+ -+ return sd->next_in_service != sd->in_service_entity; -+} -+ -+/* -+ * Get next queue for service. -+ */ -+static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) -+{ -+ struct bfq_entity *entity = NULL; -+ struct bfq_sched_data *sd; -+ struct bfq_queue *bfqq; -+ -+ BUG_ON(bfqd->in_service_queue); -+ -+ if (bfq_tot_busy_queues(bfqd) == 0) -+ return NULL; -+ -+ /* -+ * Traverse the path from the root to the leaf entity to -+ * serve. Set in service all the entities visited along the -+ * way. -+ */ -+ sd = &bfqd->root_group->sched_data; -+ for (; sd ; sd = entity->my_sched_data) { -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ if (entity) { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg(bfqd, bfqg, -+ "lookup in this group"); -+ if (!sd->next_in_service) -+ pr_crit("lookup in this group"); -+ } else { -+ bfq_log_bfqg(bfqd, bfqd->root_group, -+ "lookup in root group"); -+ if (!sd->next_in_service) -+ pr_crit("lookup in root group"); -+ } -+#endif -+ -+ BUG_ON(!sd->next_in_service); -+ -+ /* -+ * WARNING. 
We are about to set the in-service entity -+ * to sd->next_in_service, i.e., to the (cached) value -+ * returned by bfq_lookup_next_entity(sd) the last -+ * time it was invoked, i.e., the last time when the -+ * service order in sd changed as a consequence of the -+ * activation or deactivation of an entity. In this -+ * respect, if we execute bfq_lookup_next_entity(sd) -+ * in this very moment, it may, although with low -+ * probability, yield a different entity than that -+ * pointed to by sd->next_in_service. This rare event -+ * happens in case there was no CLASS_IDLE entity to -+ * serve for sd when bfq_lookup_next_entity(sd) was -+ * invoked for the last time, while there is now one -+ * such entity. -+ * -+ * If the above event happens, then the scheduling of -+ * such entity in CLASS_IDLE is postponed until the -+ * service of the sd->next_in_service entity -+ * finishes. In fact, when the latter is expired, -+ * bfq_lookup_next_entity(sd) gets called again, -+ * exactly to update sd->next_in_service. -+ */ -+ -+ /* Make next_in_service entity become in_service_entity */ -+ entity = sd->next_in_service; -+ sd->in_service_entity = entity; -+ -+ /* -+ * If entity is no longer a candidate for next -+ * service, then it must be extracted from its active -+ * tree, so as to make sure that it won't be -+ * considered when computing next_in_service. See the -+ * comments on the function -+ * bfq_no_longer_next_in_service() for details. -+ */ -+ if (bfq_no_longer_next_in_service(entity)) -+ bfq_active_extract(bfq_entity_service_tree(entity), -+ entity); -+ -+ /* -+ * Even if entity is not to be extracted according to -+ * the above check, a descendant entity may get -+ * extracted in one of the next iterations of this -+ * loop. Such an event could cause a change in -+ * next_in_service for the level of the descendant -+ * entity, and thus possibly back to this level. 
-+ * -+ * However, we cannot perform the resulting needed -+ * update of next_in_service for this level before the -+ * end of the whole loop, because, to know which is -+ * the correct next-to-serve candidate entity for each -+ * level, we need first to find the leaf entity to set -+ * in service. In fact, only after we know which is -+ * the next-to-serve leaf entity, we can discover -+ * whether the parent entity of the leaf entity -+ * becomes the next-to-serve, and so on. -+ */ -+ -+ /* Log some information */ -+ bfqq = bfq_entity_to_bfqq(entity); -+ if (bfqq) -+ bfq_log_bfqq(bfqd, bfqq, -+ "this queue, finish %llu", -+ (((entity->finish>>10)*1000)>>10)>>2); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg(bfqd, bfqg, -+ "this entity, finish %llu", -+ (((entity->finish>>10)*1000)>>10)>>2); -+ } -+#endif -+ -+ } -+ -+ BUG_ON(!entity); -+ bfqq = bfq_entity_to_bfqq(entity); -+ BUG_ON(!bfqq); -+ -+ /* -+ * We can finally update all next-to-serve entities along the -+ * path from the leaf entity just set in service to the root. 
-+ */ -+ for_each_entity(entity) { -+ struct bfq_sched_data *sd = entity->sched_data; -+ -+ if (!bfq_update_next_in_service(sd, NULL, false)) -+ break; -+ } -+ -+ return bfqq; -+} -+ -+static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue; -+ struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity; -+ struct bfq_entity *entity = in_serv_entity; -+ -+#ifndef BFQ_MQ -+ if (bfqd->in_service_bic) { -+ put_io_context(bfqd->in_service_bic->icq.ioc); -+ bfqd->in_service_bic = NULL; -+ } -+#endif -+ -+ bfq_clear_bfqq_wait_request(in_serv_bfqq); -+ hrtimer_try_to_cancel(&bfqd->idle_slice_timer); -+ bfqd->in_service_queue = NULL; -+ -+ /* -+ * When this function is called, all in-service entities have -+ * been properly deactivated or requeued, so we can safely -+ * execute the final step: reset in_service_entity along the -+ * path from entity to the root. -+ */ -+ for_each_entity(entity) -+ entity->sched_data->in_service_entity = NULL; -+ -+ /* -+ * in_serv_entity is no longer in service, so, if it is in no -+ * service tree either, then release the service reference to -+ * the queue it represents (taken with bfq_get_entity). 
-+ */ -+ if (!in_serv_entity->on_st) -+ bfq_put_queue(in_serv_bfqq); -+} -+ -+static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ bool ins_into_idle_tree, bool expiration) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ bfq_deactivate_entity(entity, ins_into_idle_tree, expiration); -+} -+ -+static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity); -+ -+ BUG_ON(bfqq == bfqd->in_service_queue); -+ BUG_ON(entity->tree != &st->active && entity->tree != &st->idle && -+ entity->on_st); -+ -+ bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq), -+ false, false); -+ bfq_clear_bfqq_non_blocking_wait_rq(bfqq); -+} -+ -+static void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ bool expiration) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ bfq_activate_requeue_entity(entity, false, -+ bfqq == bfqd->in_service_queue, expiration); -+} -+ -+static void bfqg_stats_update_dequeue(struct bfq_group *bfqg); -+ -+/* -+ * Called when the bfqq no longer has requests pending, remove it from -+ * the service tree. As a special case, it can be invoked during an -+ * expiration. 
-+ */ -+static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ bool expiration) -+{ -+ BUG_ON(!bfq_bfqq_busy(bfqq)); -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); -+ -+ bfq_log_bfqq(bfqd, bfqq, "del from busy"); -+ -+ bfq_clear_bfqq_busy(bfqq); -+ -+ BUG_ON(bfq_tot_busy_queues(bfqd) == 0); -+ bfqd->busy_queues[bfqq->ioprio_class - 1]--; -+ -+ if (bfqq->wr_coeff > 1) { -+ bfqd->wr_busy_queues--; -+ BUG_ON(bfqd->wr_busy_queues < 0); -+ } -+ -+ bfqg_stats_update_dequeue(bfqq_group(bfqq)); -+ -+ BUG_ON(bfqq->entity.budget < 0); -+ -+ bfq_deactivate_bfqq(bfqd, bfqq, true, expiration); -+ if (!bfqq->dispatched) -+ bfq_weights_tree_remove(bfqd, bfqq); -+} -+ -+/* -+ * Called when an inactive queue receives a new request. -+ */ -+static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ BUG_ON(bfq_bfqq_busy(bfqq)); -+ BUG_ON(bfqq == bfqd->in_service_queue); -+ -+ bfq_log_bfqq(bfqd, bfqq, "add to busy"); -+ -+ bfq_activate_bfqq(bfqd, bfqq); -+ -+ bfq_mark_bfqq_busy(bfqq); -+ bfqd->busy_queues[bfqq->ioprio_class - 1]++; -+ -+ if (!bfqq->dispatched) -+ if (bfqq->wr_coeff == 1) -+ bfq_weights_tree_add(bfqd, bfqq, -+ &bfqd->queue_weights_tree); -+ -+ if (bfqq->wr_coeff > 1) { -+ bfqd->wr_busy_queues++; -+ BUG_ON(bfqd->wr_busy_queues > bfq_tot_busy_queues(bfqd)); -+ } -+ -+} -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -new file mode 100644 -index 000000000000..6da94eef0cf1 ---- /dev/null -+++ b/block/bfq-sq-iosched.c -@@ -0,0 +1,5957 @@ -+/* -+ * Budget Fair Queueing (BFQ) I/O scheduler. 
-+ * -+ * Based on ideas and code from CFQ: -+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> -+ * -+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> -+ * Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org> -+ * -+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ -+ * file. -+ * -+ * BFQ is a proportional-share I/O scheduler, with some extra -+ * low-latency capabilities. BFQ also supports full hierarchical -+ * scheduling through cgroups. Next paragraphs provide an introduction -+ * on BFQ inner workings. Details on BFQ benefits and usage can be -+ * found in Documentation/block/bfq-iosched.txt. -+ * -+ * BFQ is a proportional-share storage-I/O scheduling algorithm based -+ * on the slice-by-slice service scheme of CFQ. But BFQ assigns -+ * budgets, measured in number of sectors, to processes instead of -+ * time slices. The device is not granted to the in-service process -+ * for a given time slice, but until it has exhausted its assigned -+ * budget. This change from the time to the service domain enables BFQ -+ * to distribute the device throughput among processes as desired, -+ * without any distortion due to throughput fluctuations, or to device -+ * internal queueing. BFQ uses an ad hoc internal scheduler, called -+ * B-WF2Q+, to schedule processes according to their budgets. More -+ * precisely, BFQ schedules queues associated with processes. Thanks to -+ * the accurate policy of B-WF2Q+, BFQ can afford to assign high -+ * budgets to I/O-bound processes issuing sequential requests (to -+ * boost the throughput), and yet guarantee a low latency to -+ * interactive and soft real-time applications. 
-+ * -+ * In particular, BFQ schedules I/O so as to achieve the latter goal-- -+ * low latency for interactive and soft real-time applications--if the -+ * low_latency parameter is set (default configuration). To this -+ * purpose, BFQ constantly tries to detect whether the I/O requests in -+ * a bfq_queue come from an interactive or a soft real-time -+ * application. For brevity, in these cases, the queue is said to be -+ * interactive or soft real-time. In both cases, BFQ privileges the -+ * service of the queue, over that of non-interactive and -+ * non-soft-real-time queues. This privileging is performed, mainly, -+ * by raising the weight of the queue. So, for brevity, we call just -+ * weight-raising periods the time periods during which a queue is -+ * privileged, because deemed interactive or soft real-time. -+ * -+ * The detection of soft real-time queues/applications is described in -+ * detail in the comments on the function -+ * bfq_bfqq_softrt_next_start. On the other hand, the detection of an -+ * interactive queue works as follows: a queue is deemed interactive -+ * if it is constantly non empty only for a limited time interval, -+ * after which it does become empty. The queue may be deemed -+ * interactive again (for a limited time), if it restarts being -+ * constantly non empty, provided that this happens only after the -+ * queue has remained empty for a given minimum idle time. -+ * -+ * By default, BFQ computes automatically the above maximum time -+ * interval, i.e., the time interval after which a constantly -+ * non-empty queue stops being deemed interactive. Since a queue is -+ * weight-raised while it is deemed interactive, this maximum time -+ * interval happens to coincide with the (maximum) duration of the -+ * weight-raising for interactive queues. 
-+ * -+ * NOTE: if the main or only goal, with a given device, is to achieve -+ * the maximum-possible throughput at all times, then do switch off -+ * all low-latency heuristics for that device, by setting low_latency -+ * to 0. -+ * -+ * BFQ is described in [1], where also a reference to the initial, -+ * more theoretical paper on BFQ can be found. The interested reader -+ * can find in the latter paper full details on the main algorithm, as -+ * well as formulas of the guarantees and formal proofs of all the -+ * properties. With respect to the version of BFQ presented in these -+ * papers, this implementation adds a few more heuristics, such as the -+ * one that guarantees a low latency to soft real-time applications, -+ * and a hierarchical extension based on H-WF2Q+. -+ * -+ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with -+ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N) -+ * complexity derives from the one introduced with EEVDF in [3]. -+ * -+ * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O -+ * Scheduler", Proceedings of the First Workshop on Mobile System -+ * Technologies (MST-2015), May 2015. -+ * http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf -+ * -+ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf -+ * -+ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing -+ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689, -+ * Oct 1997. -+ * -+ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz -+ * -+ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline -+ * First: A Flexible and Accurate Mechanism for Proportional Share -+ * Resource Allocation,'' technical report. 
-+ * -+ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf -+ */ -+#include <linux/module.h> -+#include <linux/slab.h> -+#include <linux/blkdev.h> -+#include <linux/cgroup.h> -+#include <linux/elevator.h> -+#include <linux/jiffies.h> -+#include <linux/rbtree.h> -+#include <linux/ioprio.h> -+#include "blk.h" -+#include "bfq.h" -+#include "blk-wbt.h" -+ -+/* Expiration time of sync (0) and async (1) requests, in ns. */ -+static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; -+ -+/* Maximum backwards seek, in KiB. */ -+static const int bfq_back_max = (16 * 1024); -+ -+/* Penalty of a backwards seek, in number of sectors. */ -+static const int bfq_back_penalty = 2; -+ -+/* Idling period duration, in ns. */ -+static u32 bfq_slice_idle = (NSEC_PER_SEC / 125); -+ -+/* Minimum number of assigned budgets for which stats are safe to compute. */ -+static const int bfq_stats_min_budgets = 194; -+ -+/* Default maximum budget values, in sectors and number of requests. */ -+static const int bfq_default_max_budget = (16 * 1024); -+ -+/* -+ * When a sync request is dispatched, the queue that contains that -+ * request, and all the ancestor entities of that queue, are charged -+ * with the number of sectors of the request. In contrast, if the -+ * request is async, then the queue and its ancestor entities are -+ * charged with the number of sectors of the request, multiplied by -+ * the factor below. This throttles the bandwidth for async I/O, -+ * w.r.t. sync I/O, and it is done to counter the tendency of async -+ * writes to steal I/O throughput from reads. -+ * -+ * The current value of this parameter is the result of a tuning with -+ * several hardware and software configurations. We tried to find the -+ * lowest value for which writes do not cause noticeable problems to -+ * reads. In fact, the lower this parameter, the stabler I/O control, -+ * in the following respect.
The lower this parameter is, the less -+ * the bandwidth enjoyed by a group decreases -+ * - when the group does writes, w.r.t. when it does reads; -+ * - when other groups do reads, w.r.t. when they do writes. -+ */ -+static const int bfq_async_charge_factor = 3; -+ -+/* Default timeout values, in jiffies, approximating CFQ defaults. */ -+static const int bfq_timeout = (HZ / 8); -+ -+/* -+ * Time limit for merging (see comments in bfq_setup_cooperator). Set -+ * to the slowest value that, in our tests, proved to be effective in -+ * removing false positives, while not causing true positives to miss -+ * queue merging. -+ * -+ * As can be deduced from the low time limit below, queue merging, if -+ * successful, happens at the very beginning of the I/O of the involved -+ * cooperating processes, as a consequence of the arrival of the very -+ * first requests from each cooperator. After that, there is very -+ * little chance to find cooperators. -+ */ -+static const unsigned long bfq_merge_time_limit = HZ/10; -+ -+#define MAX_LENGTH_REASON_NAME 25 -+ -+static const char reason_name[][MAX_LENGTH_REASON_NAME] = {"TOO_IDLE", -+"BUDGET_TIMEOUT", "BUDGET_EXHAUSTED", "NO_MORE_REQUESTS", -+"PREEMPTED"}; -+ -+static struct kmem_cache *bfq_pool; -+ -+/* Below this threshold (in ns), we consider thinktime immediate. */ -+#define BFQ_MIN_TT (2 * NSEC_PER_MSEC) -+ -+/* hw_tag detection: parallel requests threshold and min samples needed.
*/ -+#define BFQ_HW_QUEUE_THRESHOLD 3 -+#define BFQ_HW_QUEUE_SAMPLES 32 -+ -+#define BFQQ_SEEK_THR (sector_t)(8 * 100) -+#define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32) -+#define BFQ_RQ_SEEKY(bfqd, last_pos, rq) \ -+ (get_sdist(last_pos, rq) > \ -+ BFQQ_SEEK_THR && \ -+ (!blk_queue_nonrot(bfqd->queue) || \ -+ blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT)) -+#define BFQQ_CLOSE_THR (sector_t)(8 * 1024) -+#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 19) -+ -+/* Min number of samples required to perform peak-rate update */ -+#define BFQ_RATE_MIN_SAMPLES 32 -+/* Min observation time interval required to perform a peak-rate update (ns) */ -+#define BFQ_RATE_MIN_INTERVAL (300*NSEC_PER_MSEC) -+/* Target observation time interval for a peak-rate update (ns) */ -+#define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC -+ -+/* -+ * Shift used for peak-rate fixed precision calculations. -+ * With -+ * - the current shift: 16 positions -+ * - the current type used to store rate: u32 -+ * - the current unit of measure for rate: [sectors/usec], or, more precisely, -+ * [(sectors/usec) / 2^BFQ_RATE_SHIFT] to take into account the shift, -+ * the range of rates that can be stored is -+ * [1 / 2^BFQ_RATE_SHIFT, 2^(32 - BFQ_RATE_SHIFT)] sectors/usec = -+ * [1 / 2^16, 2^16] sectors/usec = [15e-6, 65536] sectors/usec = -+ * [15, 65G] sectors/sec -+ * Which, assuming a sector size of 512B, corresponds to a range of -+ * [7.5K, 33T] B/sec -+ */ -+#define BFQ_RATE_SHIFT 16 -+ -+/* -+ * When configured for computing the duration of the weight-raising -+ * for interactive queues automatically (see the comments at the -+ * beginning of this file), BFQ does it using the following formula: -+ * duration = (ref_rate / r) * ref_wr_duration, -+ * where r is the peak rate of the device, and ref_rate and -+ * ref_wr_duration are two reference parameters. 
In particular, -+ * ref_rate is the peak rate of the reference storage device (see -+ * below), and ref_wr_duration is about the maximum time needed, with -+ * BFQ and while reading two files in parallel, to load typical large -+ * applications on the reference device (see the comments on -+ * max_service_from_wr below, for more details on how ref_wr_duration -+ * is obtained). In practice, the slower/faster the device at hand -+ * is, the more/less it takes to load applications with respect to the -+ * reference device. Accordingly, the longer/shorter BFQ grants -+ * weight raising to interactive applications. -+ * -+ * BFQ uses two different reference pairs (ref_rate, ref_wr_duration), -+ * depending on whether the device is rotational or non-rotational. -+ * -+ * In the following definitions, ref_rate[0] and ref_wr_duration[0] -+ * are the reference values for a rotational device, whereas -+ * ref_rate[1] and ref_wr_duration[1] are the reference values for a -+ * non-rotational device. The reference rates are not the actual peak -+ * rates of the devices used as a reference, but slightly lower -+ * values. The reason for using slightly lower values is that the -+ * peak-rate estimator tends to yield slightly lower values than the -+ * actual peak rate (it can yield the actual peak rate only if there -+ * is only one process doing I/O, and the process does sequential -+ * I/O). -+ * -+ * The reference peak rates are measured in sectors/usec, left-shifted -+ * by BFQ_RATE_SHIFT. -+ */ -+static int ref_rate[2] = {14000, 33000}; -+/* -+ * To improve readability, a conversion function is used to initialize -+ * the following array, which entails that the array can be -+ * initialized only in a function. -+ */ -+static int ref_wr_duration[2]; -+ -+/* -+ * BFQ uses the above-detailed, time-based weight-raising mechanism to -+ * privilege interactive tasks. 
This mechanism is vulnerable to the -+ * following false positives: I/O-bound applications that will go on -+ * doing I/O for much longer than the duration of weight -+ * raising. These applications have basically no benefit from being -+ * weight-raised at the beginning of their I/O. On the opposite end, -+ * while being weight-raised, these applications -+ * a) unjustly steal throughput from applications that may actually need -+ * low latency; -+ * b) make BFQ uselessly perform device idling; device idling results -+ * in loss of device throughput with most flash-based storage, and may -+ * increase latencies when used purposelessly. -+ * -+ * BFQ tries to reduce these problems, by adopting the following -+ * countermeasure. To introduce this countermeasure, we need first to -+ * finish explaining how the duration of weight-raising for -+ * interactive tasks is computed. -+ * -+ * For a bfq_queue deemed as interactive, the duration of weight -+ * raising is dynamically adjusted, as a function of the estimated -+ * peak rate of the device, so as to be equal to the time needed to -+ * execute the 'largest' interactive task we benchmarked so far. By -+ * largest task, we mean the task for which each involved process has -+ * to do more I/O than for any of the other tasks we benchmarked. This -+ * reference interactive task is the start-up of LibreOffice Writer, -+ * and in this task each process/bfq_queue needs to have at most ~110K -+ * sectors transferred. -+ * -+ * This last piece of information enables BFQ to reduce the actual -+ * duration of weight-raising for at least one class of I/O-bound -+ * applications: those doing sequential or quasi-sequential I/O. An -+ * example is file copy.
In fact, once started, the main I/O-bound -+ * processes of these applications usually consume the above 110K -+ * sectors in much less time than the processes of an application that -+ * is starting, because these I/O-bound processes will greedily devote -+ * almost all their CPU cycles only to their target, -+ * throughput-friendly I/O operations. This is even more true if BFQ -+ * happens to be underestimating the device peak rate, and thus -+ * overestimating the duration of weight raising. But, according to -+ * our measurements, once transferred 110K sectors, these processes -+ * have no right to be weight-raised any longer. -+ * -+ * Basing on the last consideration, BFQ ends weight-raising for a -+ * bfq_queue if the latter happens to have received an amount of -+ * service at least equal to the following constant. The constant is -+ * set to slightly more than 110K, to have a minimum safety margin. -+ * -+ * This early ending of weight-raising reduces the amount of time -+ * during which interactive false positives cause the two problems -+ * described at the beginning of these comments. -+ */ -+static const unsigned long max_service_from_wr = 120000; -+ -+#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ -+ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) -+ -+#define RQ_BIC(rq) icq_to_bic((rq)->elv.priv[0]) -+#define RQ_BFQQ(rq) ((rq)->elv.priv[1]) -+ -+static void bfq_schedule_dispatch(struct bfq_data *bfqd); -+ -+#include "bfq-ioc.c" -+#include "bfq-sched.c" -+#include "bfq-cgroup-included.c" -+ -+#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE) -+#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT) -+ -+#define bfq_sample_valid(samples) ((samples) > 80) -+ -+/* -+ * Scheduler run of queue, if there are requests pending and no one in the -+ * driver that will restart queueing. 
-+ */ -+static void bfq_schedule_dispatch(struct bfq_data *bfqd) -+{ -+ if (bfqd->queued != 0) { -+ bfq_log(bfqd, ""); -+ kblockd_schedule_work(&bfqd->unplug_work); -+ } -+} -+ -+/* -+ * Lifted from AS - choose which of rq1 and rq2 is best served now. -+ * We choose the request that is closer to the head right now. Distance -+ * behind the head is penalized and only allowed to a certain extent. -+ */ -+static struct request *bfq_choose_req(struct bfq_data *bfqd, -+ struct request *rq1, -+ struct request *rq2, -+ sector_t last) -+{ -+ sector_t s1, s2, d1 = 0, d2 = 0; -+ unsigned long back_max; -+#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */ -+#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */ -+ unsigned int wrap = 0; /* bit mask: requests behind the disk head? */ -+ -+ if (!rq1 || rq1 == rq2) -+ return rq2; -+ if (!rq2) -+ return rq1; -+ -+ if (rq_is_sync(rq1) && !rq_is_sync(rq2)) -+ return rq1; -+ else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) -+ return rq2; -+ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) -+ return rq1; -+ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META)) -+ return rq2; -+ -+ s1 = blk_rq_pos(rq1); -+ s2 = blk_rq_pos(rq2); -+ -+ /* -+ * By definition, 1KiB is 2 sectors. -+ */ -+ back_max = bfqd->bfq_back_max * 2; -+ -+ /* -+ * Strict one way elevator _except_ in the case where we allow -+ * short backward seeks which are biased as twice the cost of a -+ * similar forward seek. -+ */ -+ if (s1 >= last) -+ d1 = s1 - last; -+ else if (s1 + back_max >= last) -+ d1 = (last - s1) * bfqd->bfq_back_penalty; -+ else -+ wrap |= BFQ_RQ1_WRAP; -+ -+ if (s2 >= last) -+ d2 = s2 - last; -+ else if (s2 + back_max >= last) -+ d2 = (last - s2) * bfqd->bfq_back_penalty; -+ else -+ wrap |= BFQ_RQ2_WRAP; -+ -+ /* Found required data */ -+ -+ /* -+ * By doing switch() on the bit mask "wrap" we avoid having to -+ * check two variables for all permutations: --> faster!
-+ */ -+ switch (wrap) { -+ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ -+ if (d1 < d2) -+ return rq1; -+ else if (d2 < d1) -+ return rq2; -+ -+ if (s1 >= s2) -+ return rq1; -+ else -+ return rq2; -+ -+ case BFQ_RQ2_WRAP: -+ return rq1; -+ case BFQ_RQ1_WRAP: -+ return rq2; -+ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */ -+ default: -+ /* -+ * Since both rqs are wrapped, -+ * start with the one that's further behind head -+ * (--> only *one* back seek required), -+ * since back seek takes more time than forward. -+ */ -+ if (s1 <= s2) -+ return rq1; -+ else -+ return rq2; -+ } -+} -+ -+static struct bfq_queue * -+bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, -+ sector_t sector, struct rb_node **ret_parent, -+ struct rb_node ***rb_link) -+{ -+ struct rb_node **p, *parent; -+ struct bfq_queue *bfqq = NULL; -+ -+ parent = NULL; -+ p = &root->rb_node; -+ while (*p) { -+ struct rb_node **n; -+ -+ parent = *p; -+ bfqq = rb_entry(parent, struct bfq_queue, pos_node); -+ -+ /* -+ * Sort strictly based on sector. Smallest to the left, -+ * largest to the right. -+ */ -+ if (sector > blk_rq_pos(bfqq->next_rq)) -+ n = &(*p)->rb_right; -+ else if (sector < blk_rq_pos(bfqq->next_rq)) -+ n = &(*p)->rb_left; -+ else -+ break; -+ p = n; -+ bfqq = NULL; -+ } -+ -+ *ret_parent = parent; -+ if (rb_link) -+ *rb_link = p; -+ -+ bfq_log(bfqd, "%llu: returning %d", -+ (unsigned long long) sector, -+ bfqq ? 
bfqq->pid : 0); -+ -+ return bfqq; -+} -+ -+static bool bfq_too_late_for_merging(struct bfq_queue *bfqq) -+{ -+ return bfqq->service_from_backlogged > 0 && -+ time_is_before_jiffies(bfqq->first_IO_time + -+ bfq_merge_time_limit); -+} -+ -+static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct rb_node **p, *parent; -+ struct bfq_queue *__bfqq; -+ -+ if (bfqq->pos_root) { -+ rb_erase(&bfqq->pos_node, bfqq->pos_root); -+ bfqq->pos_root = NULL; -+ } -+ -+ /* -+ * bfqq cannot be merged any longer (see comments in -+ * bfq_setup_cooperator): no point in adding bfqq into the -+ * position tree. -+ */ -+ if (bfq_too_late_for_merging(bfqq)) -+ return; -+ -+ if (bfq_class_idle(bfqq)) -+ return; -+ if (!bfqq->next_rq) -+ return; -+ -+ bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, -+ blk_rq_pos(bfqq->next_rq), &parent, &p); -+ if (!__bfqq) { -+ rb_link_node(&bfqq->pos_node, parent, p); -+ rb_insert_color(&bfqq->pos_node, bfqq->pos_root); -+ } else -+ bfqq->pos_root = NULL; -+} -+ -+/* -+ * The following function returns true if every queue must receive the -+ * same share of the throughput (this condition is used when deciding -+ * whether idling may be disabled, see the comments in the function -+ * bfq_better_to_idle()). -+ * -+ * Such a scenario occurs when: -+ * 1) all active queues have the same weight, -+ * 2) all active queues belong to the same I/O-priority class, -+ * 3) all active groups at the same level in the groups tree have the same -+ * weight, -+ * 4) all active groups at the same level in the groups tree have the same -+ * number of children. -+ * -+ * Unfortunately, keeping the necessary state for evaluating exactly -+ * the last two symmetry sub-conditions above would be quite complex -+ * and time consuming. 
Therefore this function evaluates, instead, -+ * only the following stronger three sub-conditions, for which it is -+ * much easier to maintain the needed state: -+ * 1) all active queues have the same weight, -+ * 2) all active queues belong to the same I/O-priority class, -+ * 3) there are no active groups. -+ * In particular, the last condition is always true if hierarchical -+ * support or the cgroups interface are not enabled, thus no state -+ * needs to be maintained in this case. -+ */ -+static bool bfq_symmetric_scenario(struct bfq_data *bfqd) -+{ -+ /* -+ * For queue weights to differ, queue_weights_tree must contain -+ * at least two nodes. -+ */ -+ bool varied_queue_weights = !RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && -+ (bfqd->queue_weights_tree.rb_node->rb_left || -+ bfqd->queue_weights_tree.rb_node->rb_right); -+ -+ bool multiple_classes_busy = -+ (bfqd->busy_queues[0] && bfqd->busy_queues[1]) || -+ (bfqd->busy_queues[0] && bfqd->busy_queues[2]) || -+ (bfqd->busy_queues[1] && bfqd->busy_queues[2]); -+ -+ bfq_log(bfqd, "varied_queue_weights %d mul_classes %d", -+ varied_queue_weights, multiple_classes_busy); -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ bfq_log(bfqd, "num_groups_with_pending_reqs %u", -+ bfqd->num_groups_with_pending_reqs); -+#endif -+ -+ return !(varied_queue_weights || multiple_classes_busy -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ || bfqd->num_groups_with_pending_reqs > 0 -+#endif -+ ); -+} -+ -+/* -+ * If the weight-counter tree passed as input contains no counter for -+ * the weight of the input queue, then add that counter; otherwise just -+ * increment the existing counter. -+ * -+ * Note that weight-counter trees contain few nodes in mostly symmetric -+ * scenarios. For example, if all queues have the same weight, then the -+ * weight-counter tree for the queues may contain at most one node. -+ * This holds even if low_latency is on, because weight-raised queues -+ * are not inserted in the tree. 
-+ * In most scenarios, the rate at which nodes are created/destroyed -+ * should be low too. -+ */ -+static void bfq_weights_tree_add(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ struct rb_root *root) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ struct rb_node **new = &(root->rb_node), *parent = NULL; -+ -+ /* -+ * Do not insert if the queue is already associated with a -+ * counter, which happens if: -+ * 1) a request arrival has caused the queue to become both -+ * non-weight-raised, and hence change its weight, and -+ * backlogged; in this respect, each of the two events -+ * causes an invocation of this function, -+ * 2) this is the invocation of this function caused by the -+ * second event. This second invocation is actually useless, -+ * and we handle this fact by exiting immediately. More -+ * efficient or clearer solutions might possibly be adopted. -+ */ -+ if (bfqq->weight_counter) -+ return; -+ -+ while (*new) { -+ struct bfq_weight_counter *__counter = container_of(*new, -+ struct bfq_weight_counter, -+ weights_node); -+ parent = *new; -+ -+ if (entity->weight == __counter->weight) { -+ bfqq->weight_counter = __counter; -+ goto inc_counter; -+ } -+ if (entity->weight < __counter->weight) -+ new = &((*new)->rb_left); -+ else -+ new = &((*new)->rb_right); -+ } -+ -+ bfqq->weight_counter = kzalloc(sizeof(struct bfq_weight_counter), -+ GFP_ATOMIC); -+ -+ /* -+ * In the unlucky event of an allocation failure, we just -+ * exit. This will cause the weight of queue to not be -+ * considered in bfq_symmetric_scenario, which, in its turn, -+ * causes the scenario to be deemed wrongly symmetric in case -+ * bfqq's weight would have been the only weight making the -+ * scenario asymmetric. On the bright side, no unbalance will -+ * however occur when bfqq becomes inactive again (the -+ * invocation of this function is triggered by an activation -+ * of queue). In fact, bfq_weights_tree_remove does nothing -+ * if !bfqq->weight_counter. 
-+ */ -+ if (unlikely(!bfqq->weight_counter)) -+ return; -+ -+ bfqq->weight_counter->weight = entity->weight; -+ rb_link_node(&bfqq->weight_counter->weights_node, parent, new); -+ rb_insert_color(&bfqq->weight_counter->weights_node, root); -+ -+inc_counter: -+ bfqq->weight_counter->num_active++; -+ bfqq->ref++; -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "refs %d weight %d symmetric %d", -+ bfqq->ref, -+ entity->weight, -+ bfq_symmetric_scenario(bfqd)); -+} -+ -+/* -+ * Decrement the weight counter associated with the queue, and, if the -+ * counter reaches 0, remove the counter from the tree. -+ * See the comments to the function bfq_weights_tree_add() for considerations -+ * about overhead. -+ */ -+static void __bfq_weights_tree_remove(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ struct rb_root *root) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ if (!bfqq->weight_counter) -+ return; -+ -+ BUG_ON(RB_EMPTY_ROOT(root)); -+ BUG_ON(bfqq->weight_counter->weight != entity->weight); -+ -+ BUG_ON(!bfqq->weight_counter->num_active); -+ bfqq->weight_counter->num_active--; -+ -+ if (bfqq->weight_counter->num_active > 0) -+ goto reset_entity_pointer; -+ -+ rb_erase(&bfqq->weight_counter->weights_node, root); -+ kfree(bfqq->weight_counter); -+ -+reset_entity_pointer: -+ bfqq->weight_counter = NULL; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "refs %d weight %d symmetric %d", -+ bfqq->ref, -+ entity->weight, -+ bfq_symmetric_scenario(bfqd)); -+ bfq_put_queue(bfqq); -+} -+ -+/* -+ * Invoke __bfq_weights_tree_remove on bfqq and decrement the number -+ * of active groups for each queue's inactive parent entity. -+ */ -+static void bfq_weights_tree_remove(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = bfqq->entity.parent; -+ -+ for_each_entity(entity) { -+ struct bfq_sched_data *sd = entity->my_sched_data; -+ -+ BUG_ON(entity->sched_data == NULL); /* -+ * It would mean -+ * that this is -+ * the root group. 
-+ */ -+ -+ if (sd->next_in_service || sd->in_service_entity) { -+ BUG_ON(!entity->in_groups_with_pending_reqs); -+ /* -+ * entity is still active, because either -+ * next_in_service or in_service_entity is not -+ * NULL (see the comments on the definition of -+ * next_in_service for details on why -+ * in_service_entity must be checked too). -+ * -+ * As a consequence, its parent entities are -+ * active as well, and thus this loop must -+ * stop here. -+ */ -+ break; -+ } -+ -+ BUG_ON(!bfqd->num_groups_with_pending_reqs && -+ entity->in_groups_with_pending_reqs); -+ /* -+ * The decrement of num_groups_with_pending_reqs is -+ * not performed immediately upon the deactivation of -+ * entity, but it is delayed to when it also happens -+ * that the first leaf descendant bfqq of entity gets -+ * all its pending requests completed. The following -+ * instructions perform this delayed decrement, if -+ * needed. See the comments on -+ * num_groups_with_pending_reqs for details. -+ */ -+ if (entity->in_groups_with_pending_reqs) { -+ entity->in_groups_with_pending_reqs = false; -+ bfqd->num_groups_with_pending_reqs--; -+ } -+ bfq_log_bfqq(bfqd, bfqq, "num_groups_with_pending_reqs %u", -+ bfqd->num_groups_with_pending_reqs); -+ } -+ -+ /* -+ * Next function is invoked last, because it causes bfqq to be -+ * freed if the following holds: bfqq is not in service and -+ * has no dispatched request. DO NOT use bfqq after the next -+ * function invocation. -+ */ -+ __bfq_weights_tree_remove(bfqd, bfqq, -+ &bfqd->queue_weights_tree); -+} -+ -+/* -+ * Return expired entry, or NULL to just start from scratch in rbtree. 
-+ */ -+static struct request *bfq_check_fifo(struct bfq_queue *bfqq, -+ struct request *last) -+{ -+ struct request *rq; -+ -+ if (bfq_bfqq_fifo_expire(bfqq)) -+ return NULL; -+ -+ bfq_mark_bfqq_fifo_expire(bfqq); -+ -+ rq = rq_entry_fifo(bfqq->fifo.next); -+ -+ if (rq == last || ktime_get_ns() < rq->fifo_time) -+ return NULL; -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "returned %p", rq); -+ BUG_ON(RB_EMPTY_NODE(&rq->rb_node)); -+ return rq; -+} -+ -+static struct request *bfq_find_next_rq(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ struct request *last) -+{ -+ struct rb_node *rbnext = rb_next(&last->rb_node); -+ struct rb_node *rbprev = rb_prev(&last->rb_node); -+ struct request *next, *prev = NULL; -+ -+ BUG_ON(list_empty(&bfqq->fifo)); -+ -+ /* Follow expired path, else get first next available. */ -+ next = bfq_check_fifo(bfqq, last); -+ if (next) { -+ BUG_ON(next == last); -+ return next; -+ } -+ -+ BUG_ON(RB_EMPTY_NODE(&last->rb_node)); -+ -+ if (rbprev) -+ prev = rb_entry_rq(rbprev); -+ -+ if (rbnext) -+ next = rb_entry_rq(rbnext); -+ else { -+ rbnext = rb_first(&bfqq->sort_list); -+ if (rbnext && rbnext != &last->rb_node) -+ next = rb_entry_rq(rbnext); -+ } -+ -+ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last)); -+} -+ -+/* see the definition of bfq_async_charge_factor for details */ -+static unsigned long bfq_serv_to_charge(struct request *rq, -+ struct bfq_queue *bfqq) -+{ -+ if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1 || -+ !bfq_symmetric_scenario(bfqq->bfqd)) -+ return blk_rq_sectors(rq); -+ -+ return blk_rq_sectors(rq) * bfq_async_charge_factor; -+} -+ -+/** -+ * bfq_updated_next_req - update the queue after a new next_rq selection. -+ * @bfqd: the device data the queue belongs to. -+ * @bfqq: the queue to update. -+ * -+ * If the first request of a queue changes we make sure that the queue -+ * has enough budget to serve at least its first request (if the -+ * request has grown). 
We do this because if the queue has not enough -+ * budget for its first request, it has to go through two dispatch -+ * rounds to actually get it dispatched. -+ */ -+static void bfq_updated_next_req(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity); -+ struct request *next_rq = bfqq->next_rq; -+ unsigned long new_budget; -+ -+ if (!next_rq) -+ return; -+ -+ if (bfqq == bfqd->in_service_queue) -+ /* -+ * In order not to break guarantees, budgets cannot be -+ * changed after an entity has been selected. -+ */ -+ return; -+ -+ BUG_ON(entity->tree != &st->active); -+ BUG_ON(entity == entity->sched_data->in_service_entity); -+ -+ new_budget = max_t(unsigned long, -+ max_t(unsigned long, bfqq->max_budget, -+ bfq_serv_to_charge(next_rq, bfqq)), -+ entity->service); -+ if (entity->budget != new_budget) { -+ entity->budget = new_budget; -+ bfq_log_bfqq(bfqd, bfqq, "new budget %lu", -+ new_budget); -+ bfq_requeue_bfqq(bfqd, bfqq, false); -+ } -+} -+ -+static unsigned int bfq_wr_duration(struct bfq_data *bfqd) -+{ -+ u64 dur; -+ -+ if (bfqd->bfq_wr_max_time > 0) -+ return bfqd->bfq_wr_max_time; -+ -+ dur = bfqd->rate_dur_prod; -+ do_div(dur, bfqd->peak_rate); -+ -+ /* -+ * Limit duration between 3 and 25 seconds. The upper limit -+ * has been conservatively set after the following worst case: -+ * on a QEMU/KVM virtual machine -+ * - running in a slow PC -+ * - with a virtual disk stacked on a slow low-end 5400rpm HDD -+ * - serving a heavy I/O workload, such as the sequential reading -+ * of several files -+ * mplayer took 23 seconds to start, if constantly weight-raised. 
-+ * -+ * As for higher values than that accomodating the above bad -+ * scenario, tests show that higher values would often yield -+ * the opposite of the desired result, i.e., would worsen -+ * responsiveness by allowing non-interactive applications to -+ * preserve weight raising for too long. -+ * -+ * On the other end, lower values than 3 seconds make it -+ * difficult for most interactive tasks to complete their jobs -+ * before weight-raising finishes. -+ */ -+ return clamp_val(dur, msecs_to_jiffies(3000), msecs_to_jiffies(25000)); -+} -+ -+/* switch back from soft real-time to interactive weight raising */ -+static void switch_back_to_interactive_wr(struct bfq_queue *bfqq, -+ struct bfq_data *bfqd) -+{ -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ bfqq->last_wr_start_finish = bfqq->wr_start_at_switch_to_srt; -+} -+ -+static void -+bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, -+ struct bfq_io_cq *bic, bool bfq_already_existing) -+{ -+ unsigned int old_wr_coeff; -+ bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq); -+ -+ if (bic->saved_has_short_ttime) -+ bfq_mark_bfqq_has_short_ttime(bfqq); -+ else -+ bfq_clear_bfqq_has_short_ttime(bfqq); -+ -+ if (bic->saved_IO_bound) -+ bfq_mark_bfqq_IO_bound(bfqq); -+ else -+ bfq_clear_bfqq_IO_bound(bfqq); -+ -+ if (unlikely(busy)) -+ old_wr_coeff = bfqq->wr_coeff; -+ -+ bfqq->wr_coeff = bic->saved_wr_coeff; -+ bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; -+ BUG_ON(time_is_after_jiffies(bfqq->wr_start_at_switch_to_srt)); -+ bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish; -+ bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time; -+ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "bic %p wr_coeff %d start_finish %lu max_time %lu", -+ bic, bfqq->wr_coeff, bfqq->last_wr_start_finish, -+ bfqq->wr_cur_max_time); -+ -+ if (bfqq->wr_coeff > 1 && 
(bfq_bfqq_in_large_burst(bfqq) || -+ time_is_before_jiffies(bfqq->last_wr_start_finish + -+ bfqq->wr_cur_max_time))) { -+ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && -+ !bfq_bfqq_in_large_burst(bfqq) && -+ time_is_after_eq_jiffies(bfqq->wr_start_at_switch_to_srt + -+ bfq_wr_duration(bfqd))) { -+ switch_back_to_interactive_wr(bfqq, bfqd); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "switching back to interactive"); -+ } else { -+ bfqq->wr_coeff = 1; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "switching off wr (%lu + %lu < %lu)", -+ bfqq->last_wr_start_finish, bfqq->wr_cur_max_time, -+ jiffies); -+ } -+ } -+ -+ /* make sure weight will be updated, however we got here */ -+ bfqq->entity.prio_changed = 1; -+ -+ if (likely(!busy)) -+ return; -+ -+ if (old_wr_coeff == 1 && bfqq->wr_coeff > 1) { -+ bfqd->wr_busy_queues++; -+ BUG_ON(bfqd->wr_busy_queues > bfq_tot_busy_queues(bfqd)); -+ } else if (old_wr_coeff > 1 && bfqq->wr_coeff == 1) { -+ bfqd->wr_busy_queues--; -+ BUG_ON(bfqd->wr_busy_queues < 0); -+ } -+} -+ -+static int bfqq_process_refs(struct bfq_queue *bfqq) -+{ -+ int process_refs, io_refs; -+ -+ lockdep_assert_held(bfqq->bfqd->queue->queue_lock); -+ -+ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; -+ process_refs = bfqq->ref - io_refs - bfqq->entity.on_st - -+ (bfqq->weight_counter != NULL); -+ BUG_ON(process_refs < 0); -+ return process_refs; -+} -+ -+/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */ -+static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct bfq_queue *item; -+ struct hlist_node *n; -+ -+ hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node) -+ hlist_del_init(&item->burst_list_node); -+ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); -+ bfqd->burst_size = 1; -+ bfqd->burst_parent_entity = bfqq->entity.parent; -+} -+ -+/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */ -+static void bfq_add_to_burst(struct bfq_data 
*bfqd, struct bfq_queue *bfqq) -+{ -+ /* Increment burst size to take into account also bfqq */ -+ bfqd->burst_size++; -+ -+ bfq_log_bfqq(bfqd, bfqq, "%d", bfqd->burst_size); -+ -+ BUG_ON(bfqd->burst_size > bfqd->bfq_large_burst_thresh); -+ -+ if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) { -+ struct bfq_queue *pos, *bfqq_item; -+ struct hlist_node *n; -+ -+ /* -+ * Enough queues have been activated shortly after each -+ * other to consider this burst as large. -+ */ -+ bfqd->large_burst = true; -+ bfq_log_bfqq(bfqd, bfqq, "large burst started"); -+ -+ /* -+ * We can now mark all queues in the burst list as -+ * belonging to a large burst. -+ */ -+ hlist_for_each_entry(bfqq_item, &bfqd->burst_list, -+ burst_list_node) { -+ bfq_mark_bfqq_in_large_burst(bfqq_item); -+ bfq_log_bfqq(bfqd, bfqq_item, "marked in large burst"); -+ } -+ bfq_mark_bfqq_in_large_burst(bfqq); -+ bfq_log_bfqq(bfqd, bfqq, "marked in large burst"); -+ -+ /* -+ * From now on, and until the current burst finishes, any -+ * new queue being activated shortly after the last queue -+ * was inserted in the burst can be immediately marked as -+ * belonging to a large burst. So the burst list is not -+ * needed any more. Remove it. -+ */ -+ hlist_for_each_entry_safe(pos, n, &bfqd->burst_list, -+ burst_list_node) -+ hlist_del_init(&pos->burst_list_node); -+ } else /* -+ * Burst not yet large: add bfqq to the burst list. Do -+ * not increment the ref counter for bfqq, because bfqq -+ * is removed from the burst list before freeing bfqq -+ * in put_queue. -+ */ -+ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); -+} -+ -+/* -+ * If many queues belonging to the same group happen to be created -+ * shortly after each other, then the processes associated with these -+ * queues have typically a common goal. In particular, bursts of queue -+ * creations are usually caused by services or applications that spawn -+ * many parallel threads/processes. 
Examples are systemd during boot, -+ * or git grep. To help these processes get their job done as soon as -+ * possible, it is usually better to not grant either weight-raising -+ * or device idling to their queues. -+ * -+ * In this comment we describe, firstly, the reasons why this fact -+ * holds, and, secondly, the next function, which implements the main -+ * steps needed to properly mark these queues so that they can then be -+ * treated in a different way. -+ * -+ * The above services or applications benefit mostly from a high -+ * throughput: the quicker the requests of the activated queues are -+ * cumulatively served, the sooner the target job of these queues gets -+ * completed. As a consequence, weight-raising any of these queues, -+ * which also implies idling the device for it, is almost always -+ * counterproductive. In most cases it just lowers throughput. -+ * -+ * On the other hand, a burst of queue creations may be caused also by -+ * the start of an application that does not consist of a lot of -+ * parallel I/O-bound threads. In fact, with a complex application, -+ * several short processes may need to be executed to start-up the -+ * application. In this respect, to start an application as quickly as -+ * possible, the best thing to do is in any case to privilege the I/O -+ * related to the application with respect to all other -+ * I/O. Therefore, the best strategy to start as quickly as possible -+ * an application that causes a burst of queue creations is to -+ * weight-raise all the queues created during the burst. This is the -+ * exact opposite of the best strategy for the other type of bursts. -+ * -+ * In the end, to take the best action for each of the two cases, the -+ * two types of bursts need to be distinguished. Fortunately, this -+ * seems relatively easy, by looking at the sizes of the bursts. 
In -+ * particular, we found a threshold such that only bursts with a -+ * larger size than that threshold are apparently caused by -+ * services or commands such as systemd or git grep. For brevity, -+ * hereafter we call just 'large' these bursts. BFQ *does not* -+ * weight-raise queues whose creation occurs in a large burst. In -+ * addition, for each of these queues BFQ performs or does not perform -+ * idling depending on which choice boosts the throughput more. The -+ * exact choice depends on the device and request pattern at -+ * hand. -+ * -+ * Unfortunately, false positives may occur while an interactive task -+ * is starting (e.g., an application is being started). The -+ * consequence is that the queues associated with the task do not -+ * enjoy weight raising as expected. Fortunately these false positives -+ * are very rare. They typically occur if some service happens to -+ * start doing I/O exactly when the interactive task starts. -+ * -+ * Turning back to the next function, it implements all the steps -+ * needed to detect the occurrence of a large burst and to properly -+ * mark all the queues belonging to it (so that they can then be -+ * treated in a different way). This goal is achieved by maintaining a -+ * "burst list" that holds, temporarily, the queues that belong to the -+ * burst in progress. The list is then used to mark these queues as -+ * belonging to a large burst if the burst does become large. The main -+ * steps are the following. -+ * -+ * . when the very first queue is created, the queue is inserted into the -+ * list (as it could be the first queue in a possible burst) -+ * -+ * . if the current burst has not yet become large, and a queue Q that does -+ * not yet belong to the burst is activated shortly after the last time -+ * at which a new queue entered the burst list, then the function appends -+ * Q to the burst list -+ * -+ * . 
if, as a consequence of the previous step, the burst size reaches -+ * the large-burst threshold, then -+ * -+ * . all the queues in the burst list are marked as belonging to a -+ * large burst -+ * -+ * . the burst list is deleted; in fact, the burst list already served -+ * its purpose (keeping temporarily track of the queues in a burst, -+ * so as to be able to mark them as belonging to a large burst in the -+ * previous sub-step), and now is not needed any more -+ * -+ * . the device enters a large-burst mode -+ * -+ * . if a queue Q that does not belong to the burst is created while -+ * the device is in large-burst mode and shortly after the last time -+ * at which a queue either entered the burst list or was marked as -+ * belonging to the current large burst, then Q is immediately marked -+ * as belonging to a large burst. -+ * -+ * . if a queue Q that does not belong to the burst is created a while -+ * later, i.e., not shortly after, than the last time at which a queue -+ * either entered the burst list or was marked as belonging to the -+ * current large burst, then the current burst is deemed as finished and: -+ * -+ * . the large-burst mode is reset if set -+ * -+ * . the burst list is emptied -+ * -+ * . Q is inserted in the burst list, as Q may be the first queue -+ * in a possible new burst (then the burst list contains just Q -+ * after this step). -+ */ -+static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ /* -+ * If bfqq is already in the burst list or is part of a large -+ * burst, or finally has just been split, then there is -+ * nothing else to do. 
-+ */ -+ if (!hlist_unhashed(&bfqq->burst_list_node) || -+ bfq_bfqq_in_large_burst(bfqq) || -+ time_is_after_eq_jiffies(bfqq->split_time + -+ msecs_to_jiffies(10))) -+ return; -+ -+ /* -+ * If bfqq's creation happens late enough, or bfqq belongs to -+ * a different group than the burst group, then the current -+ * burst is finished, and related data structures must be -+ * reset. -+ * -+ * In this respect, consider the special case where bfqq is -+ * the very first queue created after BFQ is selected for this -+ * device. In this case, last_ins_in_burst and -+ * burst_parent_entity are not yet significant when we get -+ * here. But it is easy to verify that, whether or not the -+ * following condition is true, bfqq will end up being -+ * inserted into the burst list. In particular the list will -+ * happen to contain only bfqq. And this is exactly what has -+ * to happen, as bfqq may be the first queue of the first -+ * burst. -+ */ -+ if (time_is_before_jiffies(bfqd->last_ins_in_burst + -+ bfqd->bfq_burst_interval) || -+ bfqq->entity.parent != bfqd->burst_parent_entity) { -+ bfqd->large_burst = false; -+ bfq_reset_burst_list(bfqd, bfqq); -+ bfq_log_bfqq(bfqd, bfqq, -+ "late activation or different group"); -+ goto end; -+ } -+ -+ /* -+ * If we get here, then bfqq is being activated shortly after the -+ * last queue. So, if the current burst is also large, we can mark -+ * bfqq as belonging to this large burst immediately. -+ */ -+ if (bfqd->large_burst) { -+ bfq_log_bfqq(bfqd, bfqq, "marked in burst"); -+ bfq_mark_bfqq_in_large_burst(bfqq); -+ goto end; -+ } -+ -+ /* -+ * If we get here, then a large-burst state has not yet been -+ * reached, but bfqq is being activated shortly after the last -+ * queue. Then we add bfqq to the burst. -+ */ -+ bfq_add_to_burst(bfqd, bfqq); -+end: -+ /* -+ * At this point, bfqq either has been added to the current -+ * burst or has caused the current burst to terminate and a -+ * possible new burst to start. 
In particular, in the second -+ * case, bfqq has become the first queue in the possible new -+ * burst. In both cases last_ins_in_burst needs to be moved -+ * forward. -+ */ -+ bfqd->last_ins_in_burst = jiffies; -+ -+} -+ -+static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ if (entity->budget < entity->service) { -+ pr_crit("budget %d service %d\n", -+ entity->budget, entity->service); -+ BUG(); -+ } -+ return entity->budget - entity->service; -+} -+ -+/* -+ * If enough samples have been computed, return the current max budget -+ * stored in bfqd, which is dynamically updated according to the -+ * estimated disk peak rate; otherwise return the default max budget -+ */ -+static int bfq_max_budget(struct bfq_data *bfqd) -+{ -+ if (bfqd->budgets_assigned < bfq_stats_min_budgets) -+ return bfq_default_max_budget; -+ else -+ return bfqd->bfq_max_budget; -+} -+ -+/* -+ * Return min budget, which is a fraction of the current or default -+ * max budget (trying with 1/32) -+ */ -+static int bfq_min_budget(struct bfq_data *bfqd) -+{ -+ if (bfqd->budgets_assigned < bfq_stats_min_budgets) -+ return bfq_default_max_budget / 32; -+ else -+ return bfqd->bfq_max_budget / 32; -+} -+ -+static void bfq_bfqq_expire(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ bool compensate, -+ enum bfqq_expiration reason); -+ -+/* -+ * The next function, invoked after the input queue bfqq switches from -+ * idle to busy, updates the budget of bfqq. The function also tells -+ * whether the in-service queue should be expired, by returning -+ * true. The purpose of expiring the in-service queue is to give bfqq -+ * the chance to possibly preempt the in-service queue, and the reason -+ * for preempting the in-service queue is to achieve one of the two -+ * goals below. -+ * -+ * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has -+ * expired because it has remained idle. 
In particular, bfqq may have -+ * expired for one of the following two reasons: -+ * -+ * - BFQ_BFQQ_NO_MORE_REQUEST bfqq did not enjoy any device idling and -+ * did not make it to issue a new request before its last request -+ * was served; -+ * -+ * - BFQ_BFQQ_TOO_IDLE bfqq did enjoy device idling, but did not issue -+ * a new request before the expiration of the idling-time. -+ * -+ * Even if bfqq has expired for one of the above reasons, the process -+ * associated with the queue may be however issuing requests greedily, -+ * and thus be sensitive to the bandwidth it receives (bfqq may have -+ * remained idle for other reasons: CPU high load, bfqq not enjoying -+ * idling, I/O throttling somewhere in the path from the process to -+ * the I/O scheduler, ...). But if, after every expiration for one of -+ * the above two reasons, bfqq has to wait for the service of at least -+ * one full budget of another queue before being served again, then -+ * bfqq is likely to get a much lower bandwidth or resource time than -+ * its reserved ones. To address this issue, two countermeasures need -+ * to be taken. -+ * -+ * First, the budget and the timestamps of bfqq need to be updated in -+ * a special way on bfqq reactivation: they need to be updated as if -+ * bfqq did not remain idle and did not expire. In fact, if they are -+ * computed as if bfqq expired and remained idle until reactivation, -+ * then the process associated with bfqq is treated as if, instead of -+ * being greedy, it stopped issuing requests when bfqq remained idle, -+ * and restarts issuing requests only on this reactivation. In other -+ * words, the scheduler does not help the process recover the "service -+ * hole" between bfqq expiration and reactivation. As a consequence, -+ * the process receives a lower bandwidth than its reserved one. 
In -+ * contrast, to recover this hole, the budget must be updated as if -+ * bfqq was not expired at all before this reactivation, i.e., it must -+ * be set to the value of the remaining budget when bfqq was -+ * expired. Along the same line, timestamps need to be assigned the -+ * value they had the last time bfqq was selected for service, i.e., -+ * before last expiration. Thus timestamps need to be back-shifted -+ * with respect to their normal computation (see [1] for more details -+ * on this tricky aspect). -+ * -+ * Secondly, to allow the process to recover the hole, the in-service -+ * queue must be expired too, to give bfqq the chance to preempt it -+ * immediately. In fact, if bfqq has to wait for a full budget of the -+ * in-service queue to be completed, then it may become impossible to -+ * let the process recover the hole, even if the back-shifted -+ * timestamps of bfqq are lower than those of the in-service queue. If -+ * this happens for most or all of the holes, then the process may not -+ * receive its reserved bandwidth. In this respect, it is worth noting -+ * that, being the service of outstanding requests unpreemptible, a -+ * little fraction of the holes may however be unrecoverable, thereby -+ * causing a little loss of bandwidth. -+ * -+ * The last important point is detecting whether bfqq does need this -+ * bandwidth recovery. In this respect, the next function deems the -+ * process associated with bfqq greedy, and thus allows it to recover -+ * the hole, if: 1) the process is waiting for the arrival of a new -+ * request (which implies that bfqq expired for one of the above two -+ * reasons), and 2) such a request has arrived soon. The first -+ * condition is controlled through the flag non_blocking_wait_rq, -+ * while the second through the flag arrived_in_time. If both -+ * conditions hold, then the function computes the budget in the -+ * above-described special way, and signals that the in-service queue -+ * should be expired. 
Timestamp back-shifting is done later in -+ * __bfq_activate_entity. -+ * -+ * 2. Reduce latency. Even if timestamps are not backshifted to let -+ * the process associated with bfqq recover a service hole, bfqq may -+ * however happen to have, after being (re)activated, a lower finish -+ * timestamp than the in-service queue. That is, the next budget of -+ * bfqq may have to be completed before the one of the in-service -+ * queue. If this is the case, then preempting the in-service queue -+ * allows this goal to be achieved, apart from the unpreemptible, -+ * outstanding requests mentioned above. -+ * -+ * Unfortunately, regardless of which of the above two goals one wants -+ * to achieve, service trees need first to be updated to know whether -+ * the in-service queue must be preempted. To have service trees -+ * correctly updated, the in-service queue must be expired and -+ * rescheduled, and bfqq must be scheduled too. This is one of the -+ * most costly operations (in future versions, the scheduling -+ * mechanism may be re-designed in such a way to make it possible to -+ * know whether preemption is needed without needing to update service -+ * trees). In addition, queue preemptions almost always cause random -+ * I/O, and thus loss of throughput. Because of these facts, the next -+ * function adopts the following simple scheme to avoid both costly -+ * operations and too frequent preemptions: it requests the expiration -+ * of the in-service queue (unconditionally) only for queues that need -+ * to recover a hole, or that either are weight-raised or deserve to -+ * be weight-raised. 
-+ */ -+static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ bool arrived_in_time, -+ bool wr_or_deserves_wr) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ /* -+ * In the next compound condition, we check also whether there -+ * is some budget left, because otherwise there is no point in -+ * trying to go on serving bfqq with this same budget: bfqq -+ * would be expired immediately after being selected for -+ * service. This would only cause useless overhead. -+ */ -+ if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time && -+ bfq_bfqq_budget_left(bfqq) > 0) { -+ /* -+ * We do not clear the flag non_blocking_wait_rq here, as -+ * the latter is used in bfq_activate_bfqq to signal -+ * that timestamps need to be back-shifted (and is -+ * cleared right after). -+ */ -+ -+ /* -+ * In next assignment we rely on that either -+ * entity->service or entity->budget are not updated -+ * on expiration if bfqq is empty (see -+ * __bfq_bfqq_recalc_budget). Thus both quantities -+ * remain unchanged after such an expiration, and the -+ * following statement therefore assigns to -+ * entity->budget the remaining budget on such an -+ * expiration. -+ */ -+ BUG_ON(bfqq->max_budget < 0); -+ entity->budget = min_t(unsigned long, -+ bfq_bfqq_budget_left(bfqq), -+ bfqq->max_budget); -+ -+ BUG_ON(entity->budget < 0); -+ -+ /* -+ * At this point, we have used entity->service to get -+ * the budget left (needed for updating -+ * entity->budget). Thus we finally can, and have to, -+ * reset entity->service. The latter must be reset -+ * because bfqq would otherwise be charged again for -+ * the service it has received during its previous -+ * service slot(s). -+ */ -+ entity->service = 0; -+ -+ return true; -+ } -+ -+ /* -+ * We can finally complete expiration, by setting service to 0. 
-+ */ -+ entity->service = 0; -+ BUG_ON(bfqq->max_budget < 0); -+ entity->budget = max_t(unsigned long, bfqq->max_budget, -+ bfq_serv_to_charge(bfqq->next_rq, bfqq)); -+ BUG_ON(entity->budget < 0); -+ -+ bfq_clear_bfqq_non_blocking_wait_rq(bfqq); -+ return wr_or_deserves_wr; -+} -+ -+/* -+ * Return the farthest past time instant according to jiffies -+ * macros. -+ */ -+static unsigned long bfq_smallest_from_now(void) -+{ -+ return jiffies - MAX_JIFFY_OFFSET; -+} -+ -+static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ unsigned int old_wr_coeff, -+ bool wr_or_deserves_wr, -+ bool interactive, -+ bool in_burst, -+ bool soft_rt) -+{ -+ if (old_wr_coeff == 1 && wr_or_deserves_wr) { -+ /* start a weight-raising period */ -+ if (interactive) { -+ bfqq->service_from_wr = 0; -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ } else { -+ /* -+ * No interactive weight raising in progress -+ * here: assign minus infinity to -+ * wr_start_at_switch_to_srt, to make sure -+ * that, at the end of the soft-real-time -+ * weight raising periods that is starting -+ * now, no interactive weight-raising period -+ * may be wrongly considered as still in -+ * progress (and thus actually started by -+ * mistake). -+ */ -+ bfqq->wr_start_at_switch_to_srt = -+ bfq_smallest_from_now(); -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff * -+ BFQ_SOFTRT_WEIGHT_FACTOR; -+ bfqq->wr_cur_max_time = -+ bfqd->bfq_wr_rt_max_time; -+ } -+ /* -+ * If needed, further reduce budget to make sure it is -+ * close to bfqq's backlog, so as to reduce the -+ * scheduling-error component due to a too large -+ * budget. Do not care about throughput consequences, -+ * but only about latency. Finally, do not assign a -+ * too small budget either, to avoid increasing -+ * latency by causing too frequent expirations. 
-+ */ -+ bfqq->entity.budget = min_t(unsigned long, -+ bfqq->entity.budget, -+ 2 * bfq_min_budget(bfqd)); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "wrais starting at %lu, rais_max_time %u", -+ jiffies, -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } else if (old_wr_coeff > 1) { -+ if (interactive) { /* update wr coeff and duration */ -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ } else if (in_burst) { -+ bfqq->wr_coeff = 1; -+ bfq_log_bfqq(bfqd, bfqq, -+ "wrais ending at %lu, rais_max_time %u", -+ jiffies, -+ jiffies_to_msecs(bfqq-> -+ wr_cur_max_time)); -+ } else if (soft_rt) { -+ /* -+ * The application is now or still meeting the -+ * requirements for being deemed soft rt. We -+ * can then correctly and safely (re)charge -+ * the weight-raising duration for the -+ * application with the weight-raising -+ * duration for soft rt applications. -+ * -+ * In particular, doing this recharge now, i.e., -+ * before the weight-raising period for the -+ * application finishes, reduces the probability -+ * of the following negative scenario: -+ * 1) the weight of a soft rt application is -+ * raised at startup (as for any newly -+ * created application), -+ * 2) since the application is not interactive, -+ * at a certain time weight-raising is -+ * stopped for the application, -+ * 3) at that time the application happens to -+ * still have pending requests, and hence -+ * is destined to not have a chance to be -+ * deemed soft rt before these requests are -+ * completed (see the comments to the -+ * function bfq_bfqq_softrt_next_start() -+ * for details on soft rt detection), -+ * 4) these pending requests experience a high -+ * latency because the application is not -+ * weight-raised while they are pending. 
-+ */ -+ if (bfqq->wr_cur_max_time != -+ bfqd->bfq_wr_rt_max_time) { -+ bfqq->wr_start_at_switch_to_srt = -+ bfqq->last_wr_start_finish; -+ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); -+ -+ bfqq->wr_cur_max_time = -+ bfqd->bfq_wr_rt_max_time; -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff * -+ BFQ_SOFTRT_WEIGHT_FACTOR; -+ bfq_log_bfqq(bfqd, bfqq, -+ "switching to soft_rt wr"); -+ } else -+ bfq_log_bfqq(bfqd, bfqq, -+ "moving forward soft_rt wr duration"); -+ bfqq->last_wr_start_finish = jiffies; -+ } -+ } -+} -+ -+static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ return bfqq->dispatched == 0 && -+ time_is_before_jiffies( -+ bfqq->budget_timeout + -+ bfqd->bfq_wr_min_idle_time); -+} -+ -+static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ int old_wr_coeff, -+ struct request *rq, -+ bool *interactive) -+{ -+ bool soft_rt, in_burst, wr_or_deserves_wr, -+ bfqq_wants_to_preempt, -+ idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq), -+ /* -+ * See the comments on -+ * bfq_bfqq_update_budg_for_activation for -+ * details on the usage of the next variable. 
-+ */ -+ arrived_in_time = ktime_get_ns() <= -+ RQ_BIC(rq)->ttime.last_end_request + -+ bfqd->bfq_slice_idle * 3; -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "bfq_add_request non-busy: " -+ "jiffies %lu, in_time %d, idle_long %d busyw %d " -+ "wr_coeff %u", -+ jiffies, arrived_in_time, -+ idle_for_long_time, -+ bfq_bfqq_non_blocking_wait_rq(bfqq), -+ old_wr_coeff); -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ -+ BUG_ON(bfqq == bfqd->in_service_queue); -+ bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, rq->cmd_flags); -+ -+ /* -+ * bfqq deserves to be weight-raised if: -+ * - it is sync, -+ * - it does not belong to a large burst, -+ * - it has been idle for enough time or is soft real-time, -+ * - is linked to a bfq_io_cq (it is not shared in any sense) -+ */ -+ in_burst = bfq_bfqq_in_large_burst(bfqq); -+ soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && -+ !in_burst && -+ time_is_before_jiffies(bfqq->soft_rt_next_start) && -+ bfqq->dispatched == 0; -+ *interactive = -+ !in_burst && -+ idle_for_long_time; -+ wr_or_deserves_wr = bfqd->low_latency && -+ (bfqq->wr_coeff > 1 || -+ (bfq_bfqq_sync(bfqq) && -+ bfqq->bic && (*interactive || soft_rt))); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "bfq_add_request: " -+ "in_burst %d, " -+ "soft_rt %d (next %lu), inter %d, bic %p", -+ bfq_bfqq_in_large_burst(bfqq), soft_rt, -+ bfqq->soft_rt_next_start, -+ *interactive, -+ bfqq->bic); -+ -+ /* -+ * Using the last flag, update budget and check whether bfqq -+ * may want to preempt the in-service queue. -+ */ -+ bfqq_wants_to_preempt = -+ bfq_bfqq_update_budg_for_activation(bfqd, bfqq, -+ arrived_in_time, -+ wr_or_deserves_wr); -+ -+ /* -+ * If bfqq happened to be activated in a burst, but has been -+ * idle for much more than an interactive queue, then we -+ * assume that, in the overall I/O initiated in the burst, the -+ * I/O associated with bfqq is finished. So bfqq does not need -+ * to be treated as a queue belonging to a burst -+ * anymore. 
Accordingly, we reset bfqq's in_large_burst flag -+ * if set, and remove bfqq from the burst list if it's -+ * there. We do not decrement burst_size, because the fact -+ * that bfqq does not need to belong to the burst list any -+ * more does not invalidate the fact that bfqq was created in -+ * a burst. -+ */ -+ if (likely(!bfq_bfqq_just_created(bfqq)) && -+ idle_for_long_time && -+ time_is_before_jiffies( -+ bfqq->budget_timeout + -+ msecs_to_jiffies(10000))) { -+ hlist_del_init(&bfqq->burst_list_node); -+ bfq_clear_bfqq_in_large_burst(bfqq); -+ } -+ -+ bfq_clear_bfqq_just_created(bfqq); -+ -+ if (!bfq_bfqq_IO_bound(bfqq)) { -+ if (arrived_in_time) { -+ bfqq->requests_within_timer++; -+ if (bfqq->requests_within_timer >= -+ bfqd->bfq_requests_within_timer) -+ bfq_mark_bfqq_IO_bound(bfqq); -+ } else -+ bfqq->requests_within_timer = 0; -+ bfq_log_bfqq(bfqd, bfqq, "requests in time %d", -+ bfqq->requests_within_timer); -+ } -+ -+ if (bfqd->low_latency) { -+ if (unlikely(time_is_after_jiffies(bfqq->split_time))) -+ /* wraparound */ -+ bfqq->split_time = -+ jiffies - bfqd->bfq_wr_min_idle_time - 1; -+ -+ if (time_is_before_jiffies(bfqq->split_time + -+ bfqd->bfq_wr_min_idle_time)) { -+ bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq, -+ old_wr_coeff, -+ wr_or_deserves_wr, -+ *interactive, -+ in_burst, -+ soft_rt); -+ -+ if (old_wr_coeff != bfqq->wr_coeff) -+ bfqq->entity.prio_changed = 1; -+ } -+ } -+ -+ bfqq->last_idle_bklogged = jiffies; -+ bfqq->service_from_backlogged = 0; -+ bfq_clear_bfqq_softrt_update(bfqq); -+ -+ bfq_add_bfqq_busy(bfqd, bfqq); -+ -+ /* -+ * Expire in-service queue only if preemption may be needed -+ * for guarantees. In this respect, the function -+ * next_queue_may_preempt just checks a simple, necessary -+ * condition, and not a sufficient condition based on -+ * timestamps. 
In fact, for the latter condition to be -+ * evaluated, timestamps would need first to be updated, and -+ * this operation is quite costly (see the comments on the -+ * function bfq_bfqq_update_budg_for_activation). -+ */ -+ if (bfqd->in_service_queue && bfqq_wants_to_preempt && -+ bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff && -+ next_queue_may_preempt(bfqd)) { -+ struct bfq_queue *in_serv = -+ bfqd->in_service_queue; -+ BUG_ON(in_serv == bfqq); -+ -+ bfq_bfqq_expire(bfqd, bfqd->in_service_queue, -+ false, BFQ_BFQQ_PREEMPTED); -+ } -+} -+ -+static void bfq_add_request(struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ struct bfq_data *bfqd = bfqq->bfqd; -+ struct request *next_rq, *prev; -+ unsigned int old_wr_coeff = bfqq->wr_coeff; -+ bool interactive = false; -+ -+ bfq_log_bfqq(bfqd, bfqq, "size %u %s", -+ blk_rq_sectors(rq), rq_is_sync(rq) ? "S" : "A"); -+ -+ if (bfqq->wr_coeff > 1) /* queue is being weight-raised */ -+ bfq_log_bfqq(bfqd, bfqq, -+ "raising period dur %u/%u msec, old coeff %u, w %d(%d)", -+ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), -+ jiffies_to_msecs(bfqq->wr_cur_max_time), -+ bfqq->wr_coeff, -+ bfqq->entity.weight, bfqq->entity.orig_weight); -+ -+ bfqq->queued[rq_is_sync(rq)]++; -+ bfqd->queued++; -+ -+ elv_rb_add(&bfqq->sort_list, rq); -+ -+ /* -+ * Check if this request is a better next-to-serve candidate. -+ */ -+ prev = bfqq->next_rq; -+ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); -+ BUG_ON(!next_rq); -+ bfqq->next_rq = next_rq; -+ -+ /* -+ * Adjust priority tree position, if next_rq changes. -+ */ -+ if (prev != bfqq->next_rq) -+ bfq_pos_tree_add_move(bfqd, bfqq); -+ -+ if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... 
*/ -+ bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff, -+ rq, &interactive); -+ else { -+ if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) && -+ time_is_before_jiffies( -+ bfqq->last_wr_start_finish + -+ bfqd->bfq_wr_min_inter_arr_async)) { -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ -+ bfqd->wr_busy_queues++; -+ BUG_ON(bfqd->wr_busy_queues > bfq_tot_busy_queues(bfqd)); -+ bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqd, bfqq, -+ "non-idle wrais starting, " -+ "wr_max_time %u wr_busy %d", -+ jiffies_to_msecs(bfqq->wr_cur_max_time), -+ bfqd->wr_busy_queues); -+ } -+ if (prev != bfqq->next_rq) -+ bfq_updated_next_req(bfqd, bfqq); -+ } -+ -+ /* -+ * Assign jiffies to last_wr_start_finish in the following -+ * cases: -+ * -+ * . if bfqq is not going to be weight-raised, because, for -+ * non weight-raised queues, last_wr_start_finish stores the -+ * arrival time of the last request; as of now, this piece -+ * of information is used only for deciding whether to -+ * weight-raise async queues -+ * -+ * . if bfqq is not weight-raised, because, if bfqq is now -+ * switching to weight-raised, then last_wr_start_finish -+ * stores the time when weight-raising starts -+ * -+ * . if bfqq is interactive, because, regardless of whether -+ * bfqq is currently weight-raised, the weight-raising -+ * period must start or restart (this case is considered -+ * separately because it is not detected by the above -+ * conditions, if bfqq is already weight-raised) -+ * -+ * last_wr_start_finish has to be updated also if bfqq is soft -+ * real-time, because the weight-raising period is constantly -+ * restarted on idle-to-busy transitions for these queues, but -+ * this is already done in bfq_bfqq_handle_idle_busy_switch if -+ * needed. 
-+ */ -+ if (bfqd->low_latency && -+ (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive)) -+ bfqq->last_wr_start_finish = jiffies; -+} -+ -+static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, -+ struct bio *bio) -+{ -+ struct task_struct *tsk = current; -+ struct bfq_io_cq *bic; -+ struct bfq_queue *bfqq; -+ -+ bic = bfq_bic_lookup(bfqd, tsk->io_context); -+ if (!bic) -+ return NULL; -+ -+ bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); -+ if (bfqq) -+ return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); -+ -+ return NULL; -+} -+ -+static sector_t get_sdist(sector_t last_pos, struct request *rq) -+{ -+ sector_t sdist = 0; -+ -+ if (last_pos) { -+ if (last_pos < blk_rq_pos(rq)) -+ sdist = blk_rq_pos(rq) - last_pos; -+ else -+ sdist = last_pos - blk_rq_pos(rq); -+ } -+ -+ return sdist; -+} -+ -+static void bfq_activate_request(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ bfqd->rq_in_driver++; -+} -+ -+static void bfq_deactivate_request(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ -+ BUG_ON(bfqd->rq_in_driver == 0); -+ bfqd->rq_in_driver--; -+} -+ -+static void bfq_remove_request(struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ struct bfq_data *bfqd = bfqq->bfqd; -+ const int sync = rq_is_sync(rq); -+ -+ /* -+ * NOTE: -+ * (bfqq->entity.service > bfqq->entity.budget) may hold here, -+ * in case of forced dispatches. 
-+ */ -+ -+ if (bfqq->next_rq == rq) { -+ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); -+ bfq_updated_next_req(bfqd, bfqq); -+ } -+ -+ if (rq->queuelist.prev != &rq->queuelist) -+ list_del_init(&rq->queuelist); -+ BUG_ON(bfqq->queued[sync] == 0); -+ bfqq->queued[sync]--; -+ bfqd->queued--; -+ elv_rb_del(&bfqq->sort_list, rq); -+ -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { -+ bfqq->next_rq = NULL; -+ -+ BUG_ON(bfqq->entity.budget < 0); -+ -+ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) { -+ BUG_ON(bfqq->ref < 2); /* referred by rq and on tree */ -+ bfq_del_bfqq_busy(bfqd, bfqq, false); -+ /* -+ * bfqq emptied. In normal operation, when -+ * bfqq is empty, bfqq->entity.service and -+ * bfqq->entity.budget must contain, -+ * respectively, the service received and the -+ * budget used last time bfqq emptied. These -+ * facts do not hold in this case, as at least -+ * this last removal occurred while bfqq is -+ * not in service. To avoid inconsistencies, -+ * reset both bfqq->entity.service and -+ * bfqq->entity.budget, if bfqq has still a -+ * process that may issue I/O requests to it. -+ */ -+ bfqq->entity.budget = bfqq->entity.service = 0; -+ } -+ -+ /* -+ * Remove queue from request-position tree as it is empty. 
-+ */ -+ if (bfqq->pos_root) { -+ rb_erase(&bfqq->pos_node, bfqq->pos_root); -+ bfqq->pos_root = NULL; -+ } -+ } else { -+ BUG_ON(!bfqq->next_rq); -+ bfq_pos_tree_add_move(bfqd, bfqq); -+ } -+ -+ if (rq->cmd_flags & REQ_META) { -+ BUG_ON(bfqq->meta_pending == 0); -+ bfqq->meta_pending--; -+ } -+ bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); -+} -+ -+static enum elv_merge bfq_merge(struct request_queue *q, struct request **req, -+ struct bio *bio) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct request *__rq; -+ -+ __rq = bfq_find_rq_fmerge(bfqd, bio); -+ if (__rq && elv_bio_merge_ok(__rq, bio)) { -+ *req = __rq; -+ return ELEVATOR_FRONT_MERGE; -+ } -+ -+ return ELEVATOR_NO_MERGE; -+} -+ -+static void bfq_merged_request(struct request_queue *q, struct request *req, -+ enum elv_merge type) -+{ -+ if (type == ELEVATOR_FRONT_MERGE && -+ rb_prev(&req->rb_node) && -+ blk_rq_pos(req) < -+ blk_rq_pos(container_of(rb_prev(&req->rb_node), -+ struct request, rb_node))) { -+ struct bfq_queue *bfqq = RQ_BFQQ(req); -+ struct bfq_data *bfqd = bfqq->bfqd; -+ struct request *prev, *next_rq; -+ -+ /* Reposition request in its sort_list */ -+ elv_rb_del(&bfqq->sort_list, req); -+ elv_rb_add(&bfqq->sort_list, req); -+ /* Choose next request to be served for bfqq */ -+ prev = bfqq->next_rq; -+ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, -+ bfqd->last_position); -+ BUG_ON(!next_rq); -+ bfqq->next_rq = next_rq; -+ /* -+ * If next_rq changes, update both the queue's budget to -+ * fit the new request and the queue's position in its -+ * rq_pos_tree. 
-+ */ -+ if (prev != bfqq->next_rq) { -+ bfq_updated_next_req(bfqd, bfqq); -+ bfq_pos_tree_add_move(bfqd, bfqq); -+ } -+ } -+} -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+static void bfq_bio_merged(struct request_queue *q, struct request *req, -+ struct bio *bio) -+{ -+ bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio->bi_opf); -+} -+#endif -+ -+static void bfq_merged_requests(struct request_queue *q, struct request *rq, -+ struct request *next) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next); -+ -+ /* -+ * If next and rq belong to the same bfq_queue and next is older -+ * than rq, then reposition rq in the fifo (by substituting next -+ * with rq). Otherwise, if next and rq belong to different -+ * bfq_queues, never reposition rq: in fact, we would have to -+ * reposition it with respect to next's position in its own fifo, -+ * which would most certainly be too expensive with respect to -+ * the benefits. -+ */ -+ if (bfqq == next_bfqq && -+ !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && -+ next->fifo_time < rq->fifo_time) { -+ list_del_init(&rq->queuelist); -+ list_replace_init(&next->queuelist, &rq->queuelist); -+ rq->fifo_time = next->fifo_time; -+ } -+ -+ if (bfqq->next_rq == next) -+ bfqq->next_rq = rq; -+ -+ bfq_remove_request(next); -+ bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); -+} -+ -+/* Must be called with bfqq != NULL */ -+static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) -+{ -+ BUG_ON(!bfqq); -+ -+ if (bfq_bfqq_busy(bfqq)) { -+ bfqq->bfqd->wr_busy_queues--; -+ BUG_ON(bfqq->bfqd->wr_busy_queues < 0); -+ } -+ bfqq->wr_coeff = 1; -+ bfqq->wr_cur_max_time = 0; -+ bfqq->last_wr_start_finish = jiffies; -+ /* -+ * Trigger a weight change on the next invocation of -+ * __bfq_entity_update_weight_prio. 
-+ */ -+ bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "wrais ending at %lu, rais_max_time %u", -+ bfqq->last_wr_start_finish, -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "wr_busy %d", -+ bfqq->bfqd->wr_busy_queues); -+} -+ -+static void bfq_end_wr_async_queues(struct bfq_data *bfqd, -+ struct bfq_group *bfqg) -+{ -+ int i, j; -+ -+ for (i = 0; i < 2; i++) -+ for (j = 0; j < IOPRIO_BE_NR; j++) -+ if (bfqg->async_bfqq[i][j]) -+ bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); -+ if (bfqg->async_idle_bfqq) -+ bfq_bfqq_end_wr(bfqg->async_idle_bfqq); -+} -+ -+static void bfq_end_wr(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq; -+ -+ spin_lock_irq(bfqd->queue->queue_lock); -+ -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) -+ bfq_bfqq_end_wr(bfqq); -+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) -+ bfq_bfqq_end_wr(bfqq); -+ bfq_end_wr_async(bfqd); -+ -+ spin_unlock_irq(bfqd->queue->queue_lock); -+} -+ -+static sector_t bfq_io_struct_pos(void *io_struct, bool request) -+{ -+ if (request) -+ return blk_rq_pos(io_struct); -+ else -+ return ((struct bio *)io_struct)->bi_iter.bi_sector; -+} -+ -+static int bfq_rq_close_to_sector(void *io_struct, bool request, -+ sector_t sector) -+{ -+ return abs(bfq_io_struct_pos(io_struct, request) - sector) <= -+ BFQQ_CLOSE_THR; -+} -+ -+static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ sector_t sector) -+{ -+ struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; -+ struct rb_node *parent, *node; -+ struct bfq_queue *__bfqq; -+ -+ if (RB_EMPTY_ROOT(root)) -+ return NULL; -+ -+ /* -+ * First, if we find a request starting at the end of the last -+ * request, choose it. 
-+ */ -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL); -+ if (__bfqq) -+ return __bfqq; -+ -+ /* -+ * If the exact sector wasn't found, the parent of the NULL leaf -+ * will contain the closest sector (rq_pos_tree sorted by -+ * next_request position). -+ */ -+ __bfqq = rb_entry(parent, struct bfq_queue, pos_node); -+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) -+ return __bfqq; -+ -+ if (blk_rq_pos(__bfqq->next_rq) < sector) -+ node = rb_next(&__bfqq->pos_node); -+ else -+ node = rb_prev(&__bfqq->pos_node); -+ if (!node) -+ return NULL; -+ -+ __bfqq = rb_entry(node, struct bfq_queue, pos_node); -+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) -+ return __bfqq; -+ -+ return NULL; -+} -+ -+static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd, -+ struct bfq_queue *cur_bfqq, -+ sector_t sector) -+{ -+ struct bfq_queue *bfqq; -+ -+ /* -+ * We shall notice if some of the queues are cooperating, -+ * e.g., working closely on the same area of the device. In -+ * that case, we can group them together and: 1) don't waste -+ * time idling, and 2) serve the union of their requests in -+ * the best possible order for throughput. -+ */ -+ bfqq = bfqq_find_close(bfqd, cur_bfqq, sector); -+ if (!bfqq || bfqq == cur_bfqq) -+ return NULL; -+ -+ return bfqq; -+} -+ -+static struct bfq_queue * -+bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) -+{ -+ int process_refs, new_process_refs; -+ struct bfq_queue *__bfqq; -+ -+ /* -+ * If there are no process references on the new_bfqq, then it is -+ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain -+ * may have dropped their last reference (not just their last process -+ * reference). -+ */ -+ if (!bfqq_process_refs(new_bfqq)) -+ return NULL; -+ -+ /* Avoid a circular list and skip interim queue merges. 
*/ -+ while ((__bfqq = new_bfqq->new_bfqq)) { -+ if (__bfqq == bfqq) -+ return NULL; -+ new_bfqq = __bfqq; -+ } -+ -+ process_refs = bfqq_process_refs(bfqq); -+ new_process_refs = bfqq_process_refs(new_bfqq); -+ /* -+ * If the process for the bfqq has gone away, there is no -+ * sense in merging the queues. -+ */ -+ if (process_refs == 0 || new_process_refs == 0) -+ return NULL; -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", -+ new_bfqq->pid); -+ -+ /* -+ * Merging is just a redirection: the requests of the process -+ * owning one of the two queues are redirected to the other queue. -+ * The latter queue, in its turn, is set as shared if this is the -+ * first time that the requests of some process are redirected to -+ * it. -+ * -+ * We redirect bfqq to new_bfqq and not the opposite, because we -+ * are in the context of the process owning bfqq, hence we have -+ * the io_cq of this process. So we can immediately configure this -+ * io_cq to redirect the requests of the process to new_bfqq. -+ * -+ * NOTE, even if new_bfqq coincides with the in-service queue, the -+ * io_cq of new_bfqq is not available, because, if the in-service -+ * queue is shared, bfqd->in_service_bic may not point to the -+ * io_cq of the in-service queue. -+ * Redirecting the requests of the process owning bfqq to the -+ * currently in-service queue is in any case the best option, as -+ * we feed the in-service queue with new requests close to the -+ * last request served and, by doing so, hopefully increase the -+ * throughput. 
-+ */ -+ bfqq->new_bfqq = new_bfqq; -+ new_bfqq->ref += process_refs; -+ return new_bfqq; -+} -+ -+static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, -+ struct bfq_queue *new_bfqq) -+{ -+ if (bfq_too_late_for_merging(new_bfqq)) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "too late for bfq%d to be merged", -+ new_bfqq->pid); -+ return false; -+ } -+ -+ if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) || -+ (bfqq->ioprio_class != new_bfqq->ioprio_class)) -+ return false; -+ -+ /* -+ * If either of the queues has already been detected as seeky, -+ * then merging it with the other queue is unlikely to lead to -+ * sequential I/O. -+ */ -+ if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq)) -+ return false; -+ -+ /* -+ * Interleaved I/O is known to be done by (some) applications -+ * only for reads, so it does not make sense to merge async -+ * queues. -+ */ -+ if (!bfq_bfqq_sync(bfqq) || !bfq_bfqq_sync(new_bfqq)) -+ return false; -+ -+ return true; -+} -+ -+/* -+ * Attempt to schedule a merge of bfqq with the currently in-service -+ * queue or with a close queue among the scheduled queues. Return -+ * NULL if no merge was scheduled, a pointer to the shared bfq_queue -+ * structure otherwise. -+ * -+ * The OOM queue is not allowed to participate to cooperation: in fact, since -+ * the requests temporarily redirected to the OOM queue could be redirected -+ * again to dedicated queues at any time, the state needed to correctly -+ * handle merging with the OOM queue would be quite complex and expensive -+ * to maintain. Besides, in such a critical condition as an out of memory, -+ * the benefits of queue merging may be little relevant, or even negligible. 
-+ * -+ * WARNING: queue merging may impair fairness among non-weight raised -+ * queues, for at least two reasons: 1) the original weight of a -+ * merged queue may change during the merged state, 2) even being the -+ * weight the same, a merged queue may be bloated with many more -+ * requests than the ones produced by its originally-associated -+ * process. -+ */ -+static struct bfq_queue * -+bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ void *io_struct, bool request) -+{ -+ struct bfq_queue *in_service_bfqq, *new_bfqq; -+ -+ /* -+ * Prevent bfqq from being merged if it has been created too -+ * long ago. The idea is that true cooperating processes, and -+ * thus their associated bfq_queues, are supposed to be -+ * created shortly after each other. This is the case, e.g., -+ * for KVM/QEMU and dump I/O threads. Basing on this -+ * assumption, the following filtering greatly reduces the -+ * probability that two non-cooperating processes, which just -+ * happen to do close I/O for some short time interval, have -+ * their queues merged by mistake. -+ */ -+ if (bfq_too_late_for_merging(bfqq)) { -+ bfq_log_bfqq(bfqd, bfqq, -+ "would have looked for coop, but too late"); -+ return NULL; -+ } -+ -+ if (bfqq->new_bfqq) -+ return bfqq->new_bfqq; -+ -+ if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) -+ return NULL; -+ -+ /* If there is only one backlogged queue, don't search. 
*/ -+ if (bfq_tot_busy_queues(bfqd) == 1) -+ return NULL; -+ -+ in_service_bfqq = bfqd->in_service_queue; -+ -+ if (in_service_bfqq && in_service_bfqq != bfqq && -+ likely(in_service_bfqq != &bfqd->oom_bfqq) && -+ bfq_rq_close_to_sector(io_struct, request, bfqd->in_serv_last_pos) && -+ bfqq->entity.parent == in_service_bfqq->entity.parent && -+ bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) { -+ new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); -+ if (new_bfqq) -+ return new_bfqq; -+ } -+ /* -+ * Check whether there is a cooperator among currently scheduled -+ * queues. The only thing we need is that the bio/request is not -+ * NULL, as we need it to establish whether a cooperator exists. -+ */ -+ new_bfqq = bfq_find_close_cooperator(bfqd, bfqq, -+ bfq_io_struct_pos(io_struct, request)); -+ -+ BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent); -+ -+ if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) && -+ bfq_may_be_close_cooperator(bfqq, new_bfqq)) -+ return bfq_setup_merge(bfqq, new_bfqq); -+ -+ return NULL; -+} -+ -+static void bfq_bfqq_save_state(struct bfq_queue *bfqq) -+{ -+ struct bfq_io_cq *bic = bfqq->bic; -+ -+ /* -+ * If !bfqq->bic, the queue is already shared or its requests -+ * have already been redirected to a shared queue; both idle window -+ * and weight raising state have already been saved. Do nothing. -+ */ -+ if (!bic) -+ return; -+ -+ bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq); -+ bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); -+ bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); -+ bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); -+ if (unlikely(bfq_bfqq_just_created(bfqq) && -+ !bfq_bfqq_in_large_burst(bfqq) && -+ bfqq->bfqd->low_latency)) { -+ /* -+ * bfqq being merged ritgh after being created: bfqq -+ * would have deserved interactive weight raising, but -+ * did not make it to be set in a weight-raised state, -+ * because of this early merge. 
Store directly the -+ * weight-raising state that would have been assigned -+ * to bfqq, so that to avoid that bfqq unjustly fails -+ * to enjoy weight raising if split soon. -+ */ -+ bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff; -+ bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd); -+ bic->saved_last_wr_start_finish = jiffies; -+ } else { -+ bic->saved_wr_coeff = bfqq->wr_coeff; -+ bic->saved_wr_start_at_switch_to_srt = -+ bfqq->wr_start_at_switch_to_srt; -+ bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; -+ bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; -+ } -+ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); -+} -+ -+static void bfq_get_bic_reference(struct bfq_queue *bfqq) -+{ -+ /* -+ * If bfqq->bic has a non-NULL value, the bic to which it belongs -+ * is about to begin using a shared bfq_queue. -+ */ -+ if (bfqq->bic) -+ atomic_long_inc(&bfqq->bic->icq.ioc->refcount); -+} -+ -+static void -+bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, -+ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) -+{ -+ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", -+ (unsigned long) new_bfqq->pid); -+ /* Save weight raising and idle window of the merged queues */ -+ bfq_bfqq_save_state(bfqq); -+ bfq_bfqq_save_state(new_bfqq); -+ if (bfq_bfqq_IO_bound(bfqq)) -+ bfq_mark_bfqq_IO_bound(new_bfqq); -+ bfq_clear_bfqq_IO_bound(bfqq); -+ -+ /* -+ * If bfqq is weight-raised, then let new_bfqq inherit -+ * weight-raising. To reduce false positives, neglect the case -+ * where bfqq has just been created, but has not yet made it -+ * to be weight-raised (which may happen because EQM may merge -+ * bfqq even before bfq_add_request is executed for the first -+ * time for bfqq). Handling this case would however be very -+ * easy, thanks to the flag just_created. 
-+ */ -+ if (new_bfqq->wr_coeff == 1 && bfqq->wr_coeff > 1) { -+ new_bfqq->wr_coeff = bfqq->wr_coeff; -+ new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time; -+ new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish; -+ new_bfqq->wr_start_at_switch_to_srt = -+ bfqq->wr_start_at_switch_to_srt; -+ if (bfq_bfqq_busy(new_bfqq)) { -+ bfqd->wr_busy_queues++; -+ BUG_ON(bfqd->wr_busy_queues > -+ bfq_tot_busy_queues(bfqd)); -+ } -+ -+ new_bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqd, new_bfqq, -+ "wr start after merge with %d, rais_max_time %u", -+ bfqq->pid, -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } -+ -+ if (bfqq->wr_coeff > 1) { /* bfqq has given its wr to new_bfqq */ -+ bfqq->wr_coeff = 1; -+ bfqq->entity.prio_changed = 1; -+ if (bfq_bfqq_busy(bfqq)) { -+ bfqd->wr_busy_queues--; -+ BUG_ON(bfqd->wr_busy_queues < 0); -+ } -+ -+ } -+ -+ bfq_log_bfqq(bfqd, new_bfqq, "wr_busy %d", -+ bfqd->wr_busy_queues); -+ -+ /* -+ * Grab a reference to the bic, to prevent it from being destroyed -+ * before being possibly touched by a bfq_split_bfqq(). -+ */ -+ bfq_get_bic_reference(bfqq); -+ bfq_get_bic_reference(new_bfqq); -+ /* -+ * Merge queues (that is, let bic redirect its requests to new_bfqq) -+ */ -+ bic_set_bfqq(bic, new_bfqq, 1); -+ bfq_mark_bfqq_coop(new_bfqq); -+ /* -+ * new_bfqq now belongs to at least two bics (it is a shared queue): -+ * set new_bfqq->bic to NULL. bfqq either: -+ * - does not belong to any bic any more, and hence bfqq->bic must -+ * be set to NULL, or -+ * - is a queue whose owning bics have already been redirected to a -+ * different queue, hence the queue is destined to not belong to -+ * any bic soon and bfqq->bic is already NULL (therefore the next -+ * assignment causes no harm). 
-+ */ -+ new_bfqq->bic = NULL; -+ bfqq->bic = NULL; -+ /* release process reference to bfqq */ -+ bfq_put_queue(bfqq); -+} -+ -+static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq, -+ struct bio *bio) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ bool is_sync = op_is_sync(bio->bi_opf); -+ struct bfq_io_cq *bic; -+ struct bfq_queue *bfqq, *new_bfqq; -+ -+ /* -+ * Disallow merge of a sync bio into an async request. -+ */ -+ if (is_sync && !rq_is_sync(rq)) -+ return false; -+ -+ /* -+ * Lookup the bfqq that this bio will be queued with. Allow -+ * merge only if rq is queued there. -+ * Queue lock is held here. -+ */ -+ bic = bfq_bic_lookup(bfqd, current->io_context); -+ if (!bic) -+ return false; -+ -+ bfqq = bic_to_bfqq(bic, is_sync); -+ /* -+ * We take advantage of this function to perform an early merge -+ * of the queues of possible cooperating processes. -+ */ -+ if (bfqq) { -+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); -+ if (new_bfqq) { -+ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); -+ /* -+ * If we get here, the bio will be queued in the -+ * shared queue, i.e., new_bfqq, so use new_bfqq -+ * to decide whether bio and rq can be merged. -+ */ -+ bfqq = new_bfqq; -+ } -+ } -+ -+ return bfqq == RQ_BFQQ(rq); -+} -+ -+static int bfq_allow_rq_merge(struct request_queue *q, struct request *rq, -+ struct request *next) -+{ -+ return RQ_BFQQ(rq) == RQ_BFQQ(next); -+} -+ -+/* -+ * Set the maximum time for the in-service queue to consume its -+ * budget. This prevents seeky processes from lowering the throughput. -+ * In practice, a time-slice service scheme is used with seeky -+ * processes. 
-+ */ -+static void bfq_set_budget_timeout(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ unsigned int timeout_coeff; -+ -+ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time) -+ timeout_coeff = 1; -+ else -+ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; -+ -+ bfqd->last_budget_start = ktime_get(); -+ -+ bfqq->budget_timeout = jiffies + -+ bfqd->bfq_timeout * timeout_coeff; -+ -+ bfq_log_bfqq(bfqd, bfqq, "%u", -+ jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff)); -+} -+ -+static void __bfq_set_in_service_queue(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ if (bfqq) { -+ bfqg_stats_update_avg_queue_size(bfqq_group(bfqq)); -+ bfq_mark_bfqq_must_alloc(bfqq); -+ bfq_clear_bfqq_fifo_expire(bfqq); -+ -+ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; -+ -+ BUG_ON(bfqq == bfqd->in_service_queue); -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ -+ if (time_is_before_jiffies(bfqq->last_wr_start_finish) && -+ bfqq->wr_coeff > 1 && -+ bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && -+ time_is_before_jiffies(bfqq->budget_timeout)) { -+ /* -+ * For soft real-time queues, move the start -+ * of the weight-raising period forward by the -+ * time the queue has not received any -+ * service. Otherwise, a relatively long -+ * service delay is likely to cause the -+ * weight-raising period of the queue to end, -+ * because of the short duration of the -+ * weight-raising period of a soft real-time -+ * queue. It is worth noting that this move -+ * is not so dangerous for the other queues, -+ * because soft real-time queues are not -+ * greedy. -+ * -+ * To not add a further variable, we use the -+ * overloaded field budget_timeout to -+ * determine for how long the queue has not -+ * received service, i.e., how much time has -+ * elapsed since the queue expired. However, -+ * this is a little imprecise, because -+ * budget_timeout is set to jiffies if bfqq -+ * not only expires, but also remains with no -+ * request. 
-+ */ -+ if (time_after(bfqq->budget_timeout, -+ bfqq->last_wr_start_finish)) -+ bfqq->last_wr_start_finish += -+ jiffies - bfqq->budget_timeout; -+ else -+ bfqq->last_wr_start_finish = jiffies; -+ -+ if (time_is_after_jiffies(bfqq->last_wr_start_finish)) { -+ pr_crit( -+ "BFQ WARNING:last %lu budget %lu jiffies %lu", -+ bfqq->last_wr_start_finish, -+ bfqq->budget_timeout, -+ jiffies); -+ pr_crit("diff %lu", jiffies - -+ max_t(unsigned long, -+ bfqq->last_wr_start_finish, -+ bfqq->budget_timeout)); -+ bfqq->last_wr_start_finish = jiffies; -+ } -+ } -+ -+ bfq_set_budget_timeout(bfqd, bfqq); -+ bfq_log_bfqq(bfqd, bfqq, -+ "cur-budget = %d prio_class %d", -+ bfqq->entity.budget, bfqq->ioprio_class); -+ } else -+ bfq_log(bfqd, "NULL"); -+ -+ bfqd->in_service_queue = bfqq; -+} -+ -+/* -+ * Get and set a new queue for service. -+ */ -+static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd); -+ -+ __bfq_set_in_service_queue(bfqd, bfqq); -+ return bfqq; -+} -+ -+static void bfq_arm_slice_timer(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq = bfqd->in_service_queue; -+ struct bfq_io_cq *bic; -+ u32 sl; -+ -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); -+ -+ /* Processes have exited, don't wait. */ -+ bic = bfqd->in_service_bic; -+ if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0) -+ return; -+ -+ bfq_mark_bfqq_wait_request(bfqq); -+ -+ /* -+ * We don't want to idle for seeks, but we do want to allow -+ * fair distribution of slice time for a process doing back-to-back -+ * seeks. So allow a little bit of time for him to submit a new rq. -+ * -+ * To prevent processes with (partly) seeky workloads from -+ * being too ill-treated, grant them a small fraction of the -+ * assigned budget before reducing the waiting time to -+ * BFQ_MIN_TT. This happened to help reduce latency. 
-+ */ -+ sl = bfqd->bfq_slice_idle; -+ /* -+ * Unless the queue is being weight-raised or the scenario is -+ * asymmetric, grant only minimum idle time if the queue -+ * is seeky. A long idling is preserved for a weight-raised -+ * queue, or, more in general, in an asymemtric scenario, -+ * because a long idling is needed for guaranteeing to a queue -+ * its reserved share of the throughput (in particular, it is -+ * needed if the queue has a higher weight than some other -+ * queue). -+ */ -+ if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 && -+ bfq_symmetric_scenario(bfqd)) -+ sl = min_t(u32, sl, BFQ_MIN_TT); -+ -+ bfqd->last_idling_start = ktime_get(); -+ hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl), -+ HRTIMER_MODE_REL); -+ bfqg_stats_set_start_idle_time(bfqq_group(bfqq)); -+ bfq_log(bfqd, "arm idle: %ld/%ld ms", -+ sl / NSEC_PER_MSEC, bfqd->bfq_slice_idle / NSEC_PER_MSEC); -+} -+ -+/* -+ * In autotuning mode, max_budget is dynamically recomputed as the -+ * amount of sectors transferred in timeout at the estimated peak -+ * rate. This enables BFQ to utilize a full timeslice with a full -+ * budget, even if the in-service queue is served at peak rate. And -+ * this maximises throughput with sequential workloads. -+ */ -+static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd) -+{ -+ return (u64)bfqd->peak_rate * USEC_PER_MSEC * -+ jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT; -+} -+ -+/* -+ * Update parameters related to throughput and responsiveness, as a -+ * function of the estimated peak rate. See comments on -+ * bfq_calc_max_budget(), and on the ref_wr_duration array. 
-+ */ -+static void update_thr_responsiveness_params(struct bfq_data *bfqd) -+{ -+ if (bfqd->bfq_user_max_budget == 0) { -+ bfqd->bfq_max_budget = -+ bfq_calc_max_budget(bfqd); -+ BUG_ON(bfqd->bfq_max_budget < 0); -+ bfq_log(bfqd, "new max_budget = %d", -+ bfqd->bfq_max_budget); -+ } -+} -+ -+static void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq) -+{ -+ if (rq != NULL) { /* new rq dispatch now, reset accordingly */ -+ bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns() ; -+ bfqd->peak_rate_samples = 1; -+ bfqd->sequential_samples = 0; -+ bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size = -+ blk_rq_sectors(rq); -+ } else /* no new rq dispatched, just reset the number of samples */ -+ bfqd->peak_rate_samples = 0; /* full re-init on next disp. */ -+ -+ bfq_log(bfqd, -+ "at end, sample %u/%u tot_sects %llu", -+ bfqd->peak_rate_samples, bfqd->sequential_samples, -+ bfqd->tot_sectors_dispatched); -+} -+ -+static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) -+{ -+ u32 rate, weight, divisor; -+ -+ /* -+ * For the convergence property to hold (see comments on -+ * bfq_update_peak_rate()) and for the assessment to be -+ * reliable, a minimum number of samples must be present, and -+ * a minimum amount of time must have elapsed. If not so, do -+ * not compute new rate. Just reset parameters, to get ready -+ * for a new evaluation attempt. -+ */ -+ if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES || -+ bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL) { -+ bfq_log(bfqd, -+ "only resetting, delta_first %lluus samples %d", -+ bfqd->delta_from_first>>10, bfqd->peak_rate_samples); -+ goto reset_computation; -+ } -+ -+ /* -+ * If a new request completion has occurred after last -+ * dispatch, then, to approximate the rate at which requests -+ * have been served by the device, it is more precise to -+ * extend the observation interval to the last completion. 
-+ */ -+ bfqd->delta_from_first = -+ max_t(u64, bfqd->delta_from_first, -+ bfqd->last_completion - bfqd->first_dispatch); -+ -+ BUG_ON(bfqd->delta_from_first == 0); -+ /* -+ * Rate computed in sects/usec, and not sects/nsec, for -+ * precision issues. -+ */ -+ rate = div64_ul(bfqd->tot_sectors_dispatched<<BFQ_RATE_SHIFT, -+ div_u64(bfqd->delta_from_first, NSEC_PER_USEC)); -+ -+ bfq_log(bfqd, -+"tot_sects %llu delta_first %lluus rate %llu sects/s (%d)", -+ bfqd->tot_sectors_dispatched, bfqd->delta_from_first>>10, -+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), -+ rate > 20<<BFQ_RATE_SHIFT); -+ -+ /* -+ * Peak rate not updated if: -+ * - the percentage of sequential dispatches is below 3/4 of the -+ * total, and rate is below the current estimated peak rate -+ * - rate is unreasonably high (> 20M sectors/sec) -+ */ -+ if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 && -+ rate <= bfqd->peak_rate) || -+ rate > 20<<BFQ_RATE_SHIFT) { -+ bfq_log(bfqd, -+ "goto reset, samples %u/%u rate/peak %llu/%llu", -+ bfqd->peak_rate_samples, bfqd->sequential_samples, -+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), -+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); -+ goto reset_computation; -+ } else { -+ bfq_log(bfqd, -+ "do update, samples %u/%u rate/peak %llu/%llu", -+ bfqd->peak_rate_samples, bfqd->sequential_samples, -+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), -+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); -+ } -+ -+ /* -+ * We have to update the peak rate, at last! To this purpose, -+ * we use a low-pass filter. We compute the smoothing constant -+ * of the filter as a function of the 'weight' of the new -+ * measured rate. -+ * -+ * As can be seen in next formulas, we define this weight as a -+ * quantity proportional to how sequential the workload is, -+ * and to how long the observation time interval is. -+ * -+ * The weight runs from 0 to 8. 
The maximum value of the -+ * weight, 8, yields the minimum value for the smoothing -+ * constant. At this minimum value for the smoothing constant, -+ * the measured rate contributes for half of the next value of -+ * the estimated peak rate. -+ * -+ * So, the first step is to compute the weight as a function -+ * of how sequential the workload is. Note that the weight -+ * cannot reach 9, because bfqd->sequential_samples cannot -+ * become equal to bfqd->peak_rate_samples, which, in its -+ * turn, holds true because bfqd->sequential_samples is not -+ * incremented for the first sample. -+ */ -+ weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples; -+ -+ /* -+ * Second step: further refine the weight as a function of the -+ * duration of the observation interval. -+ */ -+ weight = min_t(u32, 8, -+ div_u64(weight * bfqd->delta_from_first, -+ BFQ_RATE_REF_INTERVAL)); -+ -+ /* -+ * Divisor ranging from 10, for minimum weight, to 2, for -+ * maximum weight. -+ */ -+ divisor = 10 - weight; -+ BUG_ON(divisor == 0); -+ -+ /* -+ * Finally, update peak rate: -+ * -+ * peak_rate = peak_rate * (divisor-1) / divisor + rate / divisor -+ */ -+ bfqd->peak_rate *= divisor-1; -+ bfqd->peak_rate /= divisor; -+ rate /= divisor; /* smoothing constant alpha = 1/divisor */ -+ -+ bfq_log(bfqd, -+ "divisor %d tmp_peak_rate %llu tmp_rate %u", -+ divisor, -+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT), -+ (u32)((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT)); -+ -+ BUG_ON(bfqd->peak_rate == 0); -+ BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT); -+ -+ bfqd->peak_rate += rate; -+ -+ /* -+ * For a very slow device, bfqd->peak_rate can reach 0 (see -+ * the minimum representable values reported in the comments -+ * on BFQ_RATE_SHIFT). Push to 1 if this happens, to avoid -+ * divisions by zero where bfqd->peak_rate is used as a -+ * divisor. 
-+ */ -+ bfqd->peak_rate = max_t(u32, 1, bfqd->peak_rate); -+ -+ update_thr_responsiveness_params(bfqd); -+ BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT); -+ -+reset_computation: -+ bfq_reset_rate_computation(bfqd, rq); -+} -+ -+/* -+ * Update the read/write peak rate (the main quantity used for -+ * auto-tuning, see update_thr_responsiveness_params()). -+ * -+ * It is not trivial to estimate the peak rate (correctly): because of -+ * the presence of sw and hw queues between the scheduler and the -+ * device components that finally serve I/O requests, it is hard to -+ * say exactly when a given dispatched request is served inside the -+ * device, and for how long. As a consequence, it is hard to know -+ * precisely at what rate a given set of requests is actually served -+ * by the device. -+ * -+ * On the opposite end, the dispatch time of any request is trivially -+ * available, and, from this piece of information, the "dispatch rate" -+ * of requests can be immediately computed. So, the idea in the next -+ * function is to use what is known, namely request dispatch times -+ * (plus, when useful, request completion times), to estimate what is -+ * unknown, namely in-device request service rate. -+ * -+ * The main issue is that, because of the above facts, the rate at -+ * which a certain set of requests is dispatched over a certain time -+ * interval can vary greatly with respect to the rate at which the -+ * same requests are then served. But, since the size of any -+ * intermediate queue is limited, and the service scheme is lossless -+ * (no request is silently dropped), the following obvious convergence -+ * property holds: the number of requests dispatched MUST become -+ * closer and closer to the number of requests completed as the -+ * observation interval grows. This is the key property used in -+ * the next function to estimate the peak service rate as a function -+ * of the observed dispatch rate. 
The function assumes to be invoked -+ * on every request dispatch. -+ */ -+static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) -+{ -+ u64 now_ns = ktime_get_ns(); -+ -+ if (bfqd->peak_rate_samples == 0) { /* first dispatch */ -+ bfq_log(bfqd, -+ "goto reset, samples %d", -+ bfqd->peak_rate_samples) ; -+ bfq_reset_rate_computation(bfqd, rq); -+ goto update_last_values; /* will add one sample */ -+ } -+ -+ /* -+ * Device idle for very long: the observation interval lasting -+ * up to this dispatch cannot be a valid observation interval -+ * for computing a new peak rate (similarly to the late- -+ * completion event in bfq_completed_request()). Go to -+ * update_rate_and_reset to have the following three steps -+ * taken: -+ * - close the observation interval at the last (previous) -+ * request dispatch or completion -+ * - compute rate, if possible, for that observation interval -+ * - start a new observation interval with this dispatch -+ */ -+ if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC && -+ bfqd->rq_in_driver == 0) { -+ bfq_log(bfqd, -+"jumping to updating&resetting delta_last %lluus samples %d", -+ (now_ns - bfqd->last_dispatch)>>10, -+ bfqd->peak_rate_samples) ; -+ goto update_rate_and_reset; -+ } -+ -+ /* Update sampling information */ -+ bfqd->peak_rate_samples++; -+ -+ if ((bfqd->rq_in_driver > 0 || -+ now_ns - bfqd->last_completion < BFQ_MIN_TT) -+ && !BFQ_RQ_SEEKY(bfqd, bfqd->last_position, rq)) -+ bfqd->sequential_samples++; -+ -+ bfqd->tot_sectors_dispatched += blk_rq_sectors(rq); -+ -+ /* Reset max observed rq size every 32 dispatches */ -+ if (likely(bfqd->peak_rate_samples % 32)) -+ bfqd->last_rq_max_size = -+ max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size); -+ else -+ bfqd->last_rq_max_size = blk_rq_sectors(rq); -+ -+ bfqd->delta_from_first = now_ns - bfqd->first_dispatch; -+ -+ bfq_log(bfqd, -+ "added samples %u/%u tot_sects %llu delta_first %lluus", -+ bfqd->peak_rate_samples, bfqd->sequential_samples, -+ 
bfqd->tot_sectors_dispatched, -+ bfqd->delta_from_first>>10); -+ -+ /* Target observation interval not yet reached, go on sampling */ -+ if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL) -+ goto update_last_values; -+ -+update_rate_and_reset: -+ bfq_update_rate_reset(bfqd, rq); -+update_last_values: -+ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); -+ if (RQ_BFQQ(rq) == bfqd->in_service_queue) -+ bfqd->in_serv_last_pos = bfqd->last_position; -+ bfqd->last_dispatch = now_ns; -+ -+ bfq_log(bfqd, -+ "delta_first %lluus last_pos %llu peak_rate %llu", -+ (now_ns - bfqd->first_dispatch)>>10, -+ (unsigned long long) bfqd->last_position, -+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); -+ bfq_log(bfqd, -+ "samples at end %d", bfqd->peak_rate_samples); -+} -+ -+/* -+ * Move request from internal lists to the dispatch list of the request queue -+ */ -+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ -+ /* -+ * For consistency, the next instruction should have been executed -+ * after removing the request from the queue and dispatching it. -+ * We execute instead this instruction before bfq_remove_request() -+ * (and hence introduce a temporary inconsistency), for efficiency. -+ * In fact, in a forced_dispatch, this prevents two counters related -+ * to bfqq->dispatched to risk to be uselessly decremented if bfqq -+ * is not in service, and then to be incremented again after -+ * incrementing bfqq->dispatched. -+ */ -+ bfqq->dispatched++; -+ bfq_update_peak_rate(q->elevator->elevator_data, rq); -+ -+ bfq_remove_request(rq); -+ elv_dispatch_sort(q, rq); -+} -+ -+static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ BUG_ON(bfqq != bfqd->in_service_queue); -+ -+ /* -+ * If this bfqq is shared between multiple processes, check -+ * to make sure that those processes are still issuing I/Os -+ * within the mean seek distance. 
If not, it may be time to -+ * break the queues apart again. -+ */ -+ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq)) -+ bfq_mark_bfqq_split_coop(bfqq); -+ -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { -+ if (bfqq->dispatched == 0) -+ /* -+ * Overloading budget_timeout field to store -+ * the time at which the queue remains with no -+ * backlog and no outstanding request; used by -+ * the weight-raising mechanism. -+ */ -+ bfqq->budget_timeout = jiffies; -+ -+ bfq_del_bfqq_busy(bfqd, bfqq, true); -+ } else { -+ bfq_requeue_bfqq(bfqd, bfqq, true); -+ /* -+ * Resort priority tree of potential close cooperators. -+ */ -+ bfq_pos_tree_add_move(bfqd, bfqq); -+ } -+ -+ /* -+ * All in-service entities must have been properly deactivated -+ * or requeued before executing the next function, which -+ * resets all in-service entites as no more in service. -+ */ -+ __bfq_bfqd_reset_in_service(bfqd); -+} -+ -+/** -+ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior. -+ * @bfqd: device data. -+ * @bfqq: queue to update. -+ * @reason: reason for expiration. -+ * -+ * Handle the feedback on @bfqq budget at queue expiration. -+ * See the body for detailed comments. -+ */ -+static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ enum bfqq_expiration reason) -+{ -+ struct request *next_rq; -+ int budget, min_budget; -+ -+ BUG_ON(bfqq != bfqd->in_service_queue); -+ -+ min_budget = bfq_min_budget(bfqd); -+ -+ if (bfqq->wr_coeff == 1) -+ budget = bfqq->max_budget; -+ else /* -+ * Use a constant, low budget for weight-raised queues, -+ * to help achieve a low latency. Keep it slightly higher -+ * than the minimum possible budget, to cause a little -+ * bit fewer expirations. 
-+ */ -+ budget = 2 * min_budget; -+ -+ bfq_log_bfqq(bfqd, bfqq, "last budg %d, budg left %d", -+ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); -+ bfq_log_bfqq(bfqd, bfqq, "last max_budg %d, min budg %d", -+ budget, bfq_min_budget(bfqd)); -+ bfq_log_bfqq(bfqd, bfqq, "sync %d, seeky %d", -+ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); -+ -+ if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) { -+ switch (reason) { -+ /* -+ * Caveat: in all the following cases we trade latency -+ * for throughput. -+ */ -+ case BFQ_BFQQ_TOO_IDLE: -+ /* -+ * This is the only case where we may reduce -+ * the budget: if there is no request of the -+ * process still waiting for completion, then -+ * we assume (tentatively) that the timer has -+ * expired because the batch of requests of -+ * the process could have been served with a -+ * smaller budget. Hence, betting that -+ * process will behave in the same way when it -+ * becomes backlogged again, we reduce its -+ * next budget. As long as we guess right, -+ * this budget cut reduces the latency -+ * experienced by the process. -+ * -+ * However, if there are still outstanding -+ * requests, then the process may have not yet -+ * issued its next request just because it is -+ * still waiting for the completion of some of -+ * the still outstanding ones. So in this -+ * subcase we do not reduce its budget, on the -+ * contrary we increase it to possibly boost -+ * the throughput, as discussed in the -+ * comments to the BUDGET_TIMEOUT case. -+ */ -+ if (bfqq->dispatched > 0) /* still outstanding reqs */ -+ budget = min(budget * 2, bfqd->bfq_max_budget); -+ else { -+ if (budget > 5 * min_budget) -+ budget -= 4 * min_budget; -+ else -+ budget = min_budget; -+ } -+ break; -+ case BFQ_BFQQ_BUDGET_TIMEOUT: -+ /* -+ * We double the budget here because it gives -+ * the chance to boost the throughput if this -+ * is not a seeky process (and has bumped into -+ * this timeout because of, e.g., ZBR). 
-+ */ -+ budget = min(budget * 2, bfqd->bfq_max_budget); -+ break; -+ case BFQ_BFQQ_BUDGET_EXHAUSTED: -+ /* -+ * The process still has backlog, and did not -+ * let either the budget timeout or the disk -+ * idling timeout expire. Hence it is not -+ * seeky, has a short thinktime and may be -+ * happy with a higher budget too. So -+ * definitely increase the budget of this good -+ * candidate to boost the disk throughput. -+ */ -+ budget = min(budget * 4, bfqd->bfq_max_budget); -+ break; -+ case BFQ_BFQQ_NO_MORE_REQUESTS: -+ /* -+ * For queues that expire for this reason, it -+ * is particularly important to keep the -+ * budget close to the actual service they -+ * need. Doing so reduces the timestamp -+ * misalignment problem described in the -+ * comments in the body of -+ * __bfq_activate_entity. In fact, suppose -+ * that a queue systematically expires for -+ * BFQ_BFQQ_NO_MORE_REQUESTS and presents a -+ * new request in time to enjoy timestamp -+ * back-shifting. The larger the budget of the -+ * queue is with respect to the service the -+ * queue actually requests in each service -+ * slot, the more times the queue can be -+ * reactivated with the same virtual finish -+ * time. It follows that, even if this finish -+ * time is pushed to the system virtual time -+ * to reduce the consequent timestamp -+ * misalignment, the queue unjustly enjoys for -+ * many re-activations a lower finish time -+ * than all newly activated queues. -+ * -+ * The service needed by bfqq is measured -+ * quite precisely by bfqq->entity.service. -+ * Since bfqq does not enjoy device idling, -+ * bfqq->entity.service is equal to the number -+ * of sectors that the process associated with -+ * bfqq requested to read/write before waiting -+ * for request completions, or blocking for -+ * other reasons. 
-+ */ -+ budget = max_t(int, bfqq->entity.service, min_budget); -+ break; -+ default: -+ return; -+ } -+ } else if (!bfq_bfqq_sync(bfqq)) -+ /* -+ * Async queues get always the maximum possible -+ * budget, as for them we do not care about latency -+ * (in addition, their ability to dispatch is limited -+ * by the charging factor). -+ */ -+ budget = bfqd->bfq_max_budget; -+ -+ bfqq->max_budget = budget; -+ -+ if (bfqd->budgets_assigned >= bfq_stats_min_budgets && -+ !bfqd->bfq_user_max_budget) -+ bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget); -+ -+ /* -+ * If there is still backlog, then assign a new budget, making -+ * sure that it is large enough for the next request. Since -+ * the finish time of bfqq must be kept in sync with the -+ * budget, be sure to call __bfq_bfqq_expire() *after* this -+ * update. -+ * -+ * If there is no backlog, then no need to update the budget; -+ * it will be updated on the arrival of a new request. -+ */ -+ next_rq = bfqq->next_rq; -+ if (next_rq) { -+ BUG_ON(reason == BFQ_BFQQ_TOO_IDLE || -+ reason == BFQ_BFQQ_NO_MORE_REQUESTS); -+ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, -+ bfq_serv_to_charge(next_rq, bfqq)); -+ BUG_ON(!bfq_bfqq_busy(bfqq)); -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ } -+ -+ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d", -+ next_rq ? blk_rq_sectors(next_rq) : 0, -+ bfqq->entity.budget); -+} -+ -+/* -+ * Return true if the process associated with bfqq is "slow". The slow -+ * flag is used, in addition to the budget timeout, to reduce the -+ * amount of service provided to seeky processes, and thus reduce -+ * their chances to lower the throughput. More details in the comments -+ * on the function bfq_bfqq_expire(). 
-+ * -+ * An important observation is in order: as discussed in the comments -+ * on the function bfq_update_peak_rate(), with devices with internal -+ * queues, it is hard if ever possible to know when and for how long -+ * an I/O request is processed by the device (apart from the trivial -+ * I/O pattern where a new request is dispatched only after the -+ * previous one has been completed). This makes it hard to evaluate -+ * the real rate at which the I/O requests of each bfq_queue are -+ * served. In fact, for an I/O scheduler like BFQ, serving a -+ * bfq_queue means just dispatching its requests during its service -+ * slot (i.e., until the budget of the queue is exhausted, or the -+ * queue remains idle, or, finally, a timeout fires). But, during the -+ * service slot of a bfq_queue, around 100 ms at most, the device may -+ * be even still processing requests of bfq_queues served in previous -+ * service slots. On the opposite end, the requests of the in-service -+ * bfq_queue may be completed after the service slot of the queue -+ * finishes. -+ * -+ * Anyway, unless more sophisticated solutions are used -+ * (where possible), the sum of the sizes of the requests dispatched -+ * during the service slot of a bfq_queue is probably the only -+ * approximation available for the service received by the bfq_queue -+ * during its service slot. And this sum is the quantity used in this -+ * function to evaluate the I/O speed of a process. 
-+ */ -+static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ bool compensate, enum bfqq_expiration reason, -+ unsigned long *delta_ms) -+{ -+ ktime_t delta_ktime; -+ u32 delta_usecs; -+ bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */ -+ -+ if (!bfq_bfqq_sync(bfqq)) -+ return false; -+ -+ if (compensate) -+ delta_ktime = bfqd->last_idling_start; -+ else -+ delta_ktime = ktime_get(); -+ delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start); -+ delta_usecs = ktime_to_us(delta_ktime); -+ -+ /* don't use too short time intervals */ -+ if (delta_usecs < 1000) { -+ if (blk_queue_nonrot(bfqd->queue)) -+ /* -+ * give same worst-case guarantees as idling -+ * for seeky -+ */ -+ *delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC; -+ else /* charge at least one seek */ -+ *delta_ms = bfq_slice_idle / NSEC_PER_MSEC; -+ -+ bfq_log(bfqd, "too short %u", delta_usecs); -+ -+ return slow; -+ } -+ -+ *delta_ms = delta_usecs / USEC_PER_MSEC; -+ -+ /* -+ * Use only long (> 20ms) intervals to filter out excessive -+ * spikes in service rate estimation. -+ */ -+ if (delta_usecs > 20000) { -+ /* -+ * Caveat for rotational devices: processes doing I/O -+ * in the slower disk zones tend to be slow(er) even -+ * if not seeky. In this respect, the estimated peak -+ * rate is likely to be an average over the disk -+ * surface. Accordingly, to not be too harsh with -+ * unlucky processes, a process is deemed slow only if -+ * its rate has been lower than half of the estimated -+ * peak rate. -+ */ -+ slow = bfqq->entity.service < bfqd->bfq_max_budget / 2; -+ bfq_log(bfqd, "relative rate %d/%d", -+ bfqq->entity.service, bfqd->bfq_max_budget); -+ } -+ -+ bfq_log_bfqq(bfqd, bfqq, "slow %d", slow); -+ -+ return slow; -+} -+ -+/* -+ * To be deemed as soft real-time, an application must meet two -+ * requirements. 
First, the application must not require an average -+ * bandwidth higher than the approximate bandwidth required to playback or -+ * record a compressed high-definition video. -+ * The next function is invoked on the completion of the last request of a -+ * batch, to compute the next-start time instant, soft_rt_next_start, such -+ * that, if the next request of the application does not arrive before -+ * soft_rt_next_start, then the above requirement on the bandwidth is met. -+ * -+ * The second requirement is that the request pattern of the application is -+ * isochronous, i.e., that, after issuing a request or a batch of requests, -+ * the application stops issuing new requests until all its pending requests -+ * have been completed. After that, the application may issue a new batch, -+ * and so on. -+ * For this reason the next function is invoked to compute -+ * soft_rt_next_start only for applications that meet this requirement, -+ * whereas soft_rt_next_start is set to infinity for applications that do -+ * not. -+ * -+ * Unfortunately, even a greedy (i.e., I/O-bound) application may -+ * happen to meet, occasionally or systematically, both the above -+ * bandwidth and isochrony requirements. This may happen at least in -+ * the following circumstances. First, if the CPU load is high. The -+ * application may stop issuing requests while the CPUs are busy -+ * serving other processes, then restart, then stop again for a while, -+ * and so on. The other circumstances are related to the storage -+ * device: the storage device is highly loaded or reaches a low-enough -+ * throughput with the I/O of the application (e.g., because the I/O -+ * is random and/or the device is slow). In all these cases, the -+ * I/O of the application may be simply slowed down enough to meet -+ * the bandwidth and isochrony requirements. 
To reduce the probability -+ * that greedy applications are deemed as soft real-time in these -+ * corner cases, a further rule is used in the computation of -+ * soft_rt_next_start: the return value of this function is forced to -+ * be higher than the maximum between the following two quantities. -+ * -+ * (a) Current time plus: (1) the maximum time for which the arrival -+ * of a request is waited for when a sync queue becomes idle, -+ * namely bfqd->bfq_slice_idle, and (2) a few extra jiffies. We -+ * postpone for a moment the reason for adding a few extra -+ * jiffies; we get back to it after next item (b). Lower-bounding -+ * the return value of this function with the current time plus -+ * bfqd->bfq_slice_idle tends to filter out greedy applications, -+ * because the latter issue their next request as soon as possible -+ * after the last one has been completed. In contrast, a soft -+ * real-time application spends some time processing data, after a -+ * batch of its requests has been completed. -+ * -+ * (b) Current value of bfqq->soft_rt_next_start. As pointed out -+ * above, greedy applications may happen to meet both the -+ * bandwidth and isochrony requirements under heavy CPU or -+ * storage-device load. In more detail, in these scenarios, these -+ * applications happen, only for limited time periods, to do I/O -+ * slowly enough to meet all the requirements described so far, -+ * including the filtering in above item (a). These slow-speed -+ * time intervals are usually interspersed between other time -+ * intervals during which these applications do I/O at a very high -+ * speed. Fortunately, exactly because of the high speed of the -+ * I/O in the high-speed intervals, the values returned by this -+ * function happen to be so high, near the end of any such -+ * high-speed interval, to be likely to fall *after* the end of -+ * the low-speed time interval that follows. 
These high values are -+ * stored in bfqq->soft_rt_next_start after each invocation of -+ * this function. As a consequence, if the last value of -+ * bfqq->soft_rt_next_start is constantly used to lower-bound the -+ * next value that this function may return, then, from the very -+ * beginning of a low-speed interval, bfqq->soft_rt_next_start is -+ * likely to be constantly kept so high that any I/O request -+ * issued during the low-speed interval is considered as arriving -+ * to soon for the application to be deemed as soft -+ * real-time. Then, in the high-speed interval that follows, the -+ * application will not be deemed as soft real-time, just because -+ * it will do I/O at a high speed. And so on. -+ * -+ * Getting back to the filtering in item (a), in the following two -+ * cases this filtering might be easily passed by a greedy -+ * application, if the reference quantity was just -+ * bfqd->bfq_slice_idle: -+ * 1) HZ is so low that the duration of a jiffy is comparable to or -+ * higher than bfqd->bfq_slice_idle. This happens, e.g., on slow -+ * devices with HZ=100. The time granularity may be so coarse -+ * that the approximation, in jiffies, of bfqd->bfq_slice_idle -+ * is rather lower than the exact value. -+ * 2) jiffies, instead of increasing at a constant rate, may stop increasing -+ * for a while, then suddenly 'jump' by several units to recover the lost -+ * increments. This seems to happen, e.g., inside virtual machines. -+ * To address this issue, in the filtering in (a) we do not use as a -+ * reference time interval just bfqd->bfq_slice_idle, but -+ * bfqd->bfq_slice_idle plus a few jiffies. In particular, we add the -+ * minimum number of jiffies for which the filter seems to be quite -+ * precise also in embedded systems and KVM/QEMU virtual machines. 
-+ */ -+static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ bfq_log_bfqq(bfqd, bfqq, -+"service_blkg %lu soft_rate %u sects/sec interval %u", -+ bfqq->service_from_backlogged, -+ bfqd->bfq_wr_max_softrt_rate, -+ jiffies_to_msecs(HZ * bfqq->service_from_backlogged / -+ bfqd->bfq_wr_max_softrt_rate)); -+ -+ return max3(bfqq->soft_rt_next_start, -+ bfqq->last_idle_bklogged + -+ HZ * bfqq->service_from_backlogged / -+ bfqd->bfq_wr_max_softrt_rate, -+ jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); -+} -+ -+static bool bfq_bfqq_injectable(struct bfq_queue *bfqq) -+{ -+ return BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 && -+ blk_queue_nonrot(bfqq->bfqd->queue) && -+ bfqq->bfqd->hw_tag; -+} -+ -+/** -+ * bfq_bfqq_expire - expire a queue. -+ * @bfqd: device owning the queue. -+ * @bfqq: the queue to expire. -+ * @compensate: if true, compensate for the time spent idling. -+ * @reason: the reason causing the expiration. -+ * -+ * If the process associated with bfqq does slow I/O (e.g., because it -+ * issues random requests), we charge bfqq with the time it has been -+ * in service instead of the service it has received (see -+ * bfq_bfqq_charge_time for details on how this goal is achieved). As -+ * a consequence, bfqq will typically get higher timestamps upon -+ * reactivation, and hence it will be rescheduled as if it had -+ * received more service than what it has actually received. In the -+ * end, bfqq receives less service in proportion to how slowly its -+ * associated process consumes its budgets (and hence how seriously it -+ * tends to lower the throughput). In addition, this time-charging -+ * strategy guarantees time fairness among slow processes. In -+ * contrast, if the process associated with bfqq is not slow, we -+ * charge bfqq exactly with the service it has received. 
-+ * -+ * Charging time to the first type of queues and the exact service to -+ * the other has the effect of using the WF2Q+ policy to schedule the -+ * former on a timeslice basis, without violating service domain -+ * guarantees among the latter. -+ */ -+static void bfq_bfqq_expire(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ bool compensate, -+ enum bfqq_expiration reason) -+{ -+ bool slow; -+ unsigned long delta = 0; -+ struct bfq_entity *entity = &bfqq->entity; -+ int ref; -+ -+ BUG_ON(bfqq != bfqd->in_service_queue); -+ -+ /* -+ * Check whether the process is slow (see bfq_bfqq_is_slow). -+ */ -+ slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); -+ -+ /* -+ * As above explained, charge slow (typically seeky) and -+ * timed-out queues with the time and not the service -+ * received, to favor sequential workloads. -+ * -+ * Processes doing I/O in the slower disk zones will tend to -+ * be slow(er) even if not seeky. Therefore, since the -+ * estimated peak rate is actually an average over the disk -+ * surface, these processes may timeout just for bad luck. To -+ * avoid punishing them, do not charge time to processes that -+ * succeeded in consuming at least 2/3 of their budget. This -+ * allows BFQ to preserve enough elasticity to still perform -+ * bandwidth, and not time, distribution with little unlucky -+ * or quasi-sequential processes. 
-+ */ -+ if (bfqq->wr_coeff == 1 && -+ (slow || -+ (reason == BFQ_BFQQ_BUDGET_TIMEOUT && -+ bfq_bfqq_budget_left(bfqq) >= entity->budget / 3))) -+ bfq_bfqq_charge_time(bfqd, bfqq, delta); -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ -+ if (reason == BFQ_BFQQ_TOO_IDLE && -+ entity->service <= 2 * entity->budget / 10) -+ bfq_clear_bfqq_IO_bound(bfqq); -+ -+ if (bfqd->low_latency && bfqq->wr_coeff == 1) -+ bfqq->last_wr_start_finish = jiffies; -+ -+ if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 && -+ RB_EMPTY_ROOT(&bfqq->sort_list)) { -+ /* -+ * If we get here, and there are no outstanding -+ * requests, then the request pattern is isochronous -+ * (see the comments on the function -+ * bfq_bfqq_softrt_next_start()). Thus we can compute -+ * soft_rt_next_start. And we do it, unless bfqq is in -+ * interactive weight raising. We do not do it in the -+ * latter subcase, for the following reason. bfqq may -+ * be conveying the I/O needed to load a soft -+ * real-time application. Such an application will -+ * actually exhibit a soft real-time I/O pattern after -+ * it finally starts doing its job. But, if -+ * soft_rt_next_start is computed here for an -+ * interactive bfqq, and bfqq had received a lot of -+ * service before remaining with no outstanding -+ * request (likely to happen on a fast device), then -+ * soft_rt_next_start would be assigned such a high -+ * value that, for a very long time, bfqq would be -+ * prevented from being possibly considered as soft -+ * real time. -+ * -+ * If, instead, the queue still has outstanding -+ * requests, then we have to wait for the completion -+ * of all the outstanding requests to discover whether -+ * the request pattern is actually isochronous. 
-+ */ -+ BUG_ON(bfq_tot_busy_queues(bfqd) < 1); -+ if (bfqq->dispatched == 0 && -+ bfqq->wr_coeff != bfqd->bfq_wr_coeff) { -+ bfqq->soft_rt_next_start = -+ bfq_bfqq_softrt_next_start(bfqd, bfqq); -+ bfq_log_bfqq(bfqd, bfqq, "new soft_rt_next %lu", -+ bfqq->soft_rt_next_start); -+ } else if (bfqq->dispatched > 0) { -+ /* -+ * Schedule an update of soft_rt_next_start to when -+ * the task may be discovered to be isochronous. -+ */ -+ bfq_mark_bfqq_softrt_update(bfqq); -+ } -+ } -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "expire (%s, slow %d, num_disp %d, short %d, weight %d, serv %d/%d)", -+ reason_name[reason], slow, bfqq->dispatched, -+ bfq_bfqq_has_short_ttime(bfqq), entity->weight, -+ entity->service, entity->budget); -+ -+ /* -+ * Increase, decrease or leave budget unchanged according to -+ * reason. -+ */ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); -+ BUG_ON(bfqq->next_rq == NULL && -+ bfqq->entity.budget < bfqq->entity.service); -+ ref = bfqq->ref; -+ __bfq_bfqq_expire(bfqd, bfqq); -+ -+ if (ref == 1) /* bfqq is gone, no more actions on it */ -+ return; -+ -+ BUG_ON(ref > 1 && -+ !bfq_bfqq_busy(bfqq) && reason == BFQ_BFQQ_BUDGET_EXHAUSTED && -+ !bfq_class_idle(bfqq)); -+ -+ bfqq->injected_service = 0; -+ -+ /* mark bfqq as waiting a request only if a bic still points to it */ -+ if (!bfq_bfqq_busy(bfqq) && -+ reason != BFQ_BFQQ_BUDGET_TIMEOUT && -+ reason != BFQ_BFQQ_BUDGET_EXHAUSTED) { -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); -+ BUG_ON(bfqq->next_rq); -+ bfq_mark_bfqq_non_blocking_wait_rq(bfqq); -+ /* -+ * Not setting service to 0, because, if the next rq -+ * arrives in time, the queue will go on receiving -+ * service with this same budget (as if it never expired) -+ */ -+ } else { -+ entity->service = 0; -+ bfq_log_bfqq(bfqd, bfqq, "resetting service"); -+ } -+ -+ /* -+ * Reset the received-service counter for every parent entity. 
-+ * Differently from what happens with bfqq->entity.service, -+ * the resetting of this counter never needs to be postponed -+ * for parent entities. In fact, in case bfqq may have a -+ * chance to go on being served using the last, partially -+ * consumed budget, bfqq->entity.service needs to be kept, -+ * because if bfqq then actually goes on being served using -+ * the same budget, the last value of bfqq->entity.service is -+ * needed to properly decrement bfqq->entity.budget by the -+ * portion already consumed. In contrast, it is not necessary -+ * to keep entity->service for parent entities too, because -+ * the bubble up of the new value of bfqq->entity.budget will -+ * make sure that the budgets of parent entities are correct, -+ * even in case bfqq and thus parent entities go on receiving -+ * service with the same budget. -+ */ -+ entity = entity->parent; -+ for_each_entity(entity) -+ entity->service = 0; -+} -+ -+/* -+ * Budget timeout is not implemented through a dedicated timer, but -+ * just checked on request arrivals and completions, as well as on -+ * idle timer expirations. -+ */ -+static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) -+{ -+ return time_is_before_eq_jiffies(bfqq->budget_timeout); -+} -+ -+/* -+ * If we expire a queue that is actively waiting (i.e., with the -+ * device idled) for the arrival of a new request, then we may incur -+ * the timestamp misalignment problem described in the body of the -+ * function __bfq_activate_entity. Hence we return true only if this -+ * condition does not hold, or if the queue is slow enough to deserve -+ * only to be kicked off for preserving a high throughput. 
-+ */ -+static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) -+{ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "wait_request %d left %d timeout %d", -+ bfq_bfqq_wait_request(bfqq), -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, -+ bfq_bfqq_budget_timeout(bfqq)); -+ -+ return (!bfq_bfqq_wait_request(bfqq) || -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3) -+ && -+ bfq_bfqq_budget_timeout(bfqq); -+} -+ -+static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ bool rot_without_queueing = -+ !blk_queue_nonrot(bfqd->queue) && !bfqd->hw_tag, -+ bfqq_sequential_and_IO_bound, -+ idling_boosts_thr; -+ -+ bfqq_sequential_and_IO_bound = !BFQQ_SEEKY(bfqq) && -+ bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_has_short_ttime(bfqq); -+ /* -+ * The next variable takes into account the cases where idling -+ * boosts the throughput. -+ * -+ * The value of the variable is computed considering, first, that -+ * idling is virtually always beneficial for the throughput if: -+ * (a) the device is not NCQ-capable and rotational, or -+ * (b) regardless of the presence of NCQ, the device is rotational and -+ * the request pattern for bfqq is I/O-bound and sequential, or -+ * (c) regardless of whether it is rotational, the device is -+ * not NCQ-capable and the request pattern for bfqq is -+ * I/O-bound and sequential. -+ * -+ * Secondly, and in contrast to the above item (b), idling an -+ * NCQ-capable flash-based device would not boost the -+ * throughput even with sequential I/O; rather it would lower -+ * the throughput in proportion to how fast the device -+ * is. Accordingly, the next variable is true if any of the -+ * above conditions (a), (b) or (c) is true, and, in -+ * particular, happens to be false if bfqd is an NCQ-capable -+ * flash-based device. 
-+ */ -+ idling_boosts_thr = rot_without_queueing || -+ ((!blk_queue_nonrot(bfqd->queue) || !bfqd->hw_tag) && -+ bfqq_sequential_and_IO_bound); -+ -+ bfq_log_bfqq(bfqd, bfqq, "idling_boosts_thr %d", idling_boosts_thr); -+ -+ /* -+ * The return value of this function is equal to that of -+ * idling_boosts_thr, unless a special case holds. In this -+ * special case, described below, idling may cause problems to -+ * weight-raised queues. -+ * -+ * When the request pool is saturated (e.g., in the presence -+ * of write hogs), if the processes associated with -+ * non-weight-raised queues ask for requests at a lower rate, -+ * then processes associated with weight-raised queues have a -+ * higher probability to get a request from the pool -+ * immediately (or at least soon) when they need one. Thus -+ * they have a higher probability to actually get a fraction -+ * of the device throughput proportional to their high -+ * weight. This is especially true with NCQ-capable drives, -+ * which enqueue several requests in advance, and further -+ * reorder internally-queued requests. -+ * -+ * For this reason, we force to false the return value if -+ * there are weight-raised busy queues. In this case, and if -+ * bfqq is not weight-raised, this guarantees that the device -+ * is not idled for bfqq (if, instead, bfqq is weight-raised, -+ * then idling will be guaranteed by another variable, see -+ * below). Combined with the timestamping rules of BFQ (see -+ * [1] for details), this behavior causes bfqq, and hence any -+ * sync non-weight-raised queue, to get a lower number of -+ * requests served, and thus to ask for a lower number of -+ * requests from the request pool, before the busy -+ * weight-raised queues get served again. This often mitigates -+ * starvation problems in the presence of heavy write -+ * workloads and NCQ, thereby guaranteeing a higher -+ * application and system responsiveness in these hostile -+ * scenarios. 
-+ */ -+ return idling_boosts_thr && -+ bfqd->wr_busy_queues == 0; -+} -+ -+/* -+ * There is a case where idling must be performed not for -+ * throughput concerns, but to preserve service guarantees. -+ * -+ * To introduce this case, we can note that allowing the drive -+ * to enqueue more than one request at a time, and hence -+ * delegating de facto final scheduling decisions to the -+ * drive's internal scheduler, entails loss of control on the -+ * actual request service order. In particular, the critical -+ * situation is when requests from different processes happen -+ * to be present, at the same time, in the internal queue(s) -+ * of the drive. In such a situation, the drive, by deciding -+ * the service order of the internally-queued requests, does -+ * determine also the actual throughput distribution among -+ * these processes. But the drive typically has no notion or -+ * concern about per-process throughput distribution, and -+ * makes its decisions only on a per-request basis. Therefore, -+ * the service distribution enforced by the drive's internal -+ * scheduler is likely to coincide with the desired -+ * device-throughput distribution only in a completely -+ * symmetric scenario where: -+ * (i) each of these processes must get the same throughput as -+ * the others; -+ * (ii) the I/O of each process has the same properties, in -+ * terms of locality (sequential or random), direction -+ * (reads or writes), request sizes, greediness -+ * (from I/O-bound to sporadic), and so on. -+ * In fact, in such a scenario, the drive tends to treat -+ * the requests of each of these processes in about the same -+ * way as the requests of the others, and thus to provide -+ * each of these processes with about the same throughput -+ * (which is exactly the desired throughput distribution). 
In -+ * contrast, in any asymmetric scenario, device idling is -+ * certainly needed to guarantee that bfqq receives its -+ * assigned fraction of the device throughput (see [1] for -+ * details). -+ * The problem is that idling may significantly reduce -+ * throughput with certain combinations of types of I/O and -+ * devices. An important example is sync random I/O, on flash -+ * storage with command queueing. So, unless bfqq falls in the -+ * above cases where idling also boosts throughput, it would -+ * be important to check conditions (i) and (ii) accurately, -+ * so as to avoid idling when not strictly needed for service -+ * guarantees. -+ * -+ * Unfortunately, it is extremely difficult to thoroughly -+ * check condition (ii). And, in case there are active groups, -+ * it becomes very difficult to check condition (i) too. In -+ * fact, if there are active groups, then, for condition (i) -+ * to become false, it is enough that an active group contains -+ * more active processes or sub-groups than some other active -+ * group. More precisely, for condition (i) to become false because of -+ * such a group, it is not even necessary that the group is -+ * (still) active: it is sufficient that, even if the group -+ * has become inactive, some of its descendant processes still -+ * have some request already dispatched but still waiting for -+ * completion. In fact, requests have still to be guaranteed -+ * their share of the throughput even after being -+ * dispatched. In this respect, it is easy to show that, if a -+ * group frequently becomes inactive while still having -+ * in-flight requests, and if, when this happens, the group is -+ * not considered in the calculation of whether the scenario -+ * is asymmetric, then the group may fail to be guaranteed its -+ * fair share of the throughput (basically because idling may -+ * not be performed for the descendant processes of the group, -+ * but it had to be). 
We address this issue with the -+ * following bi-modal behavior, implemented in the function -+ * bfq_symmetric_scenario(). -+ * -+ * If there are groups with requests waiting for completion -+ * (as commented above, some of these groups may even be -+ * already inactive), then the scenario is tagged as -+ * asymmetric, conservatively, without checking any of the -+ * conditions (i) and (ii). So the device is idled for bfqq. -+ * This behavior matches also the fact that groups are created -+ * exactly if controlling I/O is a primary concern (to -+ * preserve bandwidth and latency guarantees). -+ * -+ * On the opposite end, if there are no groups with requests -+ * waiting for completion, then only condition (i) is actually -+ * controlled, i.e., provided that condition (i) holds, idling -+ * is not performed, regardless of whether condition (ii) -+ * holds. In other words, only if condition (i) does not hold, -+ * then idling is allowed, and the device tends to be -+ * prevented from queueing many requests, possibly of several -+ * processes. Since there are no groups with requests waiting -+ * for completion, then, to control condition (i) it is enough -+ * to check just whether all the queues with requests waiting -+ * for completion also have the same weight. -+ * -+ * Not checking condition (ii) evidently exposes bfqq to the -+ * risk of getting less throughput than its fair share. -+ * However, for queues with the same weight, a further -+ * mechanism, preemption, mitigates or even eliminates this -+ * problem. And it does so without consequences on overall -+ * throughput. This mechanism and its benefits are explained -+ * in the next three paragraphs. -+ * -+ * Even if a queue, say Q, is expired when it remains idle, Q -+ * can still preempt the new in-service queue if the next -+ * request of Q arrives soon (see the comments on -+ * bfq_bfqq_update_budg_for_activation). 
If all queues and -+ * groups have the same weight, this form of preemption, -+ * combined with the hole-recovery heuristic described in the -+ * comments on function bfq_bfqq_update_budg_for_activation, -+ * is enough to preserve a correct bandwidth distribution in -+ * the mid term, even without idling. In fact, even if not -+ * idling allows the internal queues of the device to contain -+ * many requests, and thus to reorder requests, we can rather -+ * safely assume that the internal scheduler still preserves a -+ * minimum of mid-term fairness. -+ * -+ * More precisely, this preemption-based, idleless approach -+ * provides fairness in terms of IOPS, and not sectors per -+ * second. This can be seen with a simple example. Suppose -+ * that there are two queues with the same weight, but that -+ * the first queue receives requests of 8 sectors, while the -+ * second queue receives requests of 1024 sectors. In -+ * addition, suppose that each of the two queues contains at -+ * most one request at a time, which implies that each queue -+ * always remains idle after it is served. Finally, after -+ * remaining idle, each queue receives very quickly a new -+ * request. It follows that the two queues are served -+ * alternately, preempting each other if needed. This -+ * implies that, although both queues have the same weight, -+ * the queue with large requests receives a service that is -+ * 1024/8 times as high as the service received by the other -+ * queue. -+ * -+ * The motivation for using preemption instead of idling (for -+ * queues with the same weight) is that, by not idling, -+ * service guarantees are preserved (completely or at least in -+ * part) without sacrificing throughput at all. And, if -+ * there is no active group, then the primary expectation for -+ * this device is probably a high throughput. 
-+ * -+ * We are now left only with explaining the additional -+ * compound condition that is checked below for deciding -+ * whether the scenario is asymmetric. To explain this -+ * compound condition, we need to add that the function -+ * bfq_symmetric_scenario checks the weights of only -+ * non-weight-raised queues, for efficiency reasons (see -+ * comments on bfq_weights_tree_add()). Then the fact that -+ * bfqq is weight-raised is checked explicitly here. More -+ * precisely, the compound condition below takes into account -+ * also the fact that, even if bfqq is being weight-raised, -+ * the scenario is still symmetric if all queues with requests -+ * waiting for completion happen to be -+ * weight-raised. Actually, we should be even more precise -+ * here, and differentiate between interactive weight raising -+ * and soft real-time weight raising. -+ * -+ * As a side note, it is worth considering that the above -+ * device-idling countermeasures may however fail in the -+ * following unlucky scenario: if idling is (correctly) -+ * disabled in a time period during which all symmetry -+ * sub-conditions hold, and hence the device is allowed to -+ * enqueue many requests, but at some later point in time some -+ * sub-condition ceases to hold, then it may become impossible -+ * to let requests be served in the desired order until all -+ * the requests already queued in the device have been served. 
-+ */ -+static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ bool asymmetric_scenario = (bfqq->wr_coeff > 1 && -+ bfqd->wr_busy_queues < -+ bfq_tot_busy_queues(bfqd)) || -+ !bfq_symmetric_scenario(bfqd); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "wr_coeff %d wr_busy %d busy %d asymmetric %d", -+ bfqq->wr_coeff, -+ bfqd->wr_busy_queues, -+ bfq_tot_busy_queues(bfqd), -+ asymmetric_scenario); -+ -+ return asymmetric_scenario; -+} -+ -+/* -+ * For a queue that becomes empty, device idling is allowed only if -+ * this function returns true for that queue. As a consequence, since -+ * device idling plays a critical role for both throughput boosting -+ * and service guarantees, the return value of this function plays a -+ * critical role as well. -+ * -+ * In a nutshell, this function returns true only if idling is -+ * beneficial for throughput or, even if detrimental for throughput, -+ * idling is however necessary to preserve service guarantees (low -+ * latency, desired throughput distribution, ...). In particular, on -+ * NCQ-capable devices, this function tries to return false, so as to -+ * help keep the drives' internal queues full, whenever this helps the -+ * device boost the throughput without causing any service-guarantee -+ * issue. -+ * -+ * Most of the issues taken into account to get the return value of -+ * this function are not trivial. We discuss these issues in the two -+ * functions providing the main pieces of information needed by this -+ * function. -+ */ -+static bool bfq_better_to_idle(struct bfq_queue *bfqq) -+{ -+ struct bfq_data *bfqd = bfqq->bfqd; -+ bool idling_boosts_thr_with_no_issue, idling_needed_for_service_guar; -+ -+ if (unlikely(bfqd->strict_guarantees)) -+ return true; -+ -+ /* -+ * Idling is performed only if slice_idle > 0. 
In addition, we -+ * do not idle if -+ * (a) bfqq is async -+ * (b) bfqq is in the idle io prio class: in this case we do -+ * not idle because we want to minimize the bandwidth that -+ * queues in this class can steal to higher-priority queues -+ */ -+ if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) || -+ bfq_class_idle(bfqq)) -+ return false; -+ -+ idling_boosts_thr_with_no_issue = -+ idling_boosts_thr_without_issues(bfqd, bfqq); -+ -+ idling_needed_for_service_guar = -+ idling_needed_for_service_guarantees(bfqd, bfqq); -+ -+ /* -+ * We have now the two components we need to compute the -+ * return value of the function, which is true only if idling -+ * either boosts the throughput (without issues), or is -+ * necessary to preserve service guarantees. -+ */ -+ bfq_log_bfqq(bfqd, bfqq, -+ "wr_busy %d boosts %d IO-bound %d guar %d", -+ bfqd->wr_busy_queues, -+ idling_boosts_thr_with_no_issue, -+ bfq_bfqq_IO_bound(bfqq), -+ idling_needed_for_service_guar); -+ -+ return idling_boosts_thr_with_no_issue || -+ idling_needed_for_service_guar; -+} -+ -+/* -+ * If the in-service queue is empty but the function bfq_better_to_idle -+ * returns true, then: -+ * 1) the queue must remain in service and cannot be expired, and -+ * 2) the device must be idled to wait for the possible arrival of a new -+ * request for the queue. -+ * See the comments on the function bfq_better_to_idle for the reasons -+ * why performing device idling is the best choice to boost the throughput -+ * and preserve service guarantees when bfq_better_to_idle itself -+ * returns true. 
-+ */ -+static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) -+{ -+ return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_better_to_idle(bfqq); -+} -+ -+static struct bfq_queue *bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq; -+ -+ /* -+ * A linear search; but, with a high probability, very few -+ * steps are needed to find a candidate queue, i.e., a queue -+ * with enough budget left for its next request. In fact: -+ * - BFQ dynamically updates the budget of every queue so as -+ * to accomodate the expected backlog of the queue; -+ * - if a queue gets all its requests dispatched as injected -+ * service, then the queue is removed from the active list -+ * (and re-added only if it gets new requests, but with -+ * enough budget for its new backlog). -+ */ -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) -+ if (!RB_EMPTY_ROOT(&bfqq->sort_list) && -+ bfq_serv_to_charge(bfqq->next_rq, bfqq) <= -+ bfq_bfqq_budget_left(bfqq)) { -+ bfq_log_bfqq(bfqd, bfqq, "returned this queue"); -+ return bfqq; -+ } -+ -+ bfq_log(bfqd, "no queue found"); -+ return NULL; -+} -+ -+/* -+ * Select a queue for service. If we have a current queue in service, -+ * check whether to continue servicing it, or retrieve and set a new one. -+ */ -+static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq; -+ struct request *next_rq; -+ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT; -+ -+ bfqq = bfqd->in_service_queue; -+ if (!bfqq) -+ goto new_queue; -+ -+ bfq_log_bfqq(bfqd, bfqq, "already in-service queue"); -+ -+ /* -+ * Do not expire bfqq for budget timeout if bfqq may be about -+ * to enjoy device idling. The reason why, in this case, we -+ * prevent bfqq from expiring is the same as in the comments -+ * on the case where bfq_bfqq_must_idle() returns true, in -+ * bfq_completed_request(). 
-+ */ -+ if (bfq_may_expire_for_budg_timeout(bfqq) && -+ !bfq_bfqq_must_idle(bfqq)) -+ goto expire; -+ -+check_queue: -+ /* -+ * This loop is rarely executed more than once. Even when it -+ * happens, it is much more convenient to re-execute this loop -+ * than to return NULL and trigger a new dispatch to get a -+ * request served. -+ */ -+ next_rq = bfqq->next_rq; -+ /* -+ * If bfqq has requests queued and it has enough budget left to -+ * serve them, keep the queue, otherwise expire it. -+ */ -+ if (next_rq) { -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ -+ if (bfq_serv_to_charge(next_rq, bfqq) > -+ bfq_bfqq_budget_left(bfqq)) { -+ /* -+ * Expire the queue for budget exhaustion, -+ * which makes sure that the next budget is -+ * enough to serve the next request, even if -+ * it comes from the fifo expired path. -+ */ -+ reason = BFQ_BFQQ_BUDGET_EXHAUSTED; -+ goto expire; -+ } else { -+ /* -+ * The idle timer may be pending because we may -+ * not disable disk idling even when a new request -+ * arrives. -+ */ -+ if (bfq_bfqq_wait_request(bfqq)) { -+ BUG_ON(!hrtimer_active(&bfqd->idle_slice_timer)); -+ /* -+ * If we get here: 1) at least a new request -+ * has arrived but we have not disabled the -+ * timer because the request was too small, -+ * 2) then the block layer has unplugged -+ * the device, causing the dispatch to be -+ * invoked. -+ * -+ * Since the device is unplugged, now the -+ * requests are probably large enough to -+ * provide a reasonable throughput. -+ * So we disable idling. -+ */ -+ bfq_clear_bfqq_wait_request(bfqq); -+ hrtimer_try_to_cancel(&bfqd->idle_slice_timer); -+ bfqg_stats_update_idle_time(bfqq_group(bfqq)); -+ } -+ goto keep_queue; -+ } -+ } -+ -+ /* -+ * No requests pending. However, if the in-service queue is idling -+ * for a new request, or has requests waiting for a completion and -+ * may idle after their completion, then keep it anyway. -+ * -+ * Yet, to boost throughput, inject service from other queues if -+ * possible. 
-+ */ -+ if (hrtimer_active(&bfqd->idle_slice_timer) || -+ (bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) { -+ if (bfq_bfqq_injectable(bfqq) && -+ bfqq->injected_service * bfqq->inject_coeff < -+ bfqq->entity.service * 10) { -+ bfq_log_bfqq(bfqd, bfqq, "looking for queue for injection"); -+ bfqq = bfq_choose_bfqq_for_injection(bfqd); -+ } else { -+ if (BFQQ_SEEKY(bfqq)) -+ bfq_log_bfqq(bfqd, bfqq, -+ "injection saturated %d * %d >= %d * 10", -+ bfqq->injected_service, bfqq->inject_coeff, -+ bfqq->entity.service); -+ bfqq = NULL; -+ } -+ goto keep_queue; -+ } -+ -+ reason = BFQ_BFQQ_NO_MORE_REQUESTS; -+expire: -+ bfq_bfqq_expire(bfqd, bfqq, false, reason); -+new_queue: -+ bfqq = bfq_set_in_service_queue(bfqd); -+ if (bfqq) { -+ bfq_log_bfqq(bfqd, bfqq, "checking new queue"); -+ goto check_queue; -+ } -+keep_queue: -+ if (bfqq) -+ bfq_log_bfqq(bfqd, bfqq, "returned this queue"); -+ else -+ bfq_log(bfqd, "no queue returned"); -+ -+ return bfqq; -+} -+ -+static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */ -+ BUG_ON(bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && -+ time_is_after_jiffies(bfqq->last_wr_start_finish)); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "raising period dur %u/%u msec, old coeff %u, w %d(%d)", -+ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), -+ jiffies_to_msecs(bfqq->wr_cur_max_time), -+ bfqq->wr_coeff, -+ bfqq->entity.weight, bfqq->entity.orig_weight); -+ -+ BUG_ON(bfqq != bfqd->in_service_queue && entity->weight != -+ entity->orig_weight * bfqq->wr_coeff); -+ if (entity->prio_changed) -+ bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change"); -+ -+ /* -+ * If the queue was activated in a burst, or too much -+ * time has elapsed from the beginning of this -+ * weight-raising period, then end weight raising. 
-+ */ -+ if (bfq_bfqq_in_large_burst(bfqq)) -+ bfq_bfqq_end_wr(bfqq); -+ else if (time_is_before_jiffies(bfqq->last_wr_start_finish + -+ bfqq->wr_cur_max_time)) { -+ if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time || -+ time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt + -+ bfq_wr_duration(bfqd))) -+ bfq_bfqq_end_wr(bfqq); -+ else { -+ switch_back_to_interactive_wr(bfqq, bfqd); -+ BUG_ON(time_is_after_jiffies( -+ bfqq->last_wr_start_finish)); -+ bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqd, bfqq, -+ "back to interactive wr"); -+ } -+ } -+ if (bfqq->wr_coeff > 1 && -+ bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time && -+ bfqq->service_from_wr > max_service_from_wr) { -+ /* see comments on max_service_from_wr */ -+ bfq_bfqq_end_wr(bfqq); -+ bfq_log_bfqq(bfqd, bfqq, -+ "too much service"); -+ } -+ } -+ /* -+ * To improve latency (for this or other queues), immediately -+ * update weight both if it must be raised and if it must be -+ * lowered. Since, entity may be on some active tree here, and -+ * might have a pending change of its ioprio class, invoke -+ * next function with the last parameter unset (see the -+ * comments on the function). -+ */ -+ if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1)) -+ __bfq_entity_update_weight_prio(bfq_entity_service_tree(entity), -+ entity, false); -+} -+ -+/* -+ * Dispatch one request from bfqq, moving it to the request queue -+ * dispatch list. 
-+ */ -+static int bfq_dispatch_request(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ int dispatched = 0; -+ struct request *rq = bfqq->next_rq; -+ unsigned long service_to_charge; -+ -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ BUG_ON(!rq); -+ service_to_charge = bfq_serv_to_charge(rq, bfqq); -+ -+ BUG_ON(service_to_charge > bfq_bfqq_budget_left(bfqq)); -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ -+ bfq_bfqq_served(bfqq, service_to_charge); -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ -+ bfq_dispatch_insert(bfqd->queue, rq); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "dispatched %u sec req (%llu), budg left %d, new disp_nr %d", -+ blk_rq_sectors(rq), -+ (unsigned long long) blk_rq_pos(rq), -+ bfq_bfqq_budget_left(bfqq), -+ bfqq->dispatched); -+ -+ dispatched++; -+ -+ if (bfqq != bfqd->in_service_queue) { -+ if (likely(bfqd->in_service_queue)) { -+ bfqd->in_service_queue->injected_service += -+ bfq_serv_to_charge(rq, bfqq); -+ bfq_log_bfqq(bfqd, bfqd->in_service_queue, -+ "injected_service increased to %d", -+ bfqd->in_service_queue->injected_service); -+ } -+ return dispatched; -+ } -+ -+ /* -+ * If weight raising has to terminate for bfqq, then next -+ * function causes an immediate update of bfqq's weight, -+ * without waiting for next activation. As a consequence, on -+ * expiration, bfqq will be timestamped as if has never been -+ * weight-raised during this service slot, even if it has -+ * received part or even most of the service as a -+ * weight-raised queue. This inflates bfqq's timestamps, which -+ * is beneficial, as bfqq is then more willing to leave the -+ * device immediately to possible other weight-raised queues. 
-+ */ -+ bfq_update_wr_data(bfqd, bfqq); -+ -+ if (!bfqd->in_service_bic) { -+ atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); -+ bfqd->in_service_bic = RQ_BIC(rq); -+ BUG_ON(!bfqd->in_service_bic); -+ } -+ -+ if (bfq_tot_busy_queues(bfqd) > 1 && bfq_class_idle(bfqq)) -+ goto expire; -+ -+ return dispatched; -+ -+expire: -+ bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_BUDGET_EXHAUSTED); -+ return dispatched; -+} -+ -+static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq) -+{ -+ int dispatched = 0; -+ -+ while (bfqq->next_rq) { -+ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq); -+ dispatched++; -+ } -+ -+ BUG_ON(!list_empty(&bfqq->fifo)); -+ return dispatched; -+} -+ -+/* -+ * Drain our current requests. -+ * Used for barriers and when switching io schedulers on-the-fly. -+ */ -+static int bfq_forced_dispatch(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq, *n; -+ struct bfq_service_tree *st; -+ int dispatched = 0; -+ -+ bfqq = bfqd->in_service_queue; -+ if (bfqq) -+ __bfq_bfqq_expire(bfqd, bfqq); -+ -+ /* -+ * Loop through classes, and be careful to leave the scheduler -+ * in a consistent state, as feedback mechanisms and vtime -+ * updates cannot be disabled during the process. -+ */ -+ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) { -+ st = bfq_entity_service_tree(&bfqq->entity); -+ -+ dispatched += __bfq_forced_dispatch_bfqq(bfqq); -+ -+ bfqq->max_budget = bfq_max_budget(bfqd); -+ bfq_forget_idle(st); -+ } -+ -+ BUG_ON(bfq_tot_busy_queues(bfqd) != 0); -+ -+ return dispatched; -+} -+ -+static int bfq_dispatch_requests(struct request_queue *q, int force) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct bfq_queue *bfqq; -+ -+ bfq_log(bfqd, "%d busy queues", bfq_tot_busy_queues(bfqd)); -+ -+ if (bfq_tot_busy_queues(bfqd) == 0) -+ return 0; -+ -+ if (unlikely(force)) -+ return bfq_forced_dispatch(bfqd); -+ -+ /* -+ * Force device to serve one request at a time if -+ * strict_guarantees is true. 
Forcing this service scheme is -+ * currently the ONLY way to guarantee that the request -+ * service order enforced by the scheduler is respected by a -+ * queueing device. Otherwise the device is free even to make -+ * some unlucky request wait for as long as the device -+ * wishes. -+ * -+ * Of course, serving one request at at time may cause loss of -+ * throughput. -+ */ -+ if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0) -+ return 0; -+ -+ bfqq = bfq_select_queue(bfqd); -+ if (!bfqq) -+ return 0; -+ -+ BUG_ON(bfqq == bfqd->in_service_queue && -+ bfqq->entity.budget < bfqq->entity.service); -+ -+ BUG_ON(bfqq == bfqd->in_service_queue && -+ bfq_bfqq_wait_request(bfqq)); -+ -+ if (!bfq_dispatch_request(bfqd, bfqq)) -+ return 0; -+ -+ bfq_log_bfqq(bfqd, bfqq, "%s request", -+ bfq_bfqq_sync(bfqq) ? "sync" : "async"); -+ -+ BUG_ON(bfqq->next_rq == NULL && -+ bfqq->entity.budget < bfqq->entity.service); -+ return 1; -+} -+ -+/* -+ * Task holds one reference to the queue, dropped when task exits. Each rq -+ * in-flight on this queue also holds a reference, dropped when rq is freed. -+ * -+ * Queue lock must be held here. Recall not to use bfqq after calling -+ * this function on it. -+ */ -+static void bfq_put_queue(struct bfq_queue *bfqq) -+{ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ struct bfq_group *bfqg = bfqq_group(bfqq); -+#endif -+ -+ BUG_ON(bfqq->ref <= 0); -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "%p %d", bfqq, bfqq->ref); -+ bfqq->ref--; -+ if (bfqq->ref) -+ return; -+ -+ BUG_ON(rb_first(&bfqq->sort_list)); -+ BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); -+ BUG_ON(bfqq->entity.tree); -+ BUG_ON(bfq_bfqq_busy(bfqq)); -+ -+ if (!hlist_unhashed(&bfqq->burst_list_node)) { -+ hlist_del_init(&bfqq->burst_list_node); -+ /* -+ * Decrement also burst size after the removal, if the -+ * process associated with bfqq is exiting, and thus -+ * does not contribute to the burst any longer. 
This -+ * decrement helps filter out false positives of large -+ * bursts, when some short-lived process (often due to -+ * the execution of commands by some service) happens -+ * to start and exit while a complex application is -+ * starting, and thus spawning several processes that -+ * do I/O (and that *must not* be treated as a large -+ * burst, see comments on bfq_handle_burst). -+ * -+ * In particular, the decrement is performed only if: -+ * 1) bfqq is not a merged queue, because, if it is, -+ * then this free of bfqq is not triggered by the exit -+ * of the process bfqq is associated with, but exactly -+ * by the fact that bfqq has just been merged. -+ * 2) burst_size is greater than 0, to handle -+ * unbalanced decrements. Unbalanced decrements may -+ * happen in te following case: bfqq is inserted into -+ * the current burst list--without incrementing -+ * bust_size--because of a split, but the current -+ * burst list is not the burst list bfqq belonged to -+ * (see comments on the case of a split in -+ * bfq_set_request). -+ */ -+ if (bfqq->bic && bfqq->bfqd->burst_size > 0) -+ bfqq->bfqd->burst_size--; -+ } -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "%p freed", bfqq); -+ -+ kmem_cache_free(bfq_pool, bfqq); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ bfqg_put(bfqg); -+#endif -+} -+ -+static void bfq_put_cooperator(struct bfq_queue *bfqq) -+{ -+ struct bfq_queue *__bfqq, *next; -+ -+ /* -+ * If this queue was scheduled to merge with another queue, be -+ * sure to drop the reference taken on that queue (and others in -+ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs. 
-+ */ -+ __bfqq = bfqq->new_bfqq; -+ while (__bfqq) { -+ if (__bfqq == bfqq) -+ break; -+ next = __bfqq->new_bfqq; -+ bfq_put_queue(__bfqq); -+ __bfqq = next; -+ } -+} -+ -+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ if (bfqq == bfqd->in_service_queue) { -+ __bfq_bfqq_expire(bfqd, bfqq); -+ bfq_schedule_dispatch(bfqd); -+ } -+ -+ bfq_log_bfqq(bfqd, bfqq, "%p, %d", bfqq, bfqq->ref); -+ -+ bfq_put_cooperator(bfqq); -+ -+ bfq_put_queue(bfqq); /* release process reference */ -+} -+ -+static void bfq_init_icq(struct io_cq *icq) -+{ -+ icq_to_bic(icq)->ttime.last_end_request = ktime_get_ns() - (1ULL<<32); -+} -+ -+static void bfq_exit_icq(struct io_cq *icq) -+{ -+ struct bfq_io_cq *bic = icq_to_bic(icq); -+ struct bfq_data *bfqd = bic_to_bfqd(bic); -+ -+ if (bic_to_bfqq(bic, false)) { -+ bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, false)); -+ bic_set_bfqq(bic, NULL, false); -+ } -+ -+ if (bic_to_bfqq(bic, true)) { -+ /* -+ * If the bic is using a shared queue, put the reference -+ * taken on the io_context when the bic started using a -+ * shared bfq_queue. -+ */ -+ if (bfq_bfqq_coop(bic_to_bfqq(bic, true))) -+ put_io_context(icq->ioc); -+ bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, true)); -+ bic_set_bfqq(bic, NULL, true); -+ } -+} -+ -+/* -+ * Update the entity prio values; note that the new values will not -+ * be used until the next (re)activation. -+ */ -+static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, -+ struct bfq_io_cq *bic) -+{ -+ struct task_struct *tsk = current; -+ int ioprio_class; -+ -+ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); -+ switch (ioprio_class) { -+ default: -+ dev_err(bfqq->bfqd->queue->backing_dev_info->dev, -+ "bfq: bad prio class %d\n", ioprio_class); -+ case IOPRIO_CLASS_NONE: -+ /* -+ * No prio set, inherit CPU scheduling settings. 
-+ */ -+ bfqq->new_ioprio = task_nice_ioprio(tsk); -+ bfqq->new_ioprio_class = task_nice_ioclass(tsk); -+ break; -+ case IOPRIO_CLASS_RT: -+ bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); -+ bfqq->new_ioprio_class = IOPRIO_CLASS_RT; -+ break; -+ case IOPRIO_CLASS_BE: -+ bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); -+ bfqq->new_ioprio_class = IOPRIO_CLASS_BE; -+ break; -+ case IOPRIO_CLASS_IDLE: -+ bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE; -+ bfqq->new_ioprio = 7; -+ break; -+ } -+ -+ if (bfqq->new_ioprio >= IOPRIO_BE_NR) { -+ pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n", -+ bfqq->new_ioprio); -+ BUG(); -+ } -+ -+ bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); -+ bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "bic_class %d prio %d class %d", -+ ioprio_class, bfqq->new_ioprio, bfqq->new_ioprio_class); -+} -+ -+static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) -+{ -+ struct bfq_data *bfqd = bic_to_bfqd(bic); -+ struct bfq_queue *bfqq; -+ unsigned long uninitialized_var(flags); -+ int ioprio = bic->icq.ioc->ioprio; -+ -+ /* -+ * This condition may trigger on a newly created bic, be sure to -+ * drop the lock before returning. 
-+ */ -+ if (unlikely(!bfqd) || likely(bic->ioprio == ioprio)) -+ return; -+ -+ bic->ioprio = ioprio; -+ -+ bfqq = bic_to_bfqq(bic, false); -+ if (bfqq) { -+ /* release process reference on this queue */ -+ bfq_put_queue(bfqq); -+ bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); -+ bic_set_bfqq(bic, bfqq, false); -+ bfq_log_bfqq(bfqd, bfqq, -+ "bfqq %p %d", -+ bfqq, bfqq->ref); -+ } -+ -+ bfqq = bic_to_bfqq(bic, true); -+ if (bfqq) -+ bfq_set_next_ioprio_data(bfqq, bic); -+} -+ -+static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct bfq_io_cq *bic, pid_t pid, int is_sync) -+{ -+ RB_CLEAR_NODE(&bfqq->entity.rb_node); -+ INIT_LIST_HEAD(&bfqq->fifo); -+ INIT_HLIST_NODE(&bfqq->burst_list_node); -+ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); -+ -+ bfqq->ref = 0; -+ bfqq->bfqd = bfqd; -+ -+ if (bic) -+ bfq_set_next_ioprio_data(bfqq, bic); -+ -+ if (is_sync) { -+ /* -+ * No need to mark as has_short_ttime if in -+ * idle_class, because no device idling is performed -+ * for queues in idle class -+ */ -+ if (!bfq_class_idle(bfqq)) -+ /* tentatively mark as has_short_ttime */ -+ bfq_mark_bfqq_has_short_ttime(bfqq); -+ bfq_mark_bfqq_sync(bfqq); -+ bfq_mark_bfqq_just_created(bfqq); -+ /* -+ * Aggressively inject a lot of service: up to 90%. -+ * This coefficient remains constant during bfqq life, -+ * but this behavior might be changed, after enough -+ * testing and tuning. 
-+ */ -+ bfqq->inject_coeff = 1; -+ } else -+ bfq_clear_bfqq_sync(bfqq); -+ bfq_mark_bfqq_IO_bound(bfqq); -+ -+ /* Tentative initial value to trade off between thr and lat */ -+ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3; -+ bfqq->pid = pid; -+ -+ bfqq->wr_coeff = 1; -+ bfqq->last_wr_start_finish = jiffies; -+ bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now(); -+ bfqq->budget_timeout = bfq_smallest_from_now(); -+ bfqq->split_time = bfq_smallest_from_now(); -+ -+ /* -+ * To not forget the possibly high bandwidth consumed by a -+ * process/queue in the recent past, -+ * bfq_bfqq_softrt_next_start() returns a value at least equal -+ * to the current value of bfqq->soft_rt_next_start (see -+ * comments on bfq_bfqq_softrt_next_start). Set -+ * soft_rt_next_start to now, to mean that bfqq has consumed -+ * no bandwidth so far. -+ */ -+ bfqq->soft_rt_next_start = jiffies; -+ -+ /* first request is almost certainly seeky */ -+ bfqq->seek_history = 1; -+} -+ -+static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, -+ struct bfq_group *bfqg, -+ int ioprio_class, int ioprio) -+{ -+ switch (ioprio_class) { -+ case IOPRIO_CLASS_RT: -+ return &bfqg->async_bfqq[0][ioprio]; -+ case IOPRIO_CLASS_NONE: -+ ioprio = IOPRIO_NORM; -+ /* fall through */ -+ case IOPRIO_CLASS_BE: -+ return &bfqg->async_bfqq[1][ioprio]; -+ case IOPRIO_CLASS_IDLE: -+ return &bfqg->async_idle_bfqq; -+ default: -+ BUG(); -+ } -+} -+ -+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, -+ struct bio *bio, bool is_sync, -+ struct bfq_io_cq *bic) -+{ -+ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); -+ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); -+ struct bfq_queue **async_bfqq = NULL; -+ struct bfq_queue *bfqq; -+ struct bfq_group *bfqg; -+ -+ rcu_read_lock(); -+ -+ bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio)); -+ if (!bfqg) { -+ bfqq = &bfqd->oom_bfqq; -+ goto out; -+ } -+ -+ if (!is_sync) { -+ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, 
ioprio_class, -+ ioprio); -+ bfqq = *async_bfqq; -+ if (bfqq) -+ goto out; -+ } -+ -+ bfqq = kmem_cache_alloc_node(bfq_pool, -+ GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN, -+ bfqd->queue->node); -+ -+ if (bfqq) { -+ bfq_init_bfqq(bfqd, bfqq, bic, current->pid, -+ is_sync); -+ bfq_init_entity(&bfqq->entity, bfqg); -+ bfq_log_bfqq(bfqd, bfqq, "allocated"); -+ } else { -+ bfqq = &bfqd->oom_bfqq; -+ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); -+ goto out; -+ } -+ -+ /* -+ * Pin the queue now that it's allocated, scheduler exit will -+ * prune it. -+ */ -+ if (async_bfqq) { -+ bfqq->ref++; /* -+ * Extra group reference, w.r.t. sync -+ * queue. This extra reference is removed -+ * only if bfqq->bfqg disappears, to -+ * guarantee that this queue is not freed -+ * until its group goes away. -+ */ -+ bfq_log_bfqq(bfqd, bfqq, "bfqq not in async: %p, %d", -+ bfqq, bfqq->ref); -+ *async_bfqq = bfqq; -+ } -+ -+out: -+ bfqq->ref++; /* get a process reference to this queue */ -+ bfq_log_bfqq(bfqd, bfqq, "at end: %p, %d", bfqq, bfqq->ref); -+ rcu_read_unlock(); -+ return bfqq; -+} -+ -+static void bfq_update_io_thinktime(struct bfq_data *bfqd, -+ struct bfq_io_cq *bic) -+{ -+ struct bfq_ttime *ttime = &bic->ttime; -+ u64 elapsed = ktime_get_ns() - bic->ttime.last_end_request; -+ -+ elapsed = min_t(u64, elapsed, 2 * bfqd->bfq_slice_idle); -+ -+ ttime->ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; -+ ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8); -+ ttime->ttime_mean = div64_ul(ttime->ttime_total + 128, -+ ttime->ttime_samples); -+} -+ -+static void -+bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct request *rq) -+{ -+ bfqq->seek_history <<= 1; -+ bfqq->seek_history |= BFQ_RQ_SEEKY(bfqd, bfqq->last_request_pos, rq); -+} -+ -+static void bfq_update_has_short_ttime(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ struct bfq_io_cq *bic) -+{ -+ bool has_short_ttime = true; -+ -+ /* -+ * No need to update has_short_ttime if 
bfqq is async or in -+ * idle io prio class, or if bfq_slice_idle is zero, because -+ * no device idling is performed for bfqq in this case. -+ */ -+ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq) || -+ bfqd->bfq_slice_idle == 0) -+ return; -+ -+ /* Idle window just restored, statistics are meaningless. */ -+ if (time_is_after_eq_jiffies(bfqq->split_time + -+ bfqd->bfq_wr_min_idle_time)) -+ return; -+ -+ /* Think time is infinite if no process is linked to -+ * bfqq. Otherwise check average think time to -+ * decide whether to mark as has_short_ttime -+ */ -+ if (atomic_read(&bic->icq.ioc->active_ref) == 0 || -+ (bfq_sample_valid(bic->ttime.ttime_samples) && -+ bic->ttime.ttime_mean > bfqd->bfq_slice_idle)) -+ has_short_ttime = false; -+ -+ bfq_log_bfqq(bfqd, bfqq, "has_short_ttime %d", -+ has_short_ttime); -+ -+ if (has_short_ttime) -+ bfq_mark_bfqq_has_short_ttime(bfqq); -+ else -+ bfq_clear_bfqq_has_short_ttime(bfqq); -+} -+ -+/* -+ * Called when a new fs request (rq) is added to bfqq. Check if there's -+ * something we should do about it. -+ */ -+static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct request *rq) -+{ -+ struct bfq_io_cq *bic = RQ_BIC(rq); -+ -+ if (rq->cmd_flags & REQ_META) -+ bfqq->meta_pending++; -+ -+ bfq_update_io_thinktime(bfqd, bic); -+ bfq_update_has_short_ttime(bfqd, bfqq, bic); -+ bfq_update_io_seektime(bfqd, bfqq, rq); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "has_short_ttime=%d (seeky %d)", -+ bfq_bfqq_has_short_ttime(bfqq), BFQQ_SEEKY(bfqq)); -+ -+ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); -+ -+ if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) { -+ bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 && -+ blk_rq_sectors(rq) < 32; -+ bool budget_timeout = bfq_bfqq_budget_timeout(bfqq); -+ -+ /* -+ * There is just this request queued: if -+ * - the request is small, and -+ * - we are idling to boost throughput, and -+ * - the queue is not to be expired, -+ * then just exit. 
-+ * -+ * In this way, if the device is being idled to wait -+ * for a new request from the in-service queue, we -+ * avoid unplugging the device and committing the -+ * device to serve just a small request. In contrast -+ * we wait for the block layer to decide when to -+ * unplug the device: hopefully, new requests will be -+ * merged to this one quickly, then the device will be -+ * unplugged and larger requests will be dispatched. -+ */ -+ if (small_req && idling_boosts_thr_without_issues(bfqd, bfqq) && -+ !budget_timeout) -+ return; -+ -+ /* -+ * A large enough request arrived, or idling is being -+ * performed to preserve service guarantees, or -+ * finally the queue is to be expired: in all these -+ * cases disk idling is to be stopped, so clear -+ * wait_request flag and reset timer. -+ */ -+ bfq_clear_bfqq_wait_request(bfqq); -+ hrtimer_try_to_cancel(&bfqd->idle_slice_timer); -+ bfqg_stats_update_idle_time(bfqq_group(bfqq)); -+ -+ /* -+ * The queue is not empty, because a new request just -+ * arrived. Hence we can safely expire the queue, in -+ * case of budget timeout, without risking that the -+ * timestamps of the queue are not updated correctly. -+ * See [1] for more details. -+ */ -+ if (budget_timeout) -+ bfq_bfqq_expire(bfqd, bfqq, false, -+ BFQ_BFQQ_BUDGET_TIMEOUT); -+ -+ /* -+ * Let the request rip immediately, or let a new queue be -+ * selected if bfqq has just been expired. -+ */ -+ __blk_run_queue(bfqd->queue); -+ } -+} -+ -+static void bfq_insert_request(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; -+ -+ assert_spin_locked(bfqd->queue->queue_lock); -+ -+ /* -+ * An unplug may trigger a requeue of a request from the device -+ * driver: make sure we are in process context while trying to -+ * merge two bfq_queues. 
-+ */ -+ if (!in_interrupt()) { -+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true); -+ if (new_bfqq) { -+ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq) -+ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1); -+ /* -+ * Release the request's reference to the old bfqq -+ * and make sure one is taken to the shared queue. -+ */ -+ new_bfqq->allocated[rq_data_dir(rq)]++; -+ bfqq->allocated[rq_data_dir(rq)]--; -+ new_bfqq->ref++; -+ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) -+ bfq_merge_bfqqs(bfqd, RQ_BIC(rq), -+ bfqq, new_bfqq); -+ -+ bfq_clear_bfqq_just_created(bfqq); -+ /* -+ * rq is about to be enqueued into new_bfqq, -+ * release rq reference on bfqq -+ */ -+ bfq_put_queue(bfqq); -+ rq->elv.priv[1] = new_bfqq; -+ bfqq = new_bfqq; -+ } -+ } -+ -+ bfq_add_request(rq); -+ -+ rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; -+ list_add_tail(&rq->queuelist, &bfqq->fifo); -+ -+ bfq_rq_enqueued(bfqd, bfqq, rq); -+} -+ -+static void bfq_update_hw_tag(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq = bfqd->in_service_queue; -+ -+ bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver, -+ bfqd->rq_in_driver); -+ -+ if (bfqd->hw_tag == 1) -+ return; -+ -+ /* -+ * This sample is valid if the number of outstanding requests -+ * is large enough to allow a queueing behavior. Note that the -+ * sum is not exact, as it's not taking into account deactivated -+ * requests. -+ */ -+ if (bfqd->rq_in_driver + bfqd->queued <= BFQ_HW_QUEUE_THRESHOLD) -+ return; -+ -+ /* -+ * If active queue hasn't enough requests and can idle, bfq might not -+ * dispatch sufficient requests to hardware. 
Don't zero hw_tag in this -+ * case -+ */ -+ if (bfqq && bfq_bfqq_has_short_ttime(bfqq) && -+ bfqq->dispatched + bfqq->queued[0] + bfqq->queued[1] < -+ BFQ_HW_QUEUE_THRESHOLD && bfqd->rq_in_driver < BFQ_HW_QUEUE_THRESHOLD) -+ return; -+ -+ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) -+ return; -+ -+ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD; -+ bfqd->max_rq_in_driver = 0; -+ bfqd->hw_tag_samples = 0; -+} -+ -+static void bfq_completed_request(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ struct bfq_data *bfqd = bfqq->bfqd; -+ u64 now_ns; -+ u32 delta_us; -+ -+ bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left", -+ blk_rq_sectors(rq)); -+ -+ assert_spin_locked(bfqd->queue->queue_lock); -+ bfq_update_hw_tag(bfqd); -+ -+ BUG_ON(!bfqd->rq_in_driver); -+ BUG_ON(!bfqq->dispatched); -+ bfqd->rq_in_driver--; -+ bfqq->dispatched--; -+ bfqg_stats_update_completion(bfqq_group(bfqq), -+ rq->start_time_ns, -+ rq->io_start_time_ns, -+ rq->cmd_flags); -+ -+ if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); -+ /* -+ * Set budget_timeout (which we overload to store the -+ * time at which the queue remains with no backlog and -+ * no outstanding request; used by the weight-raising -+ * mechanism). -+ */ -+ bfqq->budget_timeout = jiffies; -+ -+ bfq_weights_tree_remove(bfqd, bfqq); -+ } -+ -+ now_ns = ktime_get_ns(); -+ -+ RQ_BIC(rq)->ttime.last_end_request = now_ns; -+ -+ /* -+ * Using us instead of ns, to get a reasonable precision in -+ * computing rate in next check. -+ */ -+ delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC); -+ -+ bfq_log(bfqd, "delta %uus/%luus max_size %u rate %llu/%llu", -+ delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size, -+ delta_us > 0 ? 
-+ (USEC_PER_SEC* -+ (u64)((bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us)) -+ >>BFQ_RATE_SHIFT : -+ (USEC_PER_SEC* -+ (u64)(bfqd->last_rq_max_size<<BFQ_RATE_SHIFT))>>BFQ_RATE_SHIFT, -+ (USEC_PER_SEC*(u64)(1UL<<(BFQ_RATE_SHIFT-10)))>>BFQ_RATE_SHIFT); -+ -+ /* -+ * If the request took rather long to complete, and, according -+ * to the maximum request size recorded, this completion latency -+ * implies that the request was certainly served at a very low -+ * rate (less than 1M sectors/sec), then the whole observation -+ * interval that lasts up to this time instant cannot be a -+ * valid time interval for computing a new peak rate. Invoke -+ * bfq_update_rate_reset to have the following three steps -+ * taken: -+ * - close the observation interval at the last (previous) -+ * request dispatch or completion -+ * - compute rate, if possible, for that observation interval -+ * - reset to zero samples, which will trigger a proper -+ * re-initialization of the observation interval on next -+ * dispatch -+ */ -+ if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC && -+ (bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us < -+ 1UL<<(BFQ_RATE_SHIFT - 10)) -+ bfq_update_rate_reset(bfqd, NULL); -+ bfqd->last_completion = now_ns; -+ -+ /* -+ * If we are waiting to discover whether the request pattern -+ * of the task associated with the queue is actually -+ * isochronous, and both requisites for this condition to hold -+ * are now satisfied, then compute soft_rt_next_start (see the -+ * comments on the function bfq_bfqq_softrt_next_start()). We -+ * do not compute soft_rt_next_start if bfqq is in interactive -+ * weight raising (see the comments in bfq_bfqq_expire() for -+ * an explanation). We schedule this delayed update when bfqq -+ * expires, if it still has in-flight requests. 
-+ */ -+ if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 && -+ RB_EMPTY_ROOT(&bfqq->sort_list) && -+ bfqq->wr_coeff != bfqd->bfq_wr_coeff) -+ bfqq->soft_rt_next_start = -+ bfq_bfqq_softrt_next_start(bfqd, bfqq); -+ -+ /* -+ * If this is the in-service queue, check if it needs to be expired, -+ * or if we want to idle in case it has no pending requests. -+ */ -+ if (bfqd->in_service_queue == bfqq) { -+ if (bfq_bfqq_must_idle(bfqq)) { -+ if (bfqq->dispatched == 0) -+ bfq_arm_slice_timer(bfqd); -+ /* -+ * If we get here, we do not expire bfqq, even -+ * if bfqq was in budget timeout or had no -+ * more requests (as controlled in the next -+ * conditional instructions). The reason for -+ * not expiring bfqq is as follows. -+ * -+ * Here bfqq->dispatched > 0 holds, but -+ * bfq_bfqq_must_idle() returned true. This -+ * implies that, even if no request arrives -+ * for bfqq before bfqq->dispatched reaches 0, -+ * bfqq will, however, not be expired on the -+ * completion event that causes bfqq->dispatch -+ * to reach zero. In contrast, on this event, -+ * bfqq will start enjoying device idling -+ * (I/O-dispatch plugging). -+ * -+ * But, if we expired bfqq here, bfqq would -+ * not have the chance to enjoy device idling -+ * when bfqq->dispatched finally reaches -+ * zero. This would expose bfqq to violation -+ * of its reserved service guarantees. 
-+ */ -+ goto out; -+ } else if (bfq_may_expire_for_budg_timeout(bfqq)) -+ bfq_bfqq_expire(bfqd, bfqq, false, -+ BFQ_BFQQ_BUDGET_TIMEOUT); -+ else if (RB_EMPTY_ROOT(&bfqq->sort_list) && -+ (bfqq->dispatched == 0 || -+ !bfq_better_to_idle(bfqq))) -+ bfq_bfqq_expire(bfqd, bfqq, false, -+ BFQ_BFQQ_NO_MORE_REQUESTS); -+ } -+ -+ if (!bfqd->rq_in_driver) -+ bfq_schedule_dispatch(bfqd); -+ -+out: -+ return; -+} -+ -+static int __bfq_may_queue(struct bfq_queue *bfqq) -+{ -+ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) { -+ bfq_clear_bfqq_must_alloc(bfqq); -+ return ELV_MQUEUE_MUST; -+ } -+ -+ return ELV_MQUEUE_MAY; -+} -+ -+static int bfq_may_queue(struct request_queue *q, unsigned int op) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct task_struct *tsk = current; -+ struct bfq_io_cq *bic; -+ struct bfq_queue *bfqq; -+ -+ /* -+ * Don't force setup of a queue from here, as a call to may_queue -+ * does not necessarily imply that a request actually will be -+ * queued. So just lookup a possibly existing queue, or return -+ * 'may queue' if that fails. -+ */ -+ bic = bfq_bic_lookup(bfqd, tsk->io_context); -+ if (!bic) -+ return ELV_MQUEUE_MAY; -+ -+ bfqq = bic_to_bfqq(bic, op_is_sync(op)); -+ if (bfqq) -+ return __bfq_may_queue(bfqq); -+ -+ return ELV_MQUEUE_MAY; -+} -+ -+/* -+ * Queue lock held here. -+ */ -+static void bfq_put_request(struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ -+ if (bfqq) { -+ const int rw = rq_data_dir(rq); -+ -+ BUG_ON(!bfqq->allocated[rw]); -+ bfqq->allocated[rw]--; -+ -+ rq->elv.priv[0] = NULL; -+ rq->elv.priv[1] = NULL; -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "%p, %d", -+ bfqq, bfqq->ref); -+ bfq_put_queue(bfqq); -+ } -+} -+ -+/* -+ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this -+ * was the last process referring to that bfqq. 
-+ */ -+static struct bfq_queue * -+bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) -+{ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); -+ -+ put_io_context(bic->icq.ioc); -+ -+ if (bfqq_process_refs(bfqq) == 1) { -+ bfqq->pid = current->pid; -+ bfq_clear_bfqq_coop(bfqq); -+ bfq_clear_bfqq_split_coop(bfqq); -+ return bfqq; -+ } -+ -+ bic_set_bfqq(bic, NULL, 1); -+ -+ bfq_put_cooperator(bfqq); -+ -+ bfq_put_queue(bfqq); -+ return NULL; -+} -+ -+/* -+ * Allocate bfq data structures associated with this request. -+ */ -+static int bfq_set_request(struct request_queue *q, struct request *rq, -+ struct bio *bio, gfp_t gfp_mask) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq); -+ const int rw = rq_data_dir(rq); -+ const int is_sync = rq_is_sync(rq); -+ struct bfq_queue *bfqq; -+ unsigned long flags; -+ bool bfqq_already_existing = false, split = false; -+ -+ spin_lock_irqsave(q->queue_lock, flags); -+ -+ if (!bic) -+ goto queue_fail; -+ -+ bfq_check_ioprio_change(bic, bio); -+ -+ bfq_bic_update_cgroup(bic, bio); -+ -+new_queue: -+ bfqq = bic_to_bfqq(bic, is_sync); -+ if (!bfqq || bfqq == &bfqd->oom_bfqq) { -+ if (bfqq) -+ bfq_put_queue(bfqq); -+ bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); -+ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); -+ -+ bic_set_bfqq(bic, bfqq, is_sync); -+ if (split && is_sync) { -+ bfq_log_bfqq(bfqd, bfqq, -+ "was_in_list %d " -+ "was_in_large_burst %d " -+ "large burst in progress %d", -+ bic->was_in_burst_list, -+ bic->saved_in_large_burst, -+ bfqd->large_burst); -+ -+ if ((bic->was_in_burst_list && bfqd->large_burst) || -+ bic->saved_in_large_burst) { -+ bfq_log_bfqq(bfqd, bfqq, -+ "marking in " -+ "large burst"); -+ bfq_mark_bfqq_in_large_burst(bfqq); -+ } else { -+ bfq_log_bfqq(bfqd, bfqq, -+ "clearing in " -+ "large burst"); -+ bfq_clear_bfqq_in_large_burst(bfqq); -+ if (bic->was_in_burst_list) -+ /* -+ * If bfqq was in the current -+ * burst list before 
being -+ * merged, then we have to add -+ * it back. And we do not need -+ * to increase burst_size, as -+ * we did not decrement -+ * burst_size when we removed -+ * bfqq from the burst list as -+ * a consequence of a merge -+ * (see comments in -+ * bfq_put_queue). In this -+ * respect, it would be rather -+ * costly to know whether the -+ * current burst list is still -+ * the same burst list from -+ * which bfqq was removed on -+ * the merge. To avoid this -+ * cost, if bfqq was in a -+ * burst list, then we add -+ * bfqq to the current burst -+ * list without any further -+ * check. This can cause -+ * inappropriate insertions, -+ * but rarely enough to not -+ * harm the detection of large -+ * bursts significantly. -+ */ -+ hlist_add_head(&bfqq->burst_list_node, -+ &bfqd->burst_list); -+ } -+ bfqq->split_time = jiffies; -+ } -+ } else { -+ /* If the queue was seeky for too long, break it apart. */ -+ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { -+ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); -+ -+ /* Update bic before losing reference to bfqq */ -+ if (bfq_bfqq_in_large_burst(bfqq)) -+ bic->saved_in_large_burst = true; -+ -+ bfqq = bfq_split_bfqq(bic, bfqq); -+ split = true; -+ if (!bfqq) -+ goto new_queue; -+ else -+ bfqq_already_existing = true; -+ } -+ } -+ -+ bfqq->allocated[rw]++; -+ bfqq->ref++; -+ bfq_log_bfqq(bfqd, bfqq, "bfqq %p, %d", bfqq, bfqq->ref); -+ -+ rq->elv.priv[0] = bic; -+ rq->elv.priv[1] = bfqq; -+ -+ /* -+ * If a bfq_queue has only one process reference, it is owned -+ * by only one bfq_io_cq: we can set the bic field of the -+ * bfq_queue to the address of that structure. Also, if the -+ * queue has just been split, mark a flag so that the -+ * information is available to the other scheduler hooks. 
-+ */ -+ if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { -+ bfqq->bic = bic; -+ if (split) { -+ /* -+ * If the queue has just been split from a shared -+ * queue, restore the idle window and the possible -+ * weight raising period. -+ */ -+ bfq_bfqq_resume_state(bfqq, bfqd, bic, -+ bfqq_already_existing); -+ } -+ } -+ -+ if (unlikely(bfq_bfqq_just_created(bfqq))) -+ bfq_handle_burst(bfqd, bfqq); -+ -+ spin_unlock_irqrestore(q->queue_lock, flags); -+ -+ return 0; -+ -+queue_fail: -+ bfq_schedule_dispatch(bfqd); -+ spin_unlock_irqrestore(q->queue_lock, flags); -+ -+ return 1; -+} -+ -+static void bfq_kick_queue(struct work_struct *work) -+{ -+ struct bfq_data *bfqd = -+ container_of(work, struct bfq_data, unplug_work); -+ struct request_queue *q = bfqd->queue; -+ -+ spin_lock_irq(q->queue_lock); -+ __blk_run_queue(q); -+ spin_unlock_irq(q->queue_lock); -+} -+ -+/* -+ * Handler of the expiration of the timer running if the in-service queue -+ * is idling inside its time slice. -+ */ -+static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) -+{ -+ struct bfq_data *bfqd = container_of(timer, struct bfq_data, -+ idle_slice_timer); -+ struct bfq_queue *bfqq; -+ unsigned long flags; -+ enum bfqq_expiration reason; -+ -+ spin_lock_irqsave(bfqd->queue->queue_lock, flags); -+ -+ bfqq = bfqd->in_service_queue; -+ /* -+ * Theoretical race here: the in-service queue can be NULL or -+ * different from the queue that was idling if the timer handler -+ * spins on the queue_lock and a new request arrives for the -+ * current queue and there is a full dispatch cycle that changes -+ * the in-service queue. This can hardly happen, but in the worst -+ * case we just expire a queue too early. 
-+ */ -+ if (bfqq) { -+ bfq_log_bfqq(bfqd, bfqq, "expired"); -+ bfq_clear_bfqq_wait_request(bfqq); -+ -+ if (bfq_bfqq_budget_timeout(bfqq)) -+ /* -+ * Also here the queue can be safely expired -+ * for budget timeout without wasting -+ * guarantees -+ */ -+ reason = BFQ_BFQQ_BUDGET_TIMEOUT; -+ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) -+ /* -+ * The queue may not be empty upon timer expiration, -+ * because we may not disable the timer when the -+ * first request of the in-service queue arrives -+ * during disk idling. -+ */ -+ reason = BFQ_BFQQ_TOO_IDLE; -+ else -+ goto schedule_dispatch; -+ -+ bfq_bfqq_expire(bfqd, bfqq, true, reason); -+ } -+ -+schedule_dispatch: -+ bfq_schedule_dispatch(bfqd); -+ -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); -+ return HRTIMER_NORESTART; -+} -+ -+static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) -+{ -+ hrtimer_cancel(&bfqd->idle_slice_timer); -+ cancel_work_sync(&bfqd->unplug_work); -+} -+ -+static void __bfq_put_async_bfqq(struct bfq_data *bfqd, -+ struct bfq_queue **bfqq_ptr) -+{ -+ struct bfq_group *root_group = bfqd->root_group; -+ struct bfq_queue *bfqq = *bfqq_ptr; -+ -+ bfq_log(bfqd, "%p", bfqq); -+ if (bfqq) { -+ bfq_bfqq_move(bfqd, bfqq, root_group); -+ bfq_log_bfqq(bfqd, bfqq, "putting %p, %d", -+ bfqq, bfqq->ref); -+ bfq_put_queue(bfqq); -+ *bfqq_ptr = NULL; -+ } -+} -+ -+/* -+ * Release all the bfqg references to its async queues. If we are -+ * deallocating the group these queues may still contain requests, so -+ * we reparent them to the root cgroup (i.e., the only one that will -+ * exist for sure until all the requests on a device are gone). 
-+ */ -+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) -+{ -+ int i, j; -+ -+ for (i = 0; i < 2; i++) -+ for (j = 0; j < IOPRIO_BE_NR; j++) -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); -+ -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); -+} -+ -+static void bfq_exit_queue(struct elevator_queue *e) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ struct request_queue *q = bfqd->queue; -+ struct bfq_queue *bfqq, *n; -+ -+ bfq_shutdown_timer_wq(bfqd); -+ -+ spin_lock_irq(q->queue_lock); -+ -+ BUG_ON(bfqd->in_service_queue); -+ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) -+ bfq_deactivate_bfqq(bfqd, bfqq, false, false); -+ -+ spin_unlock_irq(q->queue_lock); -+ -+ bfq_shutdown_timer_wq(bfqd); -+ -+ BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ /* release oom-queue reference to root group */ -+ bfqg_put(bfqd->root_group); -+ -+ blkcg_deactivate_policy(q, &blkcg_policy_bfq); -+#else -+ bfq_put_async_queues(bfqd, bfqd->root_group); -+ kfree(bfqd->root_group); -+#endif -+ -+ kfree(bfqd); -+} -+ -+static void bfq_init_root_group(struct bfq_group *root_group, -+ struct bfq_data *bfqd) -+{ -+ int i; -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ root_group->entity.parent = NULL; -+ root_group->my_entity = NULL; -+ root_group->bfqd = bfqd; -+#endif -+ root_group->rq_pos_tree = RB_ROOT; -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) -+ root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; -+ root_group->sched_data.bfq_class_idle_last_service = jiffies; -+} -+ -+static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) -+{ -+ struct bfq_data *bfqd; -+ struct elevator_queue *eq; -+ -+ eq = elevator_alloc(q, e); -+ if (!eq) -+ return -ENOMEM; -+ -+ bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node); -+ if (!bfqd) { -+ kobject_put(&eq->kobj); -+ return -ENOMEM; -+ } -+ eq->elevator_data = bfqd; -+ -+ /* -+ * Our fallback bfqq if 
bfq_find_alloc_queue() runs into OOM issues. -+ * Grab a permanent reference to it, so that the normal code flow -+ * will not attempt to free it. -+ */ -+ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); -+ bfqd->oom_bfqq.ref++; -+ bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; -+ bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE; -+ bfqd->oom_bfqq.entity.new_weight = -+ bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio); -+ -+ /* oom_bfqq does not participate to bursts */ -+ bfq_clear_bfqq_just_created(&bfqd->oom_bfqq); -+ /* -+ * Trigger weight initialization, according to ioprio, at the -+ * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio -+ * class won't be changed any more. -+ */ -+ bfqd->oom_bfqq.entity.prio_changed = 1; -+ -+ bfqd->queue = q; -+ -+ spin_lock_irq(q->queue_lock); -+ q->elevator = eq; -+ spin_unlock_irq(q->queue_lock); -+ -+ bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node); -+ if (!bfqd->root_group) -+ goto out_free; -+ bfq_init_root_group(bfqd->root_group, bfqd); -+ bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); -+ -+ hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC, -+ HRTIMER_MODE_REL); -+ bfqd->idle_slice_timer.function = bfq_idle_slice_timer; -+ -+ bfqd->queue_weights_tree = RB_ROOT; -+ bfqd->num_groups_with_pending_reqs = 0; -+ -+ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue); -+ -+ INIT_LIST_HEAD(&bfqd->active_list); -+ INIT_LIST_HEAD(&bfqd->idle_list); -+ INIT_HLIST_HEAD(&bfqd->burst_list); -+ -+ bfqd->hw_tag = -1; -+ -+ bfqd->bfq_max_budget = bfq_default_max_budget; -+ -+ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0]; -+ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1]; -+ bfqd->bfq_back_max = bfq_back_max; -+ bfqd->bfq_back_penalty = bfq_back_penalty; -+ bfqd->bfq_slice_idle = bfq_slice_idle; -+ bfqd->bfq_timeout = bfq_timeout; -+ -+ bfqd->bfq_requests_within_timer = 120; -+ -+ bfqd->bfq_large_burst_thresh = 8; -+ bfqd->bfq_burst_interval = msecs_to_jiffies(180); -+ -+ bfqd->low_latency = 
true; -+ -+ /* -+ * Trade-off between responsiveness and fairness. -+ */ -+ bfqd->bfq_wr_coeff = 30; -+ bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300); -+ bfqd->bfq_wr_max_time = 0; -+ bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000); -+ bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500); -+ bfqd->bfq_wr_max_softrt_rate = 7000; /* -+ * Approximate rate required -+ * to playback or record a -+ * high-definition compressed -+ * video. -+ */ -+ bfqd->wr_busy_queues = 0; -+ -+ /* -+ * Begin by assuming, optimistically, that the device peak -+ * rate is equal to 2/3 of the highest reference rate. -+ */ -+ bfqd->rate_dur_prod = ref_rate[blk_queue_nonrot(bfqd->queue)] * -+ ref_wr_duration[blk_queue_nonrot(bfqd->queue)]; -+ bfqd->peak_rate = ref_rate[blk_queue_nonrot(bfqd->queue)] * 2 / 3; -+ -+ return 0; -+ -+out_free: -+ kfree(bfqd); -+ kobject_put(&eq->kobj); -+ return -ENOMEM; -+} -+ -+static void bfq_registered_queue(struct request_queue *q) -+{ -+ wbt_disable_default(q); -+} -+ -+static void bfq_slab_kill(void) -+{ -+ kmem_cache_destroy(bfq_pool); -+} -+ -+static int __init bfq_slab_setup(void) -+{ -+ bfq_pool = KMEM_CACHE(bfq_queue, 0); -+ if (!bfq_pool) -+ return -ENOMEM; -+ return 0; -+} -+ -+static ssize_t bfq_var_show(unsigned int var, char *page) -+{ -+ return sprintf(page, "%u\n", var); -+} -+ -+static ssize_t bfq_var_store(unsigned long *var, const char *page, -+ size_t count) -+{ -+ unsigned long new_val; -+ int ret = kstrtoul(page, 10, &new_val); -+ -+ if (ret == 0) -+ *var = new_val; -+ -+ return count; -+} -+ -+static ssize_t bfq_wr_max_time_show(struct elevator_queue *e, char *page) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ -+ return sprintf(page, "%d\n", bfqd->bfq_wr_max_time > 0 ? 
-+ jiffies_to_msecs(bfqd->bfq_wr_max_time) : -+ jiffies_to_msecs(bfq_wr_duration(bfqd))); -+} -+ -+static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) -+{ -+ struct bfq_queue *bfqq; -+ struct bfq_data *bfqd = e->elevator_data; -+ ssize_t num_char = 0; -+ -+ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n", -+ bfqd->queued); -+ -+ spin_lock_irq(bfqd->queue->queue_lock); -+ -+ num_char += sprintf(page + num_char, "Active:\n"); -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) { -+ num_char += sprintf(page + num_char, -+ "pid%d: weight %hu, nr_queued %d %d, ", -+ bfqq->pid, -+ bfqq->entity.weight, -+ bfqq->queued[0], -+ bfqq->queued[1]); -+ num_char += sprintf(page + num_char, -+ "dur %d/%u\n", -+ jiffies_to_msecs( -+ jiffies - -+ bfqq->last_wr_start_finish), -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } -+ -+ num_char += sprintf(page + num_char, "Idle:\n"); -+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) { -+ num_char += sprintf(page + num_char, -+ "pid%d: weight %hu, dur %d/%u\n", -+ bfqq->pid, -+ bfqq->entity.weight, -+ jiffies_to_msecs(jiffies - -+ bfqq->last_wr_start_finish), -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } -+ -+ spin_unlock_irq(bfqd->queue->queue_lock); -+ -+ return num_char; -+} -+ -+#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ -+static ssize_t __FUNC(struct elevator_queue *e, char *page) \ -+{ \ -+ struct bfq_data *bfqd = e->elevator_data; \ -+ u64 __data = __VAR; \ -+ if (__CONV == 1) \ -+ __data = jiffies_to_msecs(__data); \ -+ else if (__CONV == 2) \ -+ __data = div_u64(__data, NSEC_PER_MSEC); \ -+ return bfq_var_show(__data, (page)); \ -+} -+SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2); -+SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2); -+SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); -+SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); -+SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2); 
-+SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); -+SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1); -+SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0); -+SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); -+SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0); -+SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1); -+SHOW_FUNCTION(bfq_wr_min_idle_time_show, bfqd->bfq_wr_min_idle_time, 1); -+SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async, -+ 1); -+SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0); -+#undef SHOW_FUNCTION -+ -+#define USEC_SHOW_FUNCTION(__FUNC, __VAR) \ -+static ssize_t __FUNC(struct elevator_queue *e, char *page) \ -+{ \ -+ struct bfq_data *bfqd = e->elevator_data; \ -+ u64 __data = __VAR; \ -+ __data = div_u64(__data, NSEC_PER_USEC); \ -+ return bfq_var_show(__data, (page)); \ -+} -+USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle); -+#undef USEC_SHOW_FUNCTION -+ -+#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ -+static ssize_t \ -+__FUNC(struct elevator_queue *e, const char *page, size_t count) \ -+{ \ -+ struct bfq_data *bfqd = e->elevator_data; \ -+ unsigned long uninitialized_var(__data); \ -+ int ret = bfq_var_store(&__data, (page), count); \ -+ if (__data < (MIN)) \ -+ __data = (MIN); \ -+ else if (__data > (MAX)) \ -+ __data = (MAX); \ -+ if (__CONV == 1) \ -+ *(__PTR) = msecs_to_jiffies(__data); \ -+ else if (__CONV == 2) \ -+ *(__PTR) = (u64)__data * NSEC_PER_MSEC; \ -+ else \ -+ *(__PTR) = __data; \ -+ return ret; \ -+} -+STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, -+ INT_MAX, 2); -+STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, -+ INT_MAX, 2); -+STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); -+STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, -+ INT_MAX, 0); 
-+STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2); -+STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0); -+STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1); -+STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX, -+ 1); -+STORE_FUNCTION(bfq_wr_min_idle_time_store, &bfqd->bfq_wr_min_idle_time, 0, -+ INT_MAX, 1); -+STORE_FUNCTION(bfq_wr_min_inter_arr_async_store, -+ &bfqd->bfq_wr_min_inter_arr_async, 0, INT_MAX, 1); -+STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0, -+ INT_MAX, 0); -+#undef STORE_FUNCTION -+ -+#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ -+static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\ -+{ \ -+ struct bfq_data *bfqd = e->elevator_data; \ -+ unsigned long uninitialized_var(__data); \ -+ int ret = bfq_var_store(&__data, (page), count); \ -+ if (__data < (MIN)) \ -+ __data = (MIN); \ -+ else if (__data > (MAX)) \ -+ __data = (MAX); \ -+ *(__PTR) = (u64)__data * NSEC_PER_USEC; \ -+ return ret; \ -+} -+USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0, -+ UINT_MAX); -+#undef USEC_STORE_FUNCTION -+ -+/* do nothing for the moment */ -+static ssize_t bfq_weights_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ return count; -+} -+ -+static ssize_t bfq_max_budget_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ unsigned long uninitialized_var(__data); -+ int ret = bfq_var_store(&__data, (page), count); -+ -+ if (__data == 0) -+ bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); -+ else { -+ if (__data > INT_MAX) -+ __data = INT_MAX; -+ bfqd->bfq_max_budget = __data; -+ } -+ -+ bfqd->bfq_user_max_budget = __data; -+ -+ return ret; -+} -+ -+/* -+ * Leaving this name to preserve name compatibility with cfq -+ * parameters, but this timeout is used for both sync and async. 
-+ */ -+static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ unsigned long uninitialized_var(__data); -+ int ret = bfq_var_store(&__data, (page), count); -+ -+ if (__data < 1) -+ __data = 1; -+ else if (__data > INT_MAX) -+ __data = INT_MAX; -+ -+ bfqd->bfq_timeout = msecs_to_jiffies(__data); -+ if (bfqd->bfq_user_max_budget == 0) -+ bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); -+ -+ return ret; -+} -+ -+static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ unsigned long uninitialized_var(__data); -+ int ret = bfq_var_store(&__data, (page), count); -+ -+ if (__data > 1) -+ __data = 1; -+ if (!bfqd->strict_guarantees && __data == 1 -+ && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC) -+ bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC; -+ -+ bfqd->strict_guarantees = __data; -+ -+ return ret; -+} -+ -+static ssize_t bfq_low_latency_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ unsigned long uninitialized_var(__data); -+ int ret = bfq_var_store(&__data, (page), count); -+ -+ if (__data > 1) -+ __data = 1; -+ if (__data == 0 && bfqd->low_latency != 0) -+ bfq_end_wr(bfqd); -+ bfqd->low_latency = __data; -+ -+ return ret; -+} -+ -+#define BFQ_ATTR(name) \ -+ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store) -+ -+static struct elv_fs_entry bfq_attrs[] = { -+ BFQ_ATTR(fifo_expire_sync), -+ BFQ_ATTR(fifo_expire_async), -+ BFQ_ATTR(back_seek_max), -+ BFQ_ATTR(back_seek_penalty), -+ BFQ_ATTR(slice_idle), -+ BFQ_ATTR(slice_idle_us), -+ BFQ_ATTR(max_budget), -+ BFQ_ATTR(timeout_sync), -+ BFQ_ATTR(strict_guarantees), -+ BFQ_ATTR(low_latency), -+ BFQ_ATTR(wr_coeff), -+ BFQ_ATTR(wr_max_time), -+ BFQ_ATTR(wr_rt_max_time), -+ BFQ_ATTR(wr_min_idle_time), -+ BFQ_ATTR(wr_min_inter_arr_async), -+ 
BFQ_ATTR(wr_max_softrt_rate), -+ BFQ_ATTR(weights), -+ __ATTR_NULL -+}; -+ -+static struct elevator_type iosched_bfq = { -+ .ops.sq = { -+ .elevator_merge_fn = bfq_merge, -+ .elevator_merged_fn = bfq_merged_request, -+ .elevator_merge_req_fn = bfq_merged_requests, -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ .elevator_bio_merged_fn = bfq_bio_merged, -+#endif -+ .elevator_allow_bio_merge_fn = bfq_allow_bio_merge, -+ .elevator_allow_rq_merge_fn = bfq_allow_rq_merge, -+ .elevator_dispatch_fn = bfq_dispatch_requests, -+ .elevator_add_req_fn = bfq_insert_request, -+ .elevator_activate_req_fn = bfq_activate_request, -+ .elevator_deactivate_req_fn = bfq_deactivate_request, -+ .elevator_completed_req_fn = bfq_completed_request, -+ .elevator_former_req_fn = elv_rb_former_request, -+ .elevator_latter_req_fn = elv_rb_latter_request, -+ .elevator_init_icq_fn = bfq_init_icq, -+ .elevator_exit_icq_fn = bfq_exit_icq, -+ .elevator_set_req_fn = bfq_set_request, -+ .elevator_put_req_fn = bfq_put_request, -+ .elevator_may_queue_fn = bfq_may_queue, -+ .elevator_init_fn = bfq_init_queue, -+ .elevator_exit_fn = bfq_exit_queue, -+ .elevator_registered_fn = bfq_registered_queue, -+ }, -+ .icq_size = sizeof(struct bfq_io_cq), -+ .icq_align = __alignof__(struct bfq_io_cq), -+ .elevator_attrs = bfq_attrs, -+ .elevator_name = "bfq-sq", -+ .elevator_owner = THIS_MODULE, -+}; -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+static struct blkcg_policy blkcg_policy_bfq = { -+ .dfl_cftypes = bfq_blkg_files, -+ .legacy_cftypes = bfq_blkcg_legacy_files, -+ -+ .cpd_alloc_fn = bfq_cpd_alloc, -+ .cpd_init_fn = bfq_cpd_init, -+ .cpd_bind_fn = bfq_cpd_init, -+ .cpd_free_fn = bfq_cpd_free, -+ -+ .pd_alloc_fn = bfq_pd_alloc, -+ .pd_init_fn = bfq_pd_init, -+ .pd_offline_fn = bfq_pd_offline, -+ .pd_free_fn = bfq_pd_free, -+ .pd_reset_stats_fn = bfq_pd_reset_stats, -+}; -+#endif -+ -+static int __init bfq_init(void) -+{ -+ int ret; -+ char msg[60] = "BFQ I/O-scheduler: v9"; -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ ret = 
blkcg_policy_register(&blkcg_policy_bfq); -+ if (ret) -+ return ret; -+#endif -+ -+ ret = -ENOMEM; -+ if (bfq_slab_setup()) -+ goto err_pol_unreg; -+ -+ /* -+ * Times to load large popular applications for the typical -+ * systems installed on the reference devices (see the -+ * comments before the definition of the next -+ * array). Actually, we use slightly lower values, as the -+ * estimated peak rate tends to be smaller than the actual -+ * peak rate. The reason for this last fact is that estimates -+ * are computed over much shorter time intervals than the long -+ * intervals typically used for benchmarking. Why? First, to -+ * adapt more quickly to variations. Second, because an I/O -+ * scheduler cannot rely on a peak-rate-evaluation workload to -+ * be run for a long time. -+ */ -+ ref_wr_duration[0] = msecs_to_jiffies(7000); /* actually 8 sec */ -+ ref_wr_duration[1] = msecs_to_jiffies(2500); /* actually 3 sec */ -+ -+ ret = elv_register(&iosched_bfq); -+ if (ret) -+ goto slab_kill; -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ strcat(msg, " (with cgroups support)"); -+#endif -+ pr_info("%s", msg); -+ -+ return 0; -+ -+slab_kill: -+ bfq_slab_kill(); -+err_pol_unreg: -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ blkcg_policy_unregister(&blkcg_policy_bfq); -+#endif -+ return ret; -+} -+ -+static void __exit bfq_exit(void) -+{ -+ elv_unregister(&iosched_bfq); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ blkcg_policy_unregister(&blkcg_policy_bfq); -+#endif -+ bfq_slab_kill(); -+} -+ -+module_init(bfq_init); -+module_exit(bfq_exit); -+ -+MODULE_AUTHOR("Arianna Avanzini, Fabio Checconi, Paolo Valente"); -+MODULE_LICENSE("GPL"); -diff --git a/block/bfq.h b/block/bfq.h -new file mode 100644 -index 000000000000..0177fc7205d7 ---- /dev/null -+++ b/block/bfq.h -@@ -0,0 +1,1074 @@ -+/* -+ * BFQ v9: data structures and common functions prototypes. 
-+ * -+ * Based on ideas and code from CFQ: -+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> -+ * -+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> -+ * Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it> -+ * -+ * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org> -+ */ -+ -+#ifndef _BFQ_H -+#define _BFQ_H -+ -+#include <linux/hrtimer.h> -+#include <linux/blk-cgroup.h> -+ -+/* -+ * Define an alternative macro to compile cgroups support. This is one -+ * of the steps needed to let bfq-mq share the files bfq-sched.c and -+ * bfq-cgroup.c with bfq-sq. For bfq-mq, the macro -+ * BFQ_GROUP_IOSCHED_ENABLED will be defined as a function of whether -+ * the configuration option CONFIG_BFQ_MQ_GROUP_IOSCHED, and not -+ * CONFIG_BFQ_GROUP_IOSCHED, is defined. -+ */ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#define BFQ_GROUP_IOSCHED_ENABLED -+#endif -+ -+#define BFQ_IOPRIO_CLASSES 3 -+#define BFQ_CL_IDLE_TIMEOUT (HZ/5) -+ -+#define BFQ_MIN_WEIGHT 1 -+#define BFQ_MAX_WEIGHT 1000 -+#define BFQ_WEIGHT_CONVERSION_COEFF 10 -+ -+#define BFQ_DEFAULT_QUEUE_IOPRIO 4 -+ -+#define BFQ_WEIGHT_LEGACY_DFL 100 -+#define BFQ_DEFAULT_GRP_IOPRIO 0 -+#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE -+ -+/* -+ * Soft real-time applications are extremely more latency sensitive -+ * than interactive ones. Over-raise the weight of the former to -+ * privilege them against the latter. -+ */ -+#define BFQ_SOFTRT_WEIGHT_FACTOR 100 -+ -+struct bfq_entity; -+ -+/** -+ * struct bfq_service_tree - per ioprio_class service tree. -+ * -+ * Each service tree represents a B-WF2Q+ scheduler on its own. Each -+ * ioprio_class has its own independent scheduler, and so its own -+ * bfq_service_tree. All the fields are protected by the queue lock -+ * of the containing bfqd. 
-+ */ -+struct bfq_service_tree { -+ /* tree for active entities (i.e., those backlogged) */ -+ struct rb_root active; -+ /* tree for idle entities (i.e., not backlogged, with V <= F_i)*/ -+ struct rb_root idle; -+ -+ struct bfq_entity *first_idle; /* idle entity with minimum F_i */ -+ struct bfq_entity *last_idle; /* idle entity with maximum F_i */ -+ -+ u64 vtime; /* scheduler virtual time */ -+ /* scheduler weight sum; active and idle entities contribute to it */ -+ unsigned long wsum; -+}; -+ -+/** -+ * struct bfq_sched_data - multi-class scheduler. -+ * -+ * bfq_sched_data is the basic scheduler queue. It supports three -+ * ioprio_classes, and can be used either as a toplevel queue or as an -+ * intermediate queue in a hierarchical setup. -+ * -+ * The supported ioprio_classes are the same as in CFQ, in descending -+ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. -+ * Requests from higher priority queues are served before all the -+ * requests from lower priority queues; among requests of the same -+ * queue requests are served according to B-WF2Q+. -+ * -+ * The schedule is implemented by the service trees, plus the field -+ * @next_in_service, which points to the entity on the active trees -+ * that will be served next, if 1) no changes in the schedule occurs -+ * before the current in-service entity is expired, 2) the in-service -+ * queue becomes idle when it expires, and 3) if the entity pointed by -+ * in_service_entity is not a queue, then the in-service child entity -+ * of the entity pointed by in_service_entity becomes idle on -+ * expiration. This peculiar definition allows for the following -+ * optimization, not yet exploited: while a given entity is still in -+ * service, we already know which is the best candidate for next -+ * service among the other active entitities in the same parent -+ * entity. We can then quickly compare the timestamps of the -+ * in-service entity with those of such best candidate. 
-+ * -+ * All the fields are protected by the queue lock of the containing -+ * bfqd. -+ */ -+struct bfq_sched_data { -+ struct bfq_entity *in_service_entity; /* entity in service */ -+ /* head-of-the-line entity in the scheduler (see comments above) */ -+ struct bfq_entity *next_in_service; -+ /* array of service trees, one per ioprio_class */ -+ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; -+ /* last time CLASS_IDLE was served */ -+ unsigned long bfq_class_idle_last_service; -+ -+}; -+ -+/** -+ * struct bfq_weight_counter - counter of the number of all active queues -+ * with a given weight. -+ */ -+struct bfq_weight_counter { -+ unsigned int weight; /* weight of the queues this counter refers to */ -+ unsigned int num_active; /* nr of active queues with this weight */ -+ /* -+ * Weights tree member (see bfq_data's @queue_weights_tree) -+ */ -+ struct rb_node weights_node; -+}; -+ -+/** -+ * struct bfq_entity - schedulable entity. -+ * -+ * A bfq_entity is used to represent either a bfq_queue (leaf node in the -+ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each -+ * entity belongs to the sched_data of the parent group in the cgroup -+ * hierarchy. Non-leaf entities have also their own sched_data, stored -+ * in @my_sched_data. -+ * -+ * Each entity stores independently its priority values; this would -+ * allow different weights on different devices, but this -+ * functionality is not exported to userspace by now. Priorities and -+ * weights are updated lazily, first storing the new values into the -+ * new_* fields, then setting the @prio_changed flag. As soon as -+ * there is a transition in the entity state that allows the priority -+ * update to take place the effective and the requested priority -+ * values are synchronized. -+ * -+ * Unless cgroups are used, the weight value is calculated from the -+ * ioprio to export the same interface as CFQ. 
When dealing with -+ * ``well-behaved'' queues (i.e., queues that do not spend too much -+ * time to consume their budget and have true sequential behavior, and -+ * when there are no external factors breaking anticipation) the -+ * relative weights at each level of the cgroups hierarchy should be -+ * guaranteed. All the fields are protected by the queue lock of the -+ * containing bfqd. -+ */ -+struct bfq_entity { -+ struct rb_node rb_node; /* service_tree member */ -+ -+ /* -+ * Flag, true if the entity is on a tree (either the active or -+ * the idle one of its service_tree) or is in service. -+ */ -+ bool on_st; -+ -+ u64 finish; /* B-WF2Q+ finish timestamp (aka F_i) */ -+ u64 start; /* B-WF2Q+ start timestamp (aka S_i) */ -+ -+ /* tree the entity is enqueued into; %NULL if not on a tree */ -+ struct rb_root *tree; -+ -+ /* -+ * minimum start time of the (active) subtree rooted at this -+ * entity; used for O(log N) lookups into active trees -+ */ -+ u64 min_start; -+ -+ /* amount of service received during the last service slot */ -+ int service; -+ -+ /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */ -+ int budget; -+ -+ unsigned int weight; /* weight of the queue */ -+ unsigned int new_weight; /* next weight if a change is in progress */ -+ -+ /* original weight, used to implement weight boosting */ -+ unsigned int orig_weight; -+ -+ /* parent entity, for hierarchical scheduling */ -+ struct bfq_entity *parent; -+ -+ /* -+ * For non-leaf nodes in the hierarchy, the associated -+ * scheduler queue, %NULL on leaf nodes. 
-+ */ -+ struct bfq_sched_data *my_sched_data; -+ /* the scheduler queue this entity belongs to */ -+ struct bfq_sched_data *sched_data; -+ -+ /* flag, set to request a weight, ioprio or ioprio_class change */ -+ int prio_changed; -+ -+ /* flag, set if the entity is counted in groups_with_pending_reqs */ -+ bool in_groups_with_pending_reqs; -+}; -+ -+struct bfq_group; -+ -+/** -+ * struct bfq_queue - leaf schedulable entity. -+ * -+ * A bfq_queue is a leaf request queue; it can be associated with an -+ * io_context or more, if it is async or shared between cooperating -+ * processes. @cgroup holds a reference to the cgroup, to be sure that it -+ * does not disappear while a bfqq still references it (mostly to avoid -+ * races between request issuing and task migration followed by cgroup -+ * destruction). -+ * All the fields are protected by the queue lock of the containing bfqd. -+ */ -+struct bfq_queue { -+ /* reference counter */ -+ int ref; -+ /* parent bfq_data */ -+ struct bfq_data *bfqd; -+ -+ /* current ioprio and ioprio class */ -+ unsigned short ioprio, ioprio_class; -+ /* next ioprio and ioprio class if a change is in progress */ -+ unsigned short new_ioprio, new_ioprio_class; -+ -+ /* -+ * Shared bfq_queue if queue is cooperating with one or more -+ * other queues. 
-+ */ -+ struct bfq_queue *new_bfqq; -+ /* request-position tree member (see bfq_group's @rq_pos_tree) */ -+ struct rb_node pos_node; -+ /* request-position tree root (see bfq_group's @rq_pos_tree) */ -+ struct rb_root *pos_root; -+ -+ /* sorted list of pending requests */ -+ struct rb_root sort_list; -+ /* if fifo isn't expired, next request to serve */ -+ struct request *next_rq; -+ /* number of sync and async requests queued */ -+ int queued[2]; -+ /* number of sync and async requests currently allocated */ -+ int allocated[2]; -+ /* number of pending metadata requests */ -+ int meta_pending; -+ /* fifo list of requests in sort_list */ -+ struct list_head fifo; -+ -+ /* entity representing this queue in the scheduler */ -+ struct bfq_entity entity; -+ -+ /* pointer to the weight counter associated with this queue */ -+ struct bfq_weight_counter *weight_counter; -+ -+ /* maximum budget allowed from the feedback mechanism */ -+ int max_budget; -+ /* budget expiration (in jiffies) */ -+ unsigned long budget_timeout; -+ -+ /* number of requests on the dispatch list or inside driver */ -+ int dispatched; -+ -+ unsigned int flags; /* status flags.*/ -+ -+ /* node for active/idle bfqq list inside parent bfqd */ -+ struct list_head bfqq_list; -+ -+ /* bit vector: a 1 for each seeky requests in history */ -+ u32 seek_history; -+ -+ /* node for the device's burst list */ -+ struct hlist_node burst_list_node; -+ -+ /* position of the last request enqueued */ -+ sector_t last_request_pos; -+ -+ /* Number of consecutive pairs of request completion and -+ * arrival, such that the queue becomes idle after the -+ * completion, but the next request arrives within an idle -+ * time slice; used only if the queue's IO_bound flag has been -+ * cleared. 
-+ */ -+ unsigned int requests_within_timer; -+ -+ /* pid of the process owning the queue, used for logging purposes */ -+ pid_t pid; -+ -+ /* -+ * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL -+ * if the queue is shared. -+ */ -+ struct bfq_io_cq *bic; -+ -+ /* current maximum weight-raising time for this queue */ -+ unsigned long wr_cur_max_time; -+ /* -+ * Minimum time instant such that, only if a new request is -+ * enqueued after this time instant in an idle @bfq_queue with -+ * no outstanding requests, then the task associated with the -+ * queue it is deemed as soft real-time (see the comments on -+ * the function bfq_bfqq_softrt_next_start()) -+ */ -+ unsigned long soft_rt_next_start; -+ /* -+ * Start time of the current weight-raising period if -+ * the @bfq-queue is being weight-raised, otherwise -+ * finish time of the last weight-raising period. -+ */ -+ unsigned long last_wr_start_finish; -+ /* factor by which the weight of this queue is multiplied */ -+ unsigned int wr_coeff; -+ /* -+ * Time of the last transition of the @bfq_queue from idle to -+ * backlogged. -+ */ -+ unsigned long last_idle_bklogged; -+ /* -+ * Cumulative service received from the @bfq_queue since the -+ * last transition from idle to backlogged. -+ */ -+ unsigned long service_from_backlogged; -+ /* -+ * Cumulative service received from the @bfq_queue since its -+ * last transition to weight-raised state. -+ */ -+ unsigned long service_from_wr; -+ /* -+ * Value of wr start time when switching to soft rt -+ */ -+ unsigned long wr_start_at_switch_to_srt; -+ -+ unsigned long split_time; /* time of last split */ -+ -+ unsigned long first_IO_time; /* time of first I/O for this queue */ -+ -+ /* max service rate measured so far */ -+ u32 max_service_rate; -+ /* -+ * Ratio between the service received by bfqq while it is in -+ * service, and the cumulative service (of requests of other -+ * queues) that may be injected while bfqq is empty but still -+ * in service. 
To increase precision, the coefficient is -+ * measured in tenths of unit. Here are some example of (1) -+ * ratios, (2) resulting percentages of service injected -+ * w.r.t. to the total service dispatched while bfqq is in -+ * service, and (3) corresponding values of the coefficient: -+ * 1 (50%) -> 10 -+ * 2 (33%) -> 20 -+ * 10 (9%) -> 100 -+ * 9.9 (9%) -> 99 -+ * 1.5 (40%) -> 15 -+ * 0.5 (66%) -> 5 -+ * 0.1 (90%) -> 1 -+ * -+ * So, if the coefficient is lower than 10, then -+ * injected service is more than bfqq service. -+ */ -+ unsigned int inject_coeff; -+ /* amount of service injected in current service slot */ -+ unsigned int injected_service; -+}; -+ -+/** -+ * struct bfq_ttime - per process thinktime stats. -+ */ -+struct bfq_ttime { -+ u64 last_end_request; /* completion time of last request */ -+ -+ u64 ttime_total; /* total process thinktime */ -+ unsigned long ttime_samples; /* number of thinktime samples */ -+ u64 ttime_mean; /* average process thinktime */ -+ -+}; -+ -+/** -+ * struct bfq_io_cq - per (request_queue, io_context) structure. -+ */ -+struct bfq_io_cq { -+ /* associated io_cq structure */ -+ struct io_cq icq; /* must be the first member */ -+ /* array of two process queues, the sync and the async */ -+ struct bfq_queue *bfqq[2]; -+ /* associated @bfq_ttime struct */ -+ struct bfq_ttime ttime; -+ /* per (request_queue, blkcg) ioprio */ -+ int ioprio; -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ uint64_t blkcg_serial_nr; /* the current blkcg serial */ -+#endif -+ -+ /* -+ * Snapshot of the has_short_time flag before merging; taken -+ * to remember its value while the queue is merged, so as to -+ * be able to restore it in case of split. -+ */ -+ bool saved_has_short_ttime; -+ /* -+ * Same purpose as the previous two fields for the I/O bound -+ * classification of a queue. 
-+ */ -+ bool saved_IO_bound; -+ -+ /* -+ * Same purpose as the previous fields for the value of the -+ * field keeping the queue's belonging to a large burst -+ */ -+ bool saved_in_large_burst; -+ /* -+ * True if the queue belonged to a burst list before its merge -+ * with another cooperating queue. -+ */ -+ bool was_in_burst_list; -+ -+ /* -+ * Similar to previous fields: save wr information. -+ */ -+ unsigned long saved_wr_coeff; -+ unsigned long saved_last_wr_start_finish; -+ unsigned long saved_wr_start_at_switch_to_srt; -+ unsigned int saved_wr_cur_max_time; -+}; -+ -+/** -+ * struct bfq_data - per-device data structure. -+ * -+ * All the fields are protected by the @queue lock. -+ */ -+struct bfq_data { -+ /* request queue for the device */ -+ struct request_queue *queue; -+ -+ /* root bfq_group for the device */ -+ struct bfq_group *root_group; -+ -+ /* -+ * rbtree of weight counters of @bfq_queues, sorted by -+ * weight. Used to keep track of whether all @bfq_queues have -+ * the same weight. The tree contains one counter for each -+ * distinct weight associated to some active and not -+ * weight-raised @bfq_queue (see the comments to the functions -+ * bfq_weights_tree_[add|remove] for further details). -+ */ -+ struct rb_root queue_weights_tree; -+ -+ /* -+ * Number of groups with at least one descendant process that -+ * has at least one request waiting for completion. Note that -+ * this accounts for also requests already dispatched, but not -+ * yet completed. Therefore this number of groups may differ -+ * (be larger) than the number of active groups, as a group is -+ * considered active only if its corresponding entity has -+ * descendant queues with at least one request queued. This -+ * number is used to decide whether a scenario is symmetric. -+ * For a detailed explanation see comments on the computation -+ * of the variable asymmetric_scenario in the function -+ * bfq_better_to_idle(). 
-+ * -+ * However, it is hard to compute this number exactly, for -+ * groups with multiple descendant processes. Consider a group -+ * that is inactive, i.e., that has no descendant process with -+ * pending I/O inside BFQ queues. Then suppose that -+ * num_groups_with_pending_reqs is still accounting for this -+ * group, because the group has descendant processes with some -+ * I/O request still in flight. num_groups_with_pending_reqs -+ * should be decremented when the in-flight request of the -+ * last descendant process is finally completed (assuming that -+ * nothing else has changed for the group in the meantime, in -+ * terms of composition of the group and active/inactive state of child -+ * groups and processes). To accomplish this, an additional -+ * pending-request counter must be added to entities, and must -+ * be updated correctly. To avoid this additional field and operations, -+ * we resort to the following tradeoff between simplicity and -+ * accuracy: for an inactive group that is still counted in -+ * num_groups_with_pending_reqs, we decrement -+ * num_groups_with_pending_reqs when the first descendant -+ * process of the group remains with no request waiting for -+ * completion. -+ * -+ * Even this simpler decrement strategy requires a little -+ * carefulness: to avoid multiple decrements, we flag a group, -+ * more precisely an entity representing a group, as still -+ * counted in num_groups_with_pending_reqs when it becomes -+ * inactive. Then, when the first descendant queue of the -+ * entity remains with no request waiting for completion, -+ * num_groups_with_pending_reqs is decremented, and this flag -+ * is reset. After this flag is reset for the entity, -+ * num_groups_with_pending_reqs won't be decremented any -+ * longer in case a new descendant queue of the entity remains -+ * with no request waiting for completion. 
-+ */ -+ unsigned int num_groups_with_pending_reqs; -+ -+ /* -+ * Per-class (RT, BE, IDLE) number of bfq_queues containing -+ * requests (including the queue in service, even if it is -+ * idling). -+ */ -+ unsigned int busy_queues[3]; -+ /* number of weight-raised busy @bfq_queues */ -+ int wr_busy_queues; -+ /* number of queued requests */ -+ int queued; -+ /* number of requests dispatched and waiting for completion */ -+ int rq_in_driver; -+ -+ /* -+ * Maximum number of requests in driver in the last -+ * @hw_tag_samples completed requests. -+ */ -+ int max_rq_in_driver; -+ /* number of samples used to calculate hw_tag */ -+ int hw_tag_samples; -+ /* flag set to one if the driver is showing a queueing behavior */ -+ int hw_tag; -+ -+ /* number of budgets assigned */ -+ int budgets_assigned; -+ -+ /* -+ * Timer set when idling (waiting) for the next request from -+ * the queue in service. -+ */ -+ struct hrtimer idle_slice_timer; -+ /* delayed work to restart dispatching on the request queue */ -+ struct work_struct unplug_work; -+ -+ /* bfq_queue in service */ -+ struct bfq_queue *in_service_queue; -+ /* bfq_io_cq (bic) associated with the @in_service_queue */ -+ struct bfq_io_cq *in_service_bic; -+ -+ /* on-disk position of the last served request */ -+ sector_t last_position; -+ -+ /* position of the last served request for the in-service queue */ -+ sector_t in_serv_last_pos; -+ -+ /* time of last request completion (ns) */ -+ u64 last_completion; -+ -+ /* time of first rq dispatch in current observation interval (ns) */ -+ u64 first_dispatch; -+ /* time of last rq dispatch in current observation interval (ns) */ -+ u64 last_dispatch; -+ -+ /* beginning of the last budget */ -+ ktime_t last_budget_start; -+ /* beginning of the last idle slice */ -+ ktime_t last_idling_start; -+ -+ /* number of samples in current observation interval */ -+ int peak_rate_samples; -+ /* num of samples of seq dispatches in current observation interval */ -+ u32 
sequential_samples; -+ /* total num of sectors transferred in current observation interval */ -+ u64 tot_sectors_dispatched; -+ /* max rq size seen during current observation interval (sectors) */ -+ u32 last_rq_max_size; -+ /* time elapsed from first dispatch in current observ. interval (us) */ -+ u64 delta_from_first; -+ /* -+ * Current estimate of the device peak rate, measured in -+ * [(sectors/usec) / 2^BFQ_RATE_SHIFT]. The left-shift by -+ * BFQ_RATE_SHIFT is performed to increase precision in -+ * fixed-point calculations. -+ */ -+ u32 peak_rate; -+ -+ /* maximum budget allotted to a bfq_queue before rescheduling */ -+ int bfq_max_budget; -+ -+ /* list of all the bfq_queues active on the device */ -+ struct list_head active_list; -+ /* list of all the bfq_queues idle on the device */ -+ struct list_head idle_list; -+ -+ /* -+ * Timeout for async/sync requests; when it fires, requests -+ * are served in fifo order. -+ */ -+ u64 bfq_fifo_expire[2]; -+ /* weight of backward seeks wrt forward ones */ -+ unsigned int bfq_back_penalty; -+ /* maximum allowed backward seek */ -+ unsigned int bfq_back_max; -+ /* maximum idling time */ -+ u32 bfq_slice_idle; -+ -+ /* user-configured max budget value (0 for auto-tuning) */ -+ int bfq_user_max_budget; -+ /* -+ * Timeout for bfq_queues to consume their budget; used to -+ * prevent seeky queues from imposing long latencies to -+ * sequential or quasi-sequential ones (this also implies that -+ * seeky queues cannot receive guarantees in the service -+ * domain; after a timeout they are charged for the time they -+ * have been in service, to preserve fairness among them, but -+ * without service-domain guarantees). -+ */ -+ unsigned int bfq_timeout; -+ -+ /* -+ * Number of consecutive requests that must be issued within -+ * the idle time slice to set again idling to a queue which -+ * was marked as non-I/O-bound (see the definition of the -+ * IO_bound flag for further details). 
-+ */ -+ unsigned int bfq_requests_within_timer; -+ -+ /* -+ * Force device idling whenever needed to provide accurate -+ * service guarantees, without caring about throughput -+ * issues. CAVEAT: this may even increase latencies, in case -+ * of useless idling for processes that did stop doing I/O. -+ */ -+ bool strict_guarantees; -+ -+ /* -+ * Last time at which a queue entered the current burst of -+ * queues being activated shortly after each other; for more -+ * details about this and the following parameters related to -+ * a burst of activations, see the comments on the function -+ * bfq_handle_burst. -+ */ -+ unsigned long last_ins_in_burst; -+ /* -+ * Reference time interval used to decide whether a queue has -+ * been activated shortly after @last_ins_in_burst. -+ */ -+ unsigned long bfq_burst_interval; -+ /* number of queues in the current burst of queue activations */ -+ int burst_size; -+ -+ /* common parent entity for the queues in the burst */ -+ struct bfq_entity *burst_parent_entity; -+ /* Maximum burst size above which the current queue-activation -+ * burst is deemed as 'large'. -+ */ -+ unsigned long bfq_large_burst_thresh; -+ /* true if a large queue-activation burst is in progress */ -+ bool large_burst; -+ /* -+ * Head of the burst list (as for the above fields, more -+ * details in the comments on the function bfq_handle_burst). -+ */ -+ struct hlist_head burst_list; -+ -+ /* if set to true, low-latency heuristics are enabled */ -+ bool low_latency; -+ /* -+ * Maximum factor by which the weight of a weight-raised queue -+ * is multiplied. -+ */ -+ unsigned int bfq_wr_coeff; -+ /* maximum duration of a weight-raising period (jiffies) */ -+ unsigned int bfq_wr_max_time; -+ -+ /* Maximum weight-raising duration for soft real-time processes */ -+ unsigned int bfq_wr_rt_max_time; -+ /* -+ * Minimum idle period after which weight-raising may be -+ * reactivated for a queue (in jiffies). 
-+ */ -+ unsigned int bfq_wr_min_idle_time; -+ /* -+ * Minimum period between request arrivals after which -+ * weight-raising may be reactivated for an already busy async -+ * queue (in jiffies). -+ */ -+ unsigned long bfq_wr_min_inter_arr_async; -+ -+ /* Max service-rate for a soft real-time queue, in sectors/sec */ -+ unsigned int bfq_wr_max_softrt_rate; -+ /* -+ * Cached value of the product ref_rate*ref_wr_duration, used -+ * for computing the maximum duration of weight raising -+ * automatically. -+ */ -+ u64 rate_dur_prod; -+ -+ /* fallback dummy bfqq for extreme OOM conditions */ -+ struct bfq_queue oom_bfqq; -+}; -+ -+enum bfqq_state_flags { -+ BFQ_BFQQ_FLAG_just_created = 0, /* queue just allocated */ -+ BFQ_BFQQ_FLAG_busy, /* has requests or is in service */ -+ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ -+ BFQ_BFQQ_FLAG_non_blocking_wait_rq, /* -+ * waiting for a request -+ * without idling the device -+ */ -+ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ -+ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ -+ BFQ_BFQQ_FLAG_has_short_ttime, /* queue has a short think time */ -+ BFQ_BFQQ_FLAG_sync, /* synchronous queue */ -+ BFQ_BFQQ_FLAG_IO_bound, /* -+ * bfqq has timed-out at least once -+ * having consumed at most 2/10 of -+ * its budget -+ */ -+ BFQ_BFQQ_FLAG_in_large_burst, /* -+ * bfqq activated in a large burst, -+ * see comments to bfq_handle_burst. 
-+ */ -+ BFQ_BFQQ_FLAG_softrt_update, /* -+ * may need softrt-next-start -+ * update -+ */ -+ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ -+ BFQ_BFQQ_FLAG_split_coop /* shared bfqq will be split */ -+}; -+ -+#define BFQ_BFQQ_FNS(name) \ -+static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ -+{ \ -+ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \ -+} \ -+static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ -+{ \ -+ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \ -+} \ -+static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ -+{ \ -+ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ -+} -+ -+BFQ_BFQQ_FNS(just_created); -+BFQ_BFQQ_FNS(busy); -+BFQ_BFQQ_FNS(wait_request); -+BFQ_BFQQ_FNS(non_blocking_wait_rq); -+BFQ_BFQQ_FNS(must_alloc); -+BFQ_BFQQ_FNS(fifo_expire); -+BFQ_BFQQ_FNS(has_short_ttime); -+BFQ_BFQQ_FNS(sync); -+BFQ_BFQQ_FNS(IO_bound); -+BFQ_BFQQ_FNS(in_large_burst); -+BFQ_BFQQ_FNS(coop); -+BFQ_BFQQ_FNS(split_coop); -+BFQ_BFQQ_FNS(softrt_update); -+#undef BFQ_BFQQ_FNS -+ -+/* Logging facilities. */ -+#ifdef CONFIG_BFQ_REDIRECT_TO_CONSOLE -+ -+static const char *checked_dev_name(const struct device *dev) -+{ -+ static const char nodev[] = "nodev"; -+ -+ if (dev) -+ return dev_name(dev); -+ -+ return nodev; -+} -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); -+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ -+ char __pbuf[128]; \ -+ \ -+ assert_spin_locked((bfqd)->queue->queue_lock); \ -+ blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ -+ pr_crit("%s bfq%d%c %s [%s] " fmt "\n", \ -+ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -+ (bfqq)->pid, \ -+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ __pbuf, __func__, ##args); \ -+} while (0) -+ -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) 
do { \ -+ char __pbuf[128]; \ -+ \ -+ blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ -+ pr_crit("%s %s [%s] " fmt "\n", \ -+ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -+ __pbuf, __func__, ##args); \ -+} while (0) -+ -+#else /* BFQ_GROUP_IOSCHED_ENABLED */ -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ -+ pr_crit("%s bfq%d%c [%s] " fmt "\n", \ -+ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -+ (bfqq)->pid, bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ __func__, ##args) -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) -+ -+#endif /* BFQ_GROUP_IOSCHED_ENABLED */ -+ -+#define bfq_log(bfqd, fmt, args...) \ -+ pr_crit("%s bfq [%s] " fmt "\n", \ -+ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -+ __func__, ##args) -+ -+#else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ -+ -+#if !defined(CONFIG_BLK_DEV_IO_TRACE) -+ -+/* Avoid possible "unused-variable" warning. See commit message. */ -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) ((void) (bfqq)) -+ -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) ((void) (bfqg)) -+ -+#define bfq_log(bfqd, fmt, args...) do {} while (0) -+ -+#else /* CONFIG_BLK_DEV_IO_TRACE */ -+ -+#include <linux/blktrace_api.h> -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); -+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ -+ char __pbuf[128]; \ -+ \ -+ assert_spin_locked((bfqd)->queue->queue_lock); \ -+ blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ -+ blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s [%s] " fmt, \ -+ (bfqq)->pid, \ -+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ __pbuf, __func__, ##args); \ -+} while (0) -+ -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) 
do { \ -+ char __pbuf[128]; \ -+ \ -+ blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ -+ blk_add_trace_msg((bfqd)->queue, "%s [%s] " fmt, __pbuf, \ -+ __func__, ##args); \ -+} while (0) -+ -+#else /* BFQ_GROUP_IOSCHED_ENABLED */ -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ -+ blk_add_trace_msg((bfqd)->queue, "bfq%d%c [%s] " fmt, (bfqq)->pid, \ -+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ __func__, ##args) -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) -+ -+#endif /* BFQ_GROUP_IOSCHED_ENABLED */ -+ -+#define bfq_log(bfqd, fmt, args...) \ -+ blk_add_trace_msg((bfqd)->queue, "bfq [%s] " fmt, __func__, ##args) -+ -+#endif /* CONFIG_BLK_DEV_IO_TRACE */ -+#endif /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ -+ -+/* Expiration reasons. */ -+enum bfqq_expiration { -+ BFQ_BFQQ_TOO_IDLE = 0, /* -+ * queue has been idling for -+ * too long -+ */ -+ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ -+ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ -+ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ -+ BFQ_BFQQ_PREEMPTED /* preemption in progress */ -+}; -+ -+ -+struct bfqg_stats { -+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) -+ /* number of ios merged */ -+ struct blkg_rwstat merged; -+ /* total time spent on device in ns, may not be accurate w/ queueing */ -+ struct blkg_rwstat service_time; -+ /* total time spent waiting in scheduler queue in ns */ -+ struct blkg_rwstat wait_time; -+ /* number of IOs queued up */ -+ struct blkg_rwstat queued; -+ /* total disk time and nr sectors dispatched by this group */ -+ struct blkg_stat time; -+ /* sum of number of ios queued across all samples */ -+ struct blkg_stat avg_queue_size_sum; -+ /* count of samples taken for average */ -+ struct blkg_stat avg_queue_size_samples; -+ /* how many times this group has been removed from service tree */ -+ struct blkg_stat dequeue; -+ /* total time spent waiting for it to be assigned a timeslice. 
*/ -+ struct blkg_stat group_wait_time; -+ /* time spent idling for this blkcg_gq */ -+ struct blkg_stat idle_time; -+ /* total time with empty current active q with other requests queued */ -+ struct blkg_stat empty_time; -+ /* fields after this shouldn't be cleared on stat reset */ -+ uint64_t start_group_wait_time; -+ uint64_t start_idle_time; -+ uint64_t start_empty_time; -+ uint16_t flags; -+#endif /* BFQ_GROUP_IOSCHED_ENABLED && CONFIG_DEBUG_BLK_CGROUP */ -+}; -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+/* -+ * struct bfq_group_data - per-blkcg storage for the blkio subsystem. -+ * -+ * @ps: @blkcg_policy_storage that this structure inherits -+ * @weight: weight of the bfq_group -+ */ -+struct bfq_group_data { -+ /* must be the first member */ -+ struct blkcg_policy_data pd; -+ -+ unsigned int weight; -+}; -+ -+/** -+ * struct bfq_group - per (device, cgroup) data structure. -+ * @entity: schedulable entity to insert into the parent group sched_data. -+ * @sched_data: own sched_data, to contain child entities (they may be -+ * both bfq_queues and bfq_groups). -+ * @bfqd: the bfq_data for the device this group acts upon. -+ * @async_bfqq: array of async queues for all the tasks belonging to -+ * the group, one queue per ioprio value per ioprio_class, -+ * except for the idle class that has only one queue. -+ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). -+ * @my_entity: pointer to @entity, %NULL for the toplevel group; used -+ * to avoid too many special cases during group creation/ -+ * migration. -+ * @active_entities: number of active entities belonging to the group; -+ * unused for the root group. Used to know whether there -+ * are groups with more than one active @bfq_entity -+ * (see the comments to the function -+ * bfq_bfqq_may_idle()). -+ * @rq_pos_tree: rbtree sorted by next_request position, used when -+ * determining if two or more queues have interleaving -+ * requests (see bfq_find_close_cooperator()). 
-+ * -+ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup -+ * there is a set of bfq_groups, each one collecting the lower-level -+ * entities belonging to the group that are acting on the same device. -+ * -+ * Locking works as follows: -+ * o @bfqd is protected by the queue lock, RCU is used to access it -+ * from the readers. -+ * o All the other fields are protected by the @bfqd queue lock. -+ */ -+struct bfq_group { -+ /* must be the first member */ -+ struct blkg_policy_data pd; -+ -+ struct bfq_entity entity; -+ struct bfq_sched_data sched_data; -+ -+ void *bfqd; -+ -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; -+ struct bfq_queue *async_idle_bfqq; -+ -+ struct bfq_entity *my_entity; -+ -+ int active_entities; -+ -+ struct rb_root rq_pos_tree; -+ -+ struct bfqg_stats stats; -+}; -+ -+#else -+struct bfq_group { -+ struct bfq_sched_data sched_data; -+ -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; -+ struct bfq_queue *async_idle_bfqq; -+ -+ struct rb_root rq_pos_tree; -+}; -+#endif -+ -+static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); -+ -+static unsigned int bfq_class_idx(struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ return bfqq ? 
bfqq->ioprio_class - 1 : -+ BFQ_DEFAULT_GRP_CLASS - 1; -+} -+ -+static unsigned int bfq_tot_busy_queues(struct bfq_data *bfqd) -+{ -+ return bfqd->busy_queues[0] + bfqd->busy_queues[1] + -+ bfqd->busy_queues[2]; -+} -+ -+static struct bfq_service_tree * -+bfq_entity_service_tree(struct bfq_entity *entity) -+{ -+ struct bfq_sched_data *sched_data = entity->sched_data; -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ unsigned int idx = bfq_class_idx(entity); -+ -+ BUG_ON(idx >= BFQ_IOPRIO_CLASSES); -+ BUG_ON(sched_data == NULL); -+ -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "%p %d", -+ sched_data->service_tree + idx, idx); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "%p %d", -+ sched_data->service_tree + idx, idx); -+ } -+#endif -+ return sched_data->service_tree + idx; -+} -+ -+static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) -+{ -+ return bic->bfqq[is_sync]; -+} -+ -+static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, -+ bool is_sync) -+{ -+ bic->bfqq[is_sync] = bfqq; -+} -+ -+static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) -+{ -+ return bic->icq.q->elevator->elevator_data; -+} -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ -+static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *group_entity = bfqq->entity.parent; -+ -+ if (!group_entity) -+ group_entity = &bfqq->bfqd->root_group->entity; -+ -+ return container_of(group_entity, struct bfq_group, entity); -+} -+ -+#else -+ -+static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) -+{ -+ return bfqq->bfqd->root_group; -+} -+ -+#endif -+ -+static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); -+static void bfq_put_queue(struct bfq_queue *bfqq); -+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); -+static struct 
bfq_queue *bfq_get_queue(struct bfq_data *bfqd, -+ struct bio *bio, bool is_sync, -+ struct bfq_io_cq *bic); -+static void bfq_end_wr_async_queues(struct bfq_data *bfqd, -+ struct bfq_group *bfqg); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); -+#endif -+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); -+ -+#endif /* _BFQ_H */ -diff --git a/block/blk-mq.c b/block/blk-mq.c -index e3c39ea8e17b..7a57368841f6 100644 ---- a/block/blk-mq.c -+++ b/block/blk-mq.c -@@ -2878,6 +2878,8 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) - } - if (ret) - break; -+ if (q->elevator && q->elevator->type->ops.mq.depth_updated) -+ q->elevator->type->ops.mq.depth_updated(hctx); - } - - if (!ret) -diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h -index 6980014357d4..8c4568ea6884 100644 ---- a/include/linux/blkdev.h -+++ b/include/linux/blkdev.h -@@ -54,7 +54,7 @@ struct blk_stat_callback; - * Maximum number of blkcg policies allowed to be registered concurrently. - * Defined here to simplify include dependency. 
- */ --#define BLKCG_MAX_POLS 5 -+#define BLKCG_MAX_POLS 7 - - typedef void (rq_end_io_fn)(struct request *, blk_status_t); - -@@ -127,6 +127,10 @@ typedef __u32 __bitwise req_flags_t; - #define RQF_MQ_POLL_SLEPT ((__force req_flags_t)(1 << 20)) - /* ->timeout has been called, don't expire again */ - #define RQF_TIMED_OUT ((__force req_flags_t)(1 << 21)) -+/* DEBUG: rq in bfq-mq dispatch list */ -+#define RQF_DISP_LIST ((__force req_flags_t)(1 << 22)) -+/* DEBUG: rq had get_rq_private executed on it */ -+#define RQF_GOT ((__force req_flags_t)(1 << 23)) - - /* flags that prevent us from merging requests: */ - #define RQF_NOMERGE_FLAGS \ -diff --git a/include/linux/elevator.h b/include/linux/elevator.h -index a02deea30185..a2bf4a6b9316 100644 ---- a/include/linux/elevator.h -+++ b/include/linux/elevator.h -@@ -99,6 +99,7 @@ struct elevator_mq_ops { - void (*exit_sched)(struct elevator_queue *); - int (*init_hctx)(struct blk_mq_hw_ctx *, unsigned int); - void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int); -+ void (*depth_updated)(struct blk_mq_hw_ctx *); - - bool (*allow_merge)(struct request_queue *, struct request *, struct bio *); - bool (*bio_merge)(struct blk_mq_hw_ctx *, struct bio *); diff --git a/sys-kernel/linux-sources-redcore-lts/linux-sources-redcore-lts-4.14.95.ebuild b/sys-kernel/linux-sources-redcore-lts/linux-sources-redcore-lts-4.14.95-r1.ebuild index fa4ea06b..27a6fcaa 100644 --- a/sys-kernel/linux-sources-redcore-lts/linux-sources-redcore-lts-4.14.95.ebuild +++ b/sys-kernel/linux-sources-redcore-lts/linux-sources-redcore-lts-4.14.95-r1.ebuild @@ -5,7 +5,7 @@ EAPI=6 inherit eutils -EXTRAVERSION="redcore-lts" +EXTRAVERSION="redcore-lts-r1" KV_FULL="${PV}-${EXTRAVERSION}" KV_MAJOR="4.14" @@ -15,7 +15,7 @@ SRC_URI="https://cdn.kernel.org/pub/linux/kernel/v4.x/linux-${PV}.tar.xz" KEYWORDS="amd64" LICENSE="GPL-2" -SLOT="${PV}" +SLOT="${PVR}" IUSE="" RESTRICT="strip mirror" @@ -52,8 +52,6 @@ PATCHES=( 
"${FILESDIR}"/"${KV_MAJOR}"-0015-MuQSS.c-needs-irq_regs.h-to-use-get_irq_regs.patch "${FILESDIR}"/"${KV_MAJOR}"-0016-unfuck-MuQSS-on-linux-4_14_15+.patch "${FILESDIR}"/"${KV_MAJOR}"-0017-unfuck-MuQSS-on-linux-4_14_75+.patch - "${FILESDIR}"/"${KV_MAJOR}"-0001-BFQ-v8r12-20171108.patch - "${FILESDIR}"/"${KV_MAJOR}"-0002-BFQ-v8r12-20180404.patch ) S="${WORKDIR}"/linux-"${PV}" @@ -70,7 +68,7 @@ src_prepare() { default emake mrproper sed -ri "s|^(EXTRAVERSION =).*|\1 -${EXTRAVERSION}|" Makefile - cp "${FILESDIR}"/"${KV_MAJOR}"-"${EXTRAVERSION}"-amd64.config .config + cp "${FILESDIR}"/"${KV_MAJOR}"-amd64.config .config rm -rf $(find . -type f|grep -F \.orig) } diff --git a/sys-kernel/linux-sources-redcore-lts/linux-sources-redcore-lts-4.19.20.ebuild b/sys-kernel/linux-sources-redcore-lts/linux-sources-redcore-lts-4.19.20-r1.ebuild index f19e2862..d2326b96 100644 --- a/sys-kernel/linux-sources-redcore-lts/linux-sources-redcore-lts-4.19.20.ebuild +++ b/sys-kernel/linux-sources-redcore-lts/linux-sources-redcore-lts-4.19.20-r1.ebuild @@ -5,7 +5,7 @@ EAPI=6 inherit eutils -EXTRAVERSION="redcore-lts" +EXTRAVERSION="redcore-lts-r1" KV_FULL="${PV}-${EXTRAVERSION}" KV_MAJOR="4.19" @@ -15,7 +15,7 @@ SRC_URI="https://cdn.kernel.org/pub/linux/kernel/v4.x/linux-${PV}.tar.xz" KEYWORDS="amd64" LICENSE="GPL-2" -SLOT="${PV}" +SLOT="${PVR}" IUSE="" RESTRICT="strip mirror" @@ -38,7 +38,6 @@ PATCHES=( "${FILESDIR}"/"${KV_MAJOR}"-revert-patches-causing-instant-reboot.patch "${FILESDIR}"/"${KV_MAJOR}"-linux-hardened.patch "${FILESDIR}"/"${KV_MAJOR}"-uksm-linux-hardened.patch - "${FILESDIR}"/"${KV_MAJOR}"-bfq-sq-mq-v9r1-2K190204-rc1.patch "${FILESDIR}"/"${KV_MAJOR}"-0001-MultiQueue-Skiplist-Scheduler-version-v0.180-linux-hardened.patch "${FILESDIR}"/"${KV_MAJOR}"-0002-Fix-Werror-build-failure-in-tools.patch "${FILESDIR}"/"${KV_MAJOR}"-0003-Make-preemptible-kernel-default.patch @@ -71,7 +70,7 @@ src_prepare() { default emake mrproper sed -ri "s|^(EXTRAVERSION =).*|\1 -${EXTRAVERSION}|" Makefile 
- cp "${FILESDIR}"/"${KV_MAJOR}"-"${EXTRAVERSION}"-amd64.config .config + cp "${FILESDIR}"/"${KV_MAJOR}"-amd64.config .config rm -rf $(find . -type f|grep -F \.orig) } |