From b782bbfcb5e08e92c0448d0c6a870b44db198837 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Mon, 16 May 2016 11:16:17 +0200 Subject: [PATCH 4/4] Turn BFQ-v7r11 for 4.10.0 into BFQ-v8r8 for 4.10.0 Signed-off-by: Paolo Valente --- Documentation/block/00-INDEX | 2 + Documentation/block/bfq-iosched.txt | 530 ++++++ block/Kconfig.iosched | 18 +- block/bfq-cgroup.c | 510 +++--- block/bfq-iosched.c | 3414 ++++++++++++++++++++++------------- block/bfq-sched.c | 1290 ++++++++++--- block/bfq.h | 800 ++++---- 7 files changed, 4390 insertions(+), 2174 deletions(-) create mode 100644 Documentation/block/bfq-iosched.txt diff --git a/Documentation/block/00-INDEX b/Documentation/block/00-INDEX index e55103a..8d55b4b 100644 --- a/Documentation/block/00-INDEX +++ b/Documentation/block/00-INDEX @@ -1,5 +1,7 @@ 00-INDEX - This file +bfq-iosched.txt + - BFQ IO scheduler and its tunables biodoc.txt - Notes on the Generic Block Layer Rewrite in Linux 2.5 biovecs.txt diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt new file mode 100644 index 0000000..13b5248 --- /dev/null +++ b/Documentation/block/bfq-iosched.txt @@ -0,0 +1,530 @@ +BFQ (Budget Fair Queueing) +========================== + +BFQ is a proportional-share I/O scheduler, with some extra +low-latency capabilities. In addition to cgroups support (blkio or io +controllers), BFQ's main features are: +- BFQ guarantees a high system and application responsiveness, and a + low latency for time-sensitive applications, such as audio or video + players; +- BFQ distributes bandwidth, and not just time, among processes or + groups (switching back to time distribution when needed to keep + throughput high). + +On average CPUs, the current version of BFQ can handle devices +performing at most ~30K IOPS; at most ~50 KIOPS on faster CPUs. As a +reference, 30-50 KIOPS correspond to very high bandwidths with +sequential I/O (e.g., 8-12 GB/s if I/O requests are 256 KB large), and +to 120-200 MB/s with 4KB random I/O. + +The table of contents follow. Impatients can just jump to Section 3. + +CONTENTS + +1. When may BFQ be useful? + 1-1 Personal systems + 1-2 Server systems +2. How does BFQ work? +3. What are BFQ's tunable? +4. BFQ group scheduling + 4-1 Service guarantees provided + 4-2 Interface + +1. When may BFQ be useful? +========================== + +BFQ provides the following benefits on personal and server systems. + +1-1 Personal systems +-------------------- + +Low latency for interactive applications + +Regardless of the actual background workload, BFQ guarantees that, for +interactive tasks, the storage device is virtually as responsive as if +it was idle. For example, even if one or more of the following +background workloads are being executed: +- one or more large files are being read, written or copied, +- a tree of source files is being compiled, +- one or more virtual machines are performing I/O, +- a software update is in progress, +- indexing daemons are scanning filesystems and updating their + databases, +starting an application or loading a file from within an application +takes about the same time as if the storage device was idle. As a +comparison, with CFQ, NOOP or DEADLINE, and in the same conditions, +applications experience high latencies, or even become unresponsive +until the background workload terminates (also on SSDs). 
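+
+(To reproduce the scenario above on a given drive, BFQ must be the
+active scheduler for that drive. The following sketch uses the
+standard sysfs interface for selecting an I/O scheduler; it assumes a
+hypothetical device sdb and a kernel with BFQ built in or loaded as a
+module.)
+
+    # list the available schedulers; the active one is shown in brackets
+    cat /sys/block/sdb/queue/scheduler
+    # make BFQ the active scheduler for sdb
+    echo bfq > /sys/block/sdb/queue/scheduler
+    # the tunables described in Section 3 then appear here
+    ls /sys/block/sdb/queue/iosched/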
+ +Low latency for soft real-time applications + +Also soft real-time applications, such as audio and video +players/streamers, enjoy a low latency and a low drop rate, regardless +of the background I/O workload. As a consequence, these applications +do not suffer from almost any glitch due to the background workload. + +Higher speed for code-development tasks + +If some additional workload happens to be executed in parallel, then +BFQ executes the I/O-related components of typical code-development +tasks (compilation, checkout, merge, ...) much more quickly than CFQ, +NOOP or DEADLINE. + +High throughput + +On hard disks, BFQ achieves up to 30% higher throughput than CFQ, and +up to 150% higher throughput than DEADLINE and NOOP, with all the +sequential workloads considered in our tests. With random workloads, +and with all the workloads on flash-based devices, BFQ achieves, +instead, about the same throughput as the other schedulers. + +Strong fairness, bandwidth and delay guarantees + +BFQ distributes the device throughput, and not just the device time, +among I/O-bound applications in proportion their weights, with any +workload and regardless of the device parameters. From these bandwidth +guarantees, it is possible to compute tight per-I/O-request delay +guarantees by a simple formula. If not configured for strict service +guarantees, BFQ switches to time-based resource sharing (only) for +applications that would otherwise cause a throughput loss. + +1-2 Server systems +------------------ + +Most benefits for server systems follow from the same service +properties as above. In particular, regardless of whether additional, +possibly heavy workloads are being served, BFQ guarantees: + +. audio and video-streaming with zero or very low jitter and drop + rate; + +. fast retrieval of WEB pages and embedded objects; + +. real-time recording of data in live-dumping applications (e.g., + packet logging); + +. responsiveness in local and remote access to a server. + + +2. How does BFQ work? +===================== + +BFQ is a proportional-share I/O scheduler, whose general structure, +plus a lot of code, are borrowed from CFQ. + +- Each process doing I/O on a device is associated with a weight and a + (bfq_)queue. + +- BFQ grants exclusive access to the device, for a while, to one queue + (process) at a time, and implements this service model by + associating every queue with a budget, measured in number of + sectors. + + - After a queue is granted access to the device, the budget of the + queue is decremented, on each request dispatch, by the size of the + request. + + - The in-service queue is expired, i.e., its service is suspended, + only if one of the following events occurs: 1) the queue finishes + its budget, 2) the queue empties, 3) a "budget timeout" fires. + + - The budget timeout prevents processes doing random I/O from + holding the device for too long and dramatically reducing + throughput. + + - Actually, as in CFQ, a queue associated with a process issuing + sync requests may not be expired immediately when it empties. In + contrast, BFQ may idle the device for a short time interval, + giving the process the chance to go on being served if it issues + a new request in time. Device idling typically boosts the + throughput on rotational devices, if processes do synchronous + and sequential I/O. 
In addition, under BFQ, device idling is + also instrumental in guaranteeing the desired throughput + fraction to processes issuing sync requests (see the description + of the slice_idle tunable in this document, or [1, 2], for more + details). + + - With respect to idling for service guarantees, if several + processes are competing for the device at the same time, but + all processes (and groups, after the following commit) have + the same weight, then BFQ guarantees the expected throughput + distribution without ever idling the device. Throughput is + thus as high as possible in this common scenario. + + - If low-latency mode is enabled (default configuration), BFQ + executes some special heuristics to detect interactive and soft + real-time applications (e.g., video or audio players/streamers), + and to reduce their latency. The most important action taken to + achieve this goal is to give to the queues associated with these + applications more than their fair share of the device + throughput. For brevity, we call just "weight-raising" the whole + sets of actions taken by BFQ to privilege these queues. In + particular, BFQ provides a milder form of weight-raising for + interactive applications, and a stronger form for soft real-time + applications. + + - BFQ automatically deactivates idling for queues born in a burst of + queue creations. In fact, these queues are usually associated with + the processes of applications and services that benefit mostly + from a high throughput. Examples are systemd during boot, or git + grep. + + - As CFQ, BFQ merges queues performing interleaved I/O, i.e., + performing random I/O that becomes mostly sequential if + merged. Differently from CFQ, BFQ achieves this goal with a more + reactive mechanism, called Early Queue Merge (EQM). EQM is so + responsive in detecting interleaved I/O (cooperating processes), + that it enables BFQ to achieve a high throughput, by queue + merging, even for queues for which CFQ needs a different + mechanism, preemption, to get a high throughput. As such EQM is a + unified mechanism to achieve a high throughput with interleaved + I/O. + + - Queues are scheduled according to a variant of WF2Q+, named + B-WF2Q+, and implemented using an augmented rb-tree to preserve an + O(log N) overall complexity. See [2] for more details. B-WF2Q+ is + also ready for hierarchical scheduling. However, for a cleaner + logical breakdown, the code that enables and completes + hierarchical support is provided in the next commit, which focuses + exactly on this feature. + + - B-WF2Q+ guarantees a tight deviation with respect to an ideal, + perfectly fair, and smooth service. In particular, B-WF2Q+ + guarantees that each queue receives a fraction of the device + throughput proportional to its weight, even if the throughput + fluctuates, and regardless of: the device parameters, the current + workload and the budgets assigned to the queue. + + - The last, budget-independence, property (although probably + counterintuitive in the first place) is definitely beneficial, for + the following reasons: + + - First, with any proportional-share scheduler, the maximum + deviation with respect to an ideal service is proportional to + the maximum budget (slice) assigned to queues. As a consequence, + BFQ can keep this deviation tight not only because of the + accurate service of B-WF2Q+, but also because BFQ *does not* + need to assign a larger budget to a queue to let the queue + receive a higher fraction of the device throughput. 
+ + - Second, BFQ is free to choose, for every process (queue), the + budget that best fits the needs of the process, or best + leverages the I/O pattern of the process. In particular, BFQ + updates queue budgets with a simple feedback-loop algorithm that + allows a high throughput to be achieved, while still providing + tight latency guarantees to time-sensitive applications. When + the in-service queue expires, this algorithm computes the next + budget of the queue so as to: + + - Let large budgets be eventually assigned to the queues + associated with I/O-bound applications performing sequential + I/O: in fact, the longer these applications are served once + got access to the device, the higher the throughput is. + + - Let small budgets be eventually assigned to the queues + associated with time-sensitive applications (which typically + perform sporadic and short I/O), because, the smaller the + budget assigned to a queue waiting for service is, the sooner + B-WF2Q+ will serve that queue (Subsec 3.3 in [2]). + +- If several processes are competing for the device at the same time, + but all processes and groups have the same weight, then BFQ + guarantees the expected throughput distribution without ever idling + the device. It uses preemption instead. Throughput is then much + higher in this common scenario. + +- ioprio classes are served in strict priority order, i.e., + lower-priority queues are not served as long as there are + higher-priority queues. Among queues in the same class, the + bandwidth is distributed in proportion to the weight of each + queue. A very thin extra bandwidth is however guaranteed to + the Idle class, to prevent it from starving. + + +3. What are BFQ's tunable? +========================== + +The tunables back_seek-max, back_seek_penalty, fifo_expire_async and +fifo_expire_sync below are the same as in CFQ. Their description is +just copied from that for CFQ. Some considerations in the description +of slice_idle are copied from CFQ too. + +per-process ioprio and weight +----------------------------- + +Unless the cgroups interface is used (see "4. BFQ group scheduling"), +weights can be assigned to processes only indirectly, through I/O +priorities, and according to the relation: +weight = (IOPRIO_BE_NR - ioprio) * 10. + +Beware that, if low-latency is set, then BFQ automatically raises the +weight of the queues associated with interactive and soft real-time +applications. Unset this tunable if you need/want to control weights. + +slice_idle +---------- + +This parameter specifies how long BFQ should idle for next I/O +request, when certain sync BFQ queues become empty. By default +slice_idle is a non-zero value. Idling has a double purpose: boosting +throughput and making sure that the desired throughput distribution is +respected (see the description of how BFQ works, and, if needed, the +papers referred there). + +As for throughput, idling can be very helpful on highly seeky media +like single spindle SATA/SAS disks where we can cut down on overall +number of seeks and see improved throughput. + +Setting slice_idle to 0 will remove all the idling on queues and one +should see an overall improved throughput on faster storage devices +like multiple SATA/SAS disks in hardware RAID configuration. + +So depending on storage and workload, it might be useful to set +slice_idle=0. In general for SATA/SAS disks and software RAID of +SATA/SAS disks keeping slice_idle enabled should be useful. 
For any +configurations where there are multiple spindles behind single LUN +(Host based hardware RAID controller or for storage arrays), setting +slice_idle=0 might end up in better throughput and acceptable +latencies. + +Idling is however necessary to have service guarantees enforced in +case of differentiated weights or differentiated I/O-request lengths. +To see why, suppose that a given BFQ queue A must get several I/O +requests served for each request served for another queue B. Idling +ensures that, if A makes a new I/O request slightly after becoming +empty, then no request of B is dispatched in the middle, and thus A +does not lose the possibility to get more than one request dispatched +before the next request of B is dispatched. Note that idling +guarantees the desired differentiated treatment of queues only in +terms of I/O-request dispatches. To guarantee that the actual service +order then corresponds to the dispatch order, the strict_guarantees +tunable must be set too. + +There is an important flipside for idling: apart from the above cases +where it is beneficial also for throughput, idling can severely impact +throughput. One important case is random workload. Because of this +issue, BFQ tends to avoid idling as much as possible, when it is not +beneficial also for throughput. As a consequence of this behavior, and +of further issues described for the strict_guarantees tunable, +short-term service guarantees may be occasionally violated. And, in +some cases, these guarantees may be more important than guaranteeing +maximum throughput. For example, in video playing/streaming, a very +low drop rate may be more important than maximum throughput. In these +cases, consider setting the strict_guarantees parameter. + +strict_guarantees +----------------- + +If this parameter is set (default: unset), then BFQ + +- always performs idling when the in-service queue becomes empty; + +- forces the device to serve one I/O request at a time, by dispatching a + new request only if there is no outstanding request. + +In the presence of differentiated weights or I/O-request sizes, both +the above conditions are needed to guarantee that every BFQ queue +receives its allotted share of the bandwidth. The first condition is +needed for the reasons explained in the description of the slice_idle +tunable. The second condition is needed because all modern storage +devices reorder internally-queued requests, which may trivially break +the service guarantees enforced by the I/O scheduler. + +Setting strict_guarantees may evidently affect throughput. + +back_seek_max +------------- + +This specifies, given in Kbytes, the maximum "distance" for backward seeking. +The distance is the amount of space from the current head location to the +sectors that are backward in terms of distance. + +This parameter allows the scheduler to anticipate requests in the "backward" +direction and consider them as being the "next" if they are within this +distance from the current head location. + +back_seek_penalty +----------------- + +This parameter is used to compute the cost of backward seeking. If the +backward distance of request is just 1/back_seek_penalty from a "front" +request, then the seeking cost of two requests is considered equivalent. + +So scheduler will not bias toward one or the other request (otherwise scheduler +will bias toward front request). Default value of back_seek_penalty is 2. + +fifo_expire_async +----------------- + +This parameter is used to set the timeout of asynchronous requests. 
Default +value of this is 248ms. + +fifo_expire_sync +---------------- + +This parameter is used to set the timeout of synchronous requests. Default +value of this is 124ms. In case to favor synchronous requests over asynchronous +one, this value should be decreased relative to fifo_expire_async. + +low_latency +----------- + +This parameter is used to enable/disable BFQ's low latency mode. By +default, low latency mode is enabled. If enabled, interactive and soft +real-time applications are privileged and experience a lower latency, +as explained in more detail in the description of how BFQ works. + +DO NOT enable this mode if you need full control on bandwidth +distribution. In fact, if it is enabled, then BFQ automatically +increases the bandwidth share of privileged applications, as the main +means to guarantee a lower latency to them. + +timeout_sync +------------ + +Maximum amount of device time that can be given to a task (queue) once +it has been selected for service. On devices with costly seeks, +increasing this time usually increases maximum throughput. On the +opposite end, increasing this time coarsens the granularity of the +short-term bandwidth and latency guarantees, especially if the +following parameter is set to zero. + +max_budget +---------- + +Maximum amount of service, measured in sectors, that can be provided +to a BFQ queue once it is set in service (of course within the limits +of the above timeout). According to what said in the description of +the algorithm, larger values increase the throughput in proportion to +the percentage of sequential I/O requests issued. The price of larger +values is that they coarsen the granularity of short-term bandwidth +and latency guarantees. + +The default value is 0, which enables auto-tuning: BFQ sets max_budget +to the maximum number of sectors that can be served during +timeout_sync, according to the estimated peak rate. + +weights +------- + +Read-only parameter, used to show the weights of the currently active +BFQ queues. + + +wr_ tunables +------------ + +BFQ exports a few parameters to control/tune the behavior of +low-latency heuristics. + +wr_coeff + +Factor by which the weight of a weight-raised queue is multiplied. If +the queue is deemed soft real-time, then the weight is further +multiplied by an additional, constant factor. + +wr_max_time + +Maximum duration of a weight-raising period for an interactive task +(ms). If set to zero (default value), then this value is computed +automatically, as a function of the peak rate of the device. In any +case, when the value of this parameter is read, it always reports the +current duration, regardless of whether it has been set manually or +computed automatically. + +wr_max_softrt_rate + +Maximum service rate below which a queue is deemed to be associated +with a soft real-time application, and is then weight-raised +accordingly (sectors/sec). + +wr_min_idle_time + +Minimum idle period after which interactive weight-raising may be +reactivated for a queue (in ms). + +wr_rt_max_time + +Maximum weight-raising duration for soft real-time queues (in ms). The +start time from which this duration is considered is automatically +moved forward if the queue is detected to be still soft real-time +before the current soft real-time weight-raising period finishes. + +wr_min_inter_arr_async + +Minimum period between I/O request arrivals after which weight-raising +may be reactivated for an already busy async queue (in ms). + + +4. 
Group scheduling with BFQ +============================ + +BFQ supports both cgroups-v1 and cgroups-v2 io controllers, namely +blkio and io. In particular, BFQ supports weight-based proportional +share. To activate cgroups support, set BFQ_GROUP_IOSCHED. + +4-1 Service guarantees provided +------------------------------- + +With BFQ, proportional share means true proportional share of the +device bandwidth, according to group weights. For example, a group +with weight 200 gets twice the bandwidth, and not just twice the time, +of a group with weight 100. + +BFQ supports hierarchies (group trees) of any depth. Bandwidth is +distributed among groups and processes in the expected way: for each +group, the children of the group share the whole bandwidth of the +group in proportion to their weights. In particular, this implies +that, for each leaf group, every process of the group receives the +same share of the whole group bandwidth, unless the ioprio of the +process is modified. + +The resource-sharing guarantee for a group may partially or totally +switch from bandwidth to time, if providing bandwidth guarantees to +the group lowers the throughput too much. This switch occurs on a +per-process basis: if a process of a leaf group causes throughput loss +if served in such a way to receive its share of the bandwidth, then +BFQ switches back to just time-based proportional share for that +process. + +4-2 Interface +------------- + +To get proportional sharing of bandwidth with BFQ for a given device, +BFQ must of course be the active scheduler for that device. + +Within each group directory, the names of the files associated with +BFQ-specific cgroup parameters and stats begin with the "bfq." +prefix. So, with cgroups-v1 or cgroups-v2, the full prefix for +BFQ-specific files is "blkio.bfq." or "io.bfq." For example, the group +parameter to set the weight of a group with BFQ is blkio.bfq.weight +or io.bfq.weight. + +Parameters to set +----------------- + +For each group, there is only the following parameter to set. + +weight (namely blkio.bfq.weight or io.bfq-weight): the weight of the +group inside its parent. Available values: 1..10000 (default 100). The +linear mapping between ioprio and weights, described at the beginning +of the tunable section, is still valid, but all weights higher than +IOPRIO_BE_NR*10 are mapped to ioprio 0. + +Recall that, if low-latency is set, then BFQ automatically raises the +weight of the queues associated with interactive and soft real-time +applications. Unset this tunable if you need/want to control weights. + + +[1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O + Scheduler", Proceedings of the First Workshop on Mobile System + Technologies (MST-2015), May 2015. + http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf + +[2] P. Valente and M. Andreolini, "Improving Application + Responsiveness with the BFQ Disk I/O Scheduler", Proceedings of + the 5th Annual International Systems and Storage Conference + (SYSTOR '12), June 2012. + Slightly extended version: + http://algogroup.unimore.it/people/paolo/disk_sched/bfq-v1-suite- + results.pdf diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index f78cd1a..f2cd945 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -43,20 +43,20 @@ config IOSCHED_BFQ tristate "BFQ I/O scheduler" default n ---help--- - The BFQ I/O scheduler tries to distribute bandwidth among - all processes according to their weights. 
- It aims at distributing the bandwidth as desired, independently of - the disk parameters and with any workload. It also tries to - guarantee low latency to interactive and soft real-time - applications. If compiled built-in (saying Y here), BFQ can - be configured to support hierarchical scheduling. + The BFQ I/O scheduler distributes bandwidth among all + processes according to their weights, regardless of the + device parameters and with any workload. It also guarantees + a low latency to interactive and soft real-time applications. + Details in Documentation/block/bfq-iosched.txt config BFQ_GROUP_IOSCHED bool "BFQ hierarchical scheduling support" - depends on CGROUPS && IOSCHED_BFQ=y + depends on IOSCHED_BFQ && BLK_CGROUP default n ---help--- - Enable hierarchical scheduling in BFQ, using the blkio controller. + + Enable hierarchical scheduling in BFQ, using the blkio + (cgroups-v1) or io (cgroups-v2) controller. choice prompt "Default I/O scheduler" diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 0367996..0125275 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -7,7 +7,9 @@ * Copyright (C) 2008 Fabio Checconi * Paolo Valente * - * Copyright (C) 2010 Paolo Valente + * Copyright (C) 2015 Paolo Valente + * + * Copyright (C) 2016 Paolo Valente * * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ * file. @@ -163,8 +165,6 @@ static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg) { struct blkg_policy_data *pd = blkg_to_pd(blkg, &blkcg_policy_bfq); - BUG_ON(!pd); - return pd_to_bfqg(pd); } @@ -208,59 +208,47 @@ static void bfqg_put(struct bfq_group *bfqg) static void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, - int rw) + unsigned int op) { - blkg_rwstat_add(&bfqg->stats.queued, rw, 1); + blkg_rwstat_add(&bfqg->stats.queued, op, 1); bfqg_stats_end_empty_time(&bfqg->stats); if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue)) bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); } -static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, int rw) -{ - blkg_rwstat_add(&bfqg->stats.queued, rw, -1); -} - -static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, int rw) +static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { - blkg_rwstat_add(&bfqg->stats.merged, rw, 1); + blkg_rwstat_add(&bfqg->stats.queued, op, -1); } -static void bfqg_stats_update_dispatch(struct bfq_group *bfqg, - uint64_t bytes, int rw) +static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { - blkg_stat_add(&bfqg->stats.sectors, bytes >> 9); - blkg_rwstat_add(&bfqg->stats.serviced, rw, 1); - blkg_rwstat_add(&bfqg->stats.service_bytes, rw, bytes); + blkg_rwstat_add(&bfqg->stats.merged, op, 1); } static void bfqg_stats_update_completion(struct bfq_group *bfqg, - uint64_t start_time, uint64_t io_start_time, int rw) + uint64_t start_time, uint64_t io_start_time, + unsigned int op) { struct bfqg_stats *stats = &bfqg->stats; unsigned long long now = sched_clock(); if (time_after64(now, io_start_time)) - blkg_rwstat_add(&stats->service_time, rw, now - io_start_time); + blkg_rwstat_add(&stats->service_time, op, + now - io_start_time); if (time_after64(io_start_time, start_time)) - blkg_rwstat_add(&stats->wait_time, rw, + blkg_rwstat_add(&stats->wait_time, op, io_start_time - start_time); } /* @stats = 0 */ static void bfqg_stats_reset(struct bfqg_stats *stats) { - if (!stats) - return; - /* queued stats shouldn't be cleared */ - 
blkg_rwstat_reset(&stats->service_bytes); - blkg_rwstat_reset(&stats->serviced); blkg_rwstat_reset(&stats->merged); blkg_rwstat_reset(&stats->service_time); blkg_rwstat_reset(&stats->wait_time); blkg_stat_reset(&stats->time); - blkg_stat_reset(&stats->unaccounted_time); blkg_stat_reset(&stats->avg_queue_size_sum); blkg_stat_reset(&stats->avg_queue_size_samples); blkg_stat_reset(&stats->dequeue); @@ -270,19 +258,16 @@ static void bfqg_stats_reset(struct bfqg_stats *stats) } /* @to += @from */ -static void bfqg_stats_merge(struct bfqg_stats *to, struct bfqg_stats *from) +static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from) { if (!to || !from) return; /* queued stats shouldn't be cleared */ - blkg_rwstat_add_aux(&to->service_bytes, &from->service_bytes); - blkg_rwstat_add_aux(&to->serviced, &from->serviced); blkg_rwstat_add_aux(&to->merged, &from->merged); blkg_rwstat_add_aux(&to->service_time, &from->service_time); blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); blkg_stat_add_aux(&from->time, &from->time); - blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time); blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples); @@ -311,10 +296,8 @@ static void bfqg_stats_xfer_dead(struct bfq_group *bfqg) if (unlikely(!parent)) return; - bfqg_stats_merge(&parent->dead_stats, &bfqg->stats); - bfqg_stats_merge(&parent->dead_stats, &bfqg->dead_stats); + bfqg_stats_add_aux(&parent->stats, &bfqg->stats); bfqg_stats_reset(&bfqg->stats); - bfqg_stats_reset(&bfqg->dead_stats); } static void bfq_init_entity(struct bfq_entity *entity, @@ -329,21 +312,17 @@ static void bfq_init_entity(struct bfq_entity *entity, bfqq->ioprio_class = bfqq->new_ioprio_class; bfqg_get(bfqg); } - entity->parent = bfqg->my_entity; + entity->parent = bfqg->my_entity; /* NULL for root group */ entity->sched_data = &bfqg->sched_data; } static void bfqg_stats_exit(struct bfqg_stats *stats) { - blkg_rwstat_exit(&stats->service_bytes); - blkg_rwstat_exit(&stats->serviced); blkg_rwstat_exit(&stats->merged); blkg_rwstat_exit(&stats->service_time); blkg_rwstat_exit(&stats->wait_time); blkg_rwstat_exit(&stats->queued); - blkg_stat_exit(&stats->sectors); blkg_stat_exit(&stats->time); - blkg_stat_exit(&stats->unaccounted_time); blkg_stat_exit(&stats->avg_queue_size_sum); blkg_stat_exit(&stats->avg_queue_size_samples); blkg_stat_exit(&stats->dequeue); @@ -354,15 +333,11 @@ static void bfqg_stats_exit(struct bfqg_stats *stats) static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) { - if (blkg_rwstat_init(&stats->service_bytes, gfp) || - blkg_rwstat_init(&stats->serviced, gfp) || - blkg_rwstat_init(&stats->merged, gfp) || + if (blkg_rwstat_init(&stats->merged, gfp) || blkg_rwstat_init(&stats->service_time, gfp) || blkg_rwstat_init(&stats->wait_time, gfp) || blkg_rwstat_init(&stats->queued, gfp) || - blkg_stat_init(&stats->sectors, gfp) || blkg_stat_init(&stats->time, gfp) || - blkg_stat_init(&stats->unaccounted_time, gfp) || blkg_stat_init(&stats->avg_queue_size_sum, gfp) || blkg_stat_init(&stats->avg_queue_size_samples, gfp) || blkg_stat_init(&stats->dequeue, gfp) || @@ -386,11 +361,27 @@ static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg) return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq)); } +static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp) +{ + struct bfq_group_data *bgd; + + bgd = kzalloc(sizeof(*bgd), gfp); + if (!bgd) + return NULL; + return &bgd->pd; +} + static void 
bfq_cpd_init(struct blkcg_policy_data *cpd) { struct bfq_group_data *d = cpd_to_bfqgd(cpd); - d->weight = BFQ_DEFAULT_GRP_WEIGHT; + d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ? + CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL; +} + +static void bfq_cpd_free(struct blkcg_policy_data *cpd) +{ + kfree(cpd_to_bfqgd(cpd)); } static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) @@ -401,8 +392,7 @@ static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) if (!bfqg) return NULL; - if (bfqg_stats_init(&bfqg->stats, gfp) || - bfqg_stats_init(&bfqg->dead_stats, gfp)) { + if (bfqg_stats_init(&bfqg->stats, gfp)) { kfree(bfqg); return NULL; } @@ -410,27 +400,20 @@ static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) return &bfqg->pd; } -static void bfq_group_set_parent(struct bfq_group *bfqg, - struct bfq_group *parent) +static void bfq_pd_init(struct blkg_policy_data *pd) { + struct blkcg_gq *blkg; + struct bfq_group *bfqg; + struct bfq_data *bfqd; struct bfq_entity *entity; + struct bfq_group_data *d; - BUG_ON(!parent); - BUG_ON(!bfqg); - BUG_ON(bfqg == parent); - + blkg = pd_to_blkg(pd); + BUG_ON(!blkg); + bfqg = blkg_to_bfqg(blkg); + bfqd = blkg->q->elevator->elevator_data; entity = &bfqg->entity; - entity->parent = parent->my_entity; - entity->sched_data = &parent->sched_data; -} - -static void bfq_pd_init(struct blkg_policy_data *pd) -{ - struct blkcg_gq *blkg = pd_to_blkg(pd); - struct bfq_group *bfqg = blkg_to_bfqg(blkg); - struct bfq_data *bfqd = blkg->q->elevator->elevator_data; - struct bfq_entity *entity = &bfqg->entity; - struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg); + d = blkcg_to_bfqgd(blkg->blkcg); entity->orig_weight = entity->weight = entity->new_weight = d->weight; entity->my_sched_data = &bfqg->sched_data; @@ -448,70 +431,53 @@ static void bfq_pd_free(struct blkg_policy_data *pd) struct bfq_group *bfqg = pd_to_bfqg(pd); bfqg_stats_exit(&bfqg->stats); - bfqg_stats_exit(&bfqg->dead_stats); - return kfree(bfqg); } -/* offset delta from bfqg->stats to bfqg->dead_stats */ -static const int dead_stats_off_delta = offsetof(struct bfq_group, dead_stats) - - offsetof(struct bfq_group, stats); - -/* to be used by recursive prfill, sums live and dead stats recursively */ -static u64 bfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off) +static void bfq_pd_reset_stats(struct blkg_policy_data *pd) { - u64 sum = 0; + struct bfq_group *bfqg = pd_to_bfqg(pd); - sum += blkg_stat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off); - sum += blkg_stat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, - off + dead_stats_off_delta); - return sum; + bfqg_stats_reset(&bfqg->stats); } -/* to be used by recursive prfill, sums live and dead rwstats recursively */ -static struct blkg_rwstat -bfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd, int off) +static void bfq_group_set_parent(struct bfq_group *bfqg, + struct bfq_group *parent) { - struct blkg_rwstat a, b; + struct bfq_entity *entity; - a = blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off); - b = blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, - off + dead_stats_off_delta); - blkg_rwstat_add_aux(&a, &b); - return a; + BUG_ON(!parent); + BUG_ON(!bfqg); + BUG_ON(bfqg == parent); + + entity = &bfqg->entity; + entity->parent = parent->my_entity; + entity->sched_data = &parent->sched_data; } -static void bfq_pd_reset_stats(struct blkg_policy_data *pd) +static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd, + struct blkcg *blkcg) { - struct bfq_group *bfqg = 
pd_to_bfqg(pd); + struct blkcg_gq *blkg; - bfqg_stats_reset(&bfqg->stats); - bfqg_stats_reset(&bfqg->dead_stats); + blkg = blkg_lookup(blkcg, bfqd->queue); + if (likely(blkg)) + return blkg_to_bfqg(blkg); + return NULL; } -static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, - struct blkcg *blkcg) +static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, + struct blkcg *blkcg) { - struct request_queue *q = bfqd->queue; - struct bfq_group *bfqg = NULL, *parent; - struct bfq_entity *entity = NULL; + struct bfq_group *bfqg, *parent; + struct bfq_entity *entity; assert_spin_locked(bfqd->queue->queue_lock); - /* avoid lookup for the common case where there's no blkcg */ - if (blkcg == &blkcg_root) { - bfqg = bfqd->root_group; - } else { - struct blkcg_gq *blkg; - - blkg = blkg_lookup_create(blkcg, q); - if (!IS_ERR(blkg)) - bfqg = blkg_to_bfqg(blkg); - else /* fallback to root_group */ - bfqg = bfqd->root_group; - } + bfqg = bfq_lookup_bfqg(bfqd, blkcg); - BUG_ON(!bfqg); + if (unlikely(!bfqg)) + return NULL; /* * Update chain of bfq_groups as we might be handling a leaf group @@ -537,11 +503,15 @@ static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq); +static void bfq_bfqq_expire(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + bool compensate, + enum bfqq_expiration reason); + /** * bfq_bfqq_move - migrate @bfqq to @bfqg. * @bfqd: queue descriptor. * @bfqq: the queue to move. - * @entity: @bfqq's entity. * @bfqg: the group to move to. * * Move @bfqq to @bfqg, deactivating it from its old group and reactivating @@ -552,26 +522,40 @@ static void bfq_pos_tree_add_move(struct bfq_data *bfqd, * rcu_read_lock()). */ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, - struct bfq_entity *entity, struct bfq_group *bfqg) + struct bfq_group *bfqg) { - int busy, resume; - - busy = bfq_bfqq_busy(bfqq); - resume = !RB_EMPTY_ROOT(&bfqq->sort_list); + struct bfq_entity *entity = &bfqq->entity; - BUG_ON(resume && !entity->on_st); - BUG_ON(busy && !resume && entity->on_st && + BUG_ON(!bfq_bfqq_busy(bfqq) && !RB_EMPTY_ROOT(&bfqq->sort_list)); + BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list) && !entity->on_st); + BUG_ON(bfq_bfqq_busy(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) + && entity->on_st && bfqq != bfqd->in_service_queue); + BUG_ON(!bfq_bfqq_busy(bfqq) && bfqq == bfqd->in_service_queue); + + /* If bfqq is empty, then bfq_bfqq_expire also invokes + * bfq_del_bfqq_busy, thereby removing bfqq and its entity + * from data structures related to current group. Otherwise we + * need to remove bfqq explicitly with bfq_deactivate_bfqq, as + * we do below. 
+ */ + if (bfqq == bfqd->in_service_queue) + bfq_bfqq_expire(bfqd, bfqd->in_service_queue, + false, BFQ_BFQQ_PREEMPTED); + + BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq) + && &bfq_entity_service_tree(entity)->idle != + entity->tree); - if (busy) { - BUG_ON(atomic_read(&bfqq->ref) < 2); + BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq)); - if (!resume) - bfq_del_bfqq_busy(bfqd, bfqq, 0); - else - bfq_deactivate_bfqq(bfqd, bfqq, 0); - } else if (entity->on_st) + if (bfq_bfqq_busy(bfqq)) + bfq_deactivate_bfqq(bfqd, bfqq, false, false); + else if (entity->on_st) { + BUG_ON(&bfq_entity_service_tree(entity)->idle != + entity->tree); bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); + } bfqg_put(bfqq_group(bfqq)); /* @@ -583,14 +567,17 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, entity->sched_data = &bfqg->sched_data; bfqg_get(bfqg); - if (busy) { + BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq)); + if (bfq_bfqq_busy(bfqq)) { bfq_pos_tree_add_move(bfqd, bfqq); - if (resume) - bfq_activate_bfqq(bfqd, bfqq); + bfq_activate_bfqq(bfqd, bfqq); } if (!bfqd->in_service_queue && !bfqd->rq_in_driver) bfq_schedule_dispatch(bfqd); + BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq) + && &bfq_entity_service_tree(entity)->idle != + entity->tree); } /** @@ -617,7 +604,11 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, lockdep_assert_held(bfqd->queue->queue_lock); - bfqg = bfq_find_alloc_group(bfqd, blkcg); + bfqg = bfq_find_set_group(bfqd, blkcg); + + if (unlikely(!bfqg)) + bfqg = bfqd->root_group; + if (async_bfqq) { entity = &async_bfqq->entity; @@ -625,7 +616,8 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, bic_set_bfqq(bic, NULL, 0); bfq_log_bfqq(bfqd, async_bfqq, "bic_change_group: %p %d", - async_bfqq, atomic_read(&async_bfqq->ref)); + async_bfqq, + async_bfqq->ref); bfq_put_queue(async_bfqq); } } @@ -633,7 +625,7 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, if (sync_bfqq) { entity = &sync_bfqq->entity; if (entity->sched_data != &bfqg->sched_data) - bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg); + bfq_bfqq_move(bfqd, sync_bfqq, bfqg); } return bfqg; @@ -642,25 +634,23 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) { struct bfq_data *bfqd = bic_to_bfqd(bic); - struct blkcg *blkcg; struct bfq_group *bfqg = NULL; - uint64_t id; + uint64_t serial_nr; rcu_read_lock(); - blkcg = bio_blkcg(bio); - id = blkcg->css.serial_nr; - rcu_read_unlock(); + serial_nr = bio_blkcg(bio)->css.serial_nr; /* * Check whether blkcg has changed. The condition may trigger * spuriously on a newly created cic but there's no harm. 
*/ - if (unlikely(!bfqd) || likely(bic->blkcg_id == id)) - return; + if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr)) + goto out; - bfqg = __bfq_bic_change_cgroup(bfqd, bic, blkcg); - BUG_ON(!bfqg); - bic->blkcg_id = id; + bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio)); + bic->blkcg_serial_nr = serial_nr; +out: + rcu_read_unlock(); } /** @@ -672,7 +662,7 @@ static void bfq_flush_idle_tree(struct bfq_service_tree *st) struct bfq_entity *entity = st->first_idle; for (; entity ; entity = st->first_idle) - __bfq_deactivate_entity(entity, 0); + __bfq_deactivate_entity(entity, false); } /** @@ -686,7 +676,7 @@ static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); BUG_ON(!bfqq); - bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group); + bfq_bfqq_move(bfqd, bfqq, bfqd->root_group); } /** @@ -717,11 +707,12 @@ static void bfq_reparent_active_entities(struct bfq_data *bfqd, } /** - * bfq_destroy_group - destroy @bfqg. - * @bfqg: the group being destroyed. + * bfq_pd_offline - deactivate the entity associated with @pd, + * and reparent its children entities. + * @pd: descriptor of the policy going offline. * - * Destroy @bfqg, making sure that it is not referenced from its parent. - * blkio already grabs the queue_lock for us, so no need to use RCU-based magic + * blkio already grabs the queue_lock for us, so no need to use + * RCU-based magic */ static void bfq_pd_offline(struct blkg_policy_data *pd) { @@ -776,10 +767,16 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) BUG_ON(bfqg->sched_data.next_in_service); BUG_ON(bfqg->sched_data.in_service_entity); - __bfq_deactivate_entity(entity, 0); + __bfq_deactivate_entity(entity, false); bfq_put_async_queues(bfqd, bfqg); BUG_ON(entity->tree); + /* + * @blkg is going offline and will be ignored by + * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so + * that they don't get lost. If IOs complete after this point, the + * stats for them will be lost. Oh well... 
+ */ bfqg_stats_xfer_dead(bfqg); } @@ -789,46 +786,35 @@ static void bfq_end_wr_async(struct bfq_data *bfqd) list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) { struct bfq_group *bfqg = blkg_to_bfqg(blkg); + BUG_ON(!bfqg); bfq_end_wr_async_queues(bfqd, bfqg); } bfq_end_wr_async_queues(bfqd, bfqd->root_group); } -static u64 bfqio_cgroup_weight_read(struct cgroup_subsys_state *css, - struct cftype *cftype) -{ - struct blkcg *blkcg = css_to_blkcg(css); - struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); - int ret = -EINVAL; - - spin_lock_irq(&blkcg->lock); - ret = bfqgd->weight; - spin_unlock_irq(&blkcg->lock); - - return ret; -} - -static int bfqio_cgroup_weight_read_dfl(struct seq_file *sf, void *v) +static int bfq_io_show_weight(struct seq_file *sf, void *v) { struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); + unsigned int val = 0; - spin_lock_irq(&blkcg->lock); - seq_printf(sf, "%u\n", bfqgd->weight); - spin_unlock_irq(&blkcg->lock); + if (bfqgd) + val = bfqgd->weight; + + seq_printf(sf, "%u\n", val); return 0; } -static int bfqio_cgroup_weight_write(struct cgroup_subsys_state *css, - struct cftype *cftype, - u64 val) +static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css, + struct cftype *cftype, + u64 val) { struct blkcg *blkcg = css_to_blkcg(css); struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); struct blkcg_gq *blkg; - int ret = -EINVAL; + int ret = -ERANGE; if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT) return ret; @@ -873,13 +859,18 @@ static int bfqio_cgroup_weight_write(struct cgroup_subsys_state *css, return ret; } -static ssize_t bfqio_cgroup_weight_write_dfl(struct kernfs_open_file *of, - char *buf, size_t nbytes, - loff_t off) +static ssize_t bfq_io_set_weight(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) { + u64 weight; /* First unsigned long found in the file is used */ - return bfqio_cgroup_weight_write(of_css(of), NULL, - simple_strtoull(strim(buf), NULL, 0)); + int ret = kstrtoull(strim(buf), 0, &weight); + + if (ret) + return ret; + + return bfq_io_set_weight_legacy(of_css(of), NULL, weight); } static int bfqg_print_stat(struct seq_file *sf, void *v) @@ -899,16 +890,17 @@ static int bfqg_print_rwstat(struct seq_file *sf, void *v) static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - u64 sum = bfqg_stat_pd_recursive_sum(pd, off); - + u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd), + &blkcg_policy_bfq, off); return __blkg_prfill_u64(sf, pd, sum); } static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - struct blkg_rwstat sum = bfqg_rwstat_pd_recursive_sum(pd, off); - + struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd), + &blkcg_policy_bfq, + off); return __blkg_prfill_rwstat(sf, pd, &sum); } @@ -928,6 +920,41 @@ static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v) return 0; } +static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes); + + return __blkg_prfill_u64(sf, pd, sum >> 9); +} + +static int bfqg_print_stat_sectors(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false); + return 0; +} + +static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL, + 
offsetof(struct blkcg_gq, stat_bytes)); + u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + + atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); + + return __blkg_prfill_u64(sf, pd, sum >> 9); +} + +static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0, + false); + return 0; +} + + static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf, struct blkg_policy_data *pd, int off) { @@ -964,38 +991,15 @@ bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) return blkg_to_bfqg(bfqd->queue->root_blkg); } -static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp) -{ - struct bfq_group_data *bgd; - - bgd = kzalloc(sizeof(*bgd), GFP_KERNEL); - if (!bgd) - return NULL; - return &bgd->pd; -} - -static void bfq_cpd_free(struct blkcg_policy_data *cpd) -{ - kfree(cpd_to_bfqgd(cpd)); -} - -static struct cftype bfqio_files_dfl[] = { +static struct cftype bfq_blkcg_legacy_files[] = { { - .name = "weight", + .name = "bfq.weight", .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = bfqio_cgroup_weight_read_dfl, - .write = bfqio_cgroup_weight_write_dfl, + .seq_show = bfq_io_show_weight, + .write_u64 = bfq_io_set_weight_legacy, }, - {} /* terminate */ -}; -static struct cftype bfqio_files[] = { - { - .name = "bfq.weight", - .read_u64 = bfqio_cgroup_weight_read, - .write_u64 = bfqio_cgroup_weight_write, - }, - /* statistics, cover only the tasks in the bfqg */ + /* statistics, covers only the tasks in the bfqg */ { .name = "bfq.time", .private = offsetof(struct bfq_group, stats.time), @@ -1003,18 +1007,17 @@ static struct cftype bfqio_files[] = { }, { .name = "bfq.sectors", - .private = offsetof(struct bfq_group, stats.sectors), - .seq_show = bfqg_print_stat, + .seq_show = bfqg_print_stat_sectors, }, { .name = "bfq.io_service_bytes", - .private = offsetof(struct bfq_group, stats.service_bytes), - .seq_show = bfqg_print_rwstat, + .private = (unsigned long)&blkcg_policy_bfq, + .seq_show = blkg_print_stat_bytes, }, { .name = "bfq.io_serviced", - .private = offsetof(struct bfq_group, stats.serviced), - .seq_show = bfqg_print_rwstat, + .private = (unsigned long)&blkcg_policy_bfq, + .seq_show = blkg_print_stat_ios, }, { .name = "bfq.io_service_time", @@ -1045,18 +1048,17 @@ static struct cftype bfqio_files[] = { }, { .name = "bfq.sectors_recursive", - .private = offsetof(struct bfq_group, stats.sectors), - .seq_show = bfqg_print_stat_recursive, + .seq_show = bfqg_print_stat_sectors_recursive, }, { .name = "bfq.io_service_bytes_recursive", - .private = offsetof(struct bfq_group, stats.service_bytes), - .seq_show = bfqg_print_rwstat_recursive, + .private = (unsigned long)&blkcg_policy_bfq, + .seq_show = blkg_print_stat_bytes_recursive, }, { .name = "bfq.io_serviced_recursive", - .private = offsetof(struct bfq_group, stats.serviced), - .seq_show = bfqg_print_rwstat_recursive, + .private = (unsigned long)&blkcg_policy_bfq, + .seq_show = blkg_print_stat_ios_recursive, }, { .name = "bfq.io_service_time_recursive", @@ -1102,31 +1104,42 @@ static struct cftype bfqio_files[] = { .private = offsetof(struct bfq_group, stats.dequeue), .seq_show = bfqg_print_stat, }, - { - .name = "bfq.unaccounted_time", - .private = offsetof(struct bfq_group, stats.unaccounted_time), - .seq_show = bfqg_print_stat, - }, { } /* terminate */ }; -static struct blkcg_policy blkcg_policy_bfq = { - .dfl_cftypes = bfqio_files_dfl, - .legacy_cftypes = bfqio_files, - - .pd_alloc_fn = bfq_pd_alloc, - .pd_init_fn = bfq_pd_init, 
- .pd_offline_fn = bfq_pd_offline, - .pd_free_fn = bfq_pd_free, - .pd_reset_stats_fn = bfq_pd_reset_stats, - - .cpd_alloc_fn = bfq_cpd_alloc, - .cpd_init_fn = bfq_cpd_init, - .cpd_bind_fn = bfq_cpd_init, - .cpd_free_fn = bfq_cpd_free, +static struct cftype bfq_blkg_files[] = { + { + .name = "bfq.weight", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = bfq_io_show_weight, + .write = bfq_io_set_weight, + }, + {} /* terminate */ }; -#else +#else /* CONFIG_BFQ_GROUP_IOSCHED */ + +static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg, + struct bfq_queue *bfqq, unsigned int op) { } +static inline void +bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { } +static inline void +bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { } +static inline void bfqg_stats_update_completion(struct bfq_group *bfqg, + uint64_t start_time, uint64_t io_start_time, + unsigned int op) { } +static inline void +bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, + struct bfq_group *curr_bfqg) { } +static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { } +static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } +static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { } +static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { } +static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } +static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { } + +static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct bfq_group *bfqg) {} static void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg) @@ -1142,35 +1155,22 @@ static void bfq_init_entity(struct bfq_entity *entity, entity->sched_data = &bfqg->sched_data; } -static struct bfq_group * -bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) -{ - struct bfq_data *bfqd = bic_to_bfqd(bic); - - return bfqd->root_group; -} - -static void bfq_bfqq_move(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - struct bfq_entity *entity, - struct bfq_group *bfqg) -{ -} +static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) {} static void bfq_end_wr_async(struct bfq_data *bfqd) { bfq_end_wr_async_queues(bfqd, bfqd->root_group); } -static void bfq_disconnect_groups(struct bfq_data *bfqd) +static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, + struct blkcg *blkcg) { - bfq_put_async_queues(bfqd, bfqd->root_group); + return bfqd->root_group; } -static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, - struct blkcg *blkcg) +static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) { - return bfqd->root_group; + return bfqq->bfqd->root_group; } static struct bfq_group * diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index cf3e9b1..e5dfa5a 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1,5 +1,5 @@ /* - * Budget Fair Queueing (BFQ) disk scheduler. + * Budget Fair Queueing (BFQ) I/O scheduler. * * Based on ideas and code from CFQ: * Copyright (C) 2003 Jens Axboe @@ -7,25 +7,34 @@ * Copyright (C) 2008 Fabio Checconi * Paolo Valente * - * Copyright (C) 2010 Paolo Valente + * Copyright (C) 2015 Paolo Valente + * + * Copyright (C) 2017 Paolo Valente * * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ * file. * - * BFQ is a proportional-share storage-I/O scheduling algorithm based on - * the slice-by-slice service scheme of CFQ. 
But BFQ assigns budgets, - * measured in number of sectors, to processes instead of time slices. The - * device is not granted to the in-service process for a given time slice, - * but until it has exhausted its assigned budget. This change from the time - * to the service domain allows BFQ to distribute the device throughput - * among processes as desired, without any distortion due to ZBR, workload - * fluctuations or other factors. BFQ uses an ad hoc internal scheduler, - * called B-WF2Q+, to schedule processes according to their budgets. More - * precisely, BFQ schedules queues associated to processes. Thanks to the - * accurate policy of B-WF2Q+, BFQ can afford to assign high budgets to - * I/O-bound processes issuing sequential requests (to boost the - * throughput), and yet guarantee a low latency to interactive and soft - * real-time applications. + * BFQ is a proportional-share I/O scheduler, with some extra + * low-latency capabilities. BFQ also supports full hierarchical + * scheduling through cgroups. Next paragraphs provide an introduction + * on BFQ inner workings. Details on BFQ benefits and usage can be + * found in Documentation/block/bfq-iosched.txt. + * + * BFQ is a proportional-share storage-I/O scheduling algorithm based + * on the slice-by-slice service scheme of CFQ. But BFQ assigns + * budgets, measured in number of sectors, to processes instead of + * time slices. The device is not granted to the in-service process + * for a given time slice, but until it has exhausted its assigned + * budget. This change from the time to the service domain enables BFQ + * to distribute the device throughput among processes as desired, + * without any distortion due to throughput fluctuations, or to device + * internal queueing. BFQ uses an ad hoc internal scheduler, called + * B-WF2Q+, to schedule processes according to their budgets. More + * precisely, BFQ schedules queues associated with processes. Thanks to + * the accurate policy of B-WF2Q+, BFQ can afford to assign high + * budgets to I/O-bound processes issuing sequential requests (to + * boost the throughput), and yet guarantee a low latency to + * interactive and soft real-time applications. * * BFQ is described in [1], where also a reference to the initial, more * theoretical paper on BFQ can be found. The interested reader can find @@ -40,10 +49,10 @@ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N) * complexity derives from the one introduced with EEVDF in [3]. * - * [1] P. Valente and M. Andreolini, ``Improving Application Responsiveness - * with the BFQ Disk I/O Scheduler'', - * Proceedings of the 5th Annual International Systems and Storage - * Conference (SYSTOR '12), June 2012. + * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O + * Scheduler", Proceedings of the First Workshop on Mobile System + * Technologies (MST-2015), May 2015. + * http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf * * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf * @@ -70,24 +79,23 @@ #include "bfq.h" #include "blk.h" -/* Expiration time of sync (0) and async (1) requests, in jiffies. */ -static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; +/* Expiration time of sync (0) and async (1) requests, in ns. */ +static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; /* Maximum backwards seek, in KiB. 
*/ -static const int bfq_back_max = 16 * 1024; +static const int bfq_back_max = (16 * 1024); /* Penalty of a backwards seek, in number of sectors. */ static const int bfq_back_penalty = 2; -/* Idling period duration, in jiffies. */ -static int bfq_slice_idle = HZ / 125; +/* Idling period duration, in ns. */ +static u32 bfq_slice_idle = (NSEC_PER_SEC / 125); /* Minimum number of assigned budgets for which stats are safe to compute. */ static const int bfq_stats_min_budgets = 194; /* Default maximum budget values, in sectors and number of requests. */ -static const int bfq_default_max_budget = 16 * 1024; -static const int bfq_max_budget_async_rq = 4; +static const int bfq_default_max_budget = (16 * 1024); /* * Async to sync throughput distribution is controlled as follows: @@ -97,23 +105,28 @@ static const int bfq_max_budget_async_rq = 4; static const int bfq_async_charge_factor = 10; /* Default timeout values, in jiffies, approximating CFQ defaults. */ -static const int bfq_timeout_sync = HZ / 8; -static int bfq_timeout_async = HZ / 25; +static const int bfq_timeout = (HZ / 8); -struct kmem_cache *bfq_pool; +static struct kmem_cache *bfq_pool; -/* Below this threshold (in ms), we consider thinktime immediate. */ -#define BFQ_MIN_TT 2 +/* Below this threshold (in ns), we consider thinktime immediate. */ +#define BFQ_MIN_TT (2 * NSEC_PER_MSEC) /* hw_tag detection: parallel requests threshold and min samples needed. */ #define BFQ_HW_QUEUE_THRESHOLD 4 #define BFQ_HW_QUEUE_SAMPLES 32 -#define BFQQ_SEEK_THR (sector_t)(8 * 1024) -#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR) +#define BFQQ_SEEK_THR (sector_t)(8 * 100) +#define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32) +#define BFQQ_CLOSE_THR (sector_t)(8 * 1024) +#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8) -/* Min samples used for peak rate estimation (for autotuning). */ -#define BFQ_PEAK_RATE_SAMPLES 32 +/* Min number of samples required to perform peak-rate update */ +#define BFQ_RATE_MIN_SAMPLES 32 +/* Min observation time interval required to perform a peak-rate update (ns) */ +#define BFQ_RATE_MIN_INTERVAL (300*NSEC_PER_MSEC) +/* Target observation time interval for a peak-rate update (ns) */ +#define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC /* Shift used for peak rate fixed precision calculations. */ #define BFQ_RATE_SHIFT 16 @@ -141,16 +154,24 @@ struct kmem_cache *bfq_pool; * The device's speed class is dynamically (re)detected in * bfq_update_peak_rate() every time the estimated peak rate is updated. * - * In the following definitions, R_slow[0]/R_fast[0] and T_slow[0]/T_fast[0] - * are the reference values for a slow/fast rotational device, whereas - * R_slow[1]/R_fast[1] and T_slow[1]/T_fast[1] are the reference values for - * a slow/fast non-rotational device. Finally, device_speed_thresh are the - * thresholds used to switch between speed classes. + * In the following definitions, R_slow[0]/R_fast[0] and + * T_slow[0]/T_fast[0] are the reference values for a slow/fast + * rotational device, whereas R_slow[1]/R_fast[1] and + * T_slow[1]/T_fast[1] are the reference values for a slow/fast + * non-rotational device. Finally, device_speed_thresh are the + * thresholds used to switch between speed classes. The reference + * rates are not the actual peak rates of the devices used as a + * reference, but slightly lower values. 
The reason for using these + * slightly lower values is that the peak-rate estimator tends to + * yield slightly lower values than the actual peak rate (it can yield + * the actual peak rate only if there is only one process doing I/O, + * and the process does sequential I/O). + * * Both the reference peak rates and the thresholds are measured in * sectors/usec, left-shifted by BFQ_RATE_SHIFT. */ -static int R_slow[2] = {1536, 10752}; -static int R_fast[2] = {17415, 34791}; +static int R_slow[2] = {1000, 10700}; +static int R_fast[2] = {14000, 33000}; /* * To improve readability, a conversion function is used to initialize the * following arrays, which entails that they can be initialized only in a @@ -178,18 +199,6 @@ static void bfq_schedule_dispatch(struct bfq_data *bfqd); #define bfq_sample_valid(samples) ((samples) > 80) /* - * We regard a request as SYNC, if either it's a read or has the SYNC bit - * set (in which case it could also be a direct WRITE). - */ -static int bfq_bio_sync(struct bio *bio) -{ - if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC)) - return 1; - - return 0; -} - -/* * Scheduler run of queue, if there are requests pending and no one in the * driver that will restart queueing. */ @@ -409,11 +418,7 @@ static bool bfq_differentiated_weights(struct bfq_data *bfqd) */ static bool bfq_symmetric_scenario(struct bfq_data *bfqd) { - return -#ifdef CONFIG_BFQ_GROUP_IOSCHED - !bfqd->active_numerous_groups && -#endif - !bfq_differentiated_weights(bfqd); + return !bfq_differentiated_weights(bfqd); } /* @@ -505,13 +510,45 @@ static void bfq_weights_tree_remove(struct bfq_data *bfqd, entity->weight_counter = NULL; } +/* + * Return expired entry, or NULL to just start from scratch in rbtree. + */ +static struct request *bfq_check_fifo(struct bfq_queue *bfqq, + struct request *last) +{ + struct request *rq; + + if (bfq_bfqq_fifo_expire(bfqq)) + return NULL; + + bfq_mark_bfqq_fifo_expire(bfqq); + + rq = rq_entry_fifo(bfqq->fifo.next); + + if (rq == last || ktime_get_ns() < rq->fifo_time) + return NULL; + + bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq); + BUG_ON(RB_EMPTY_NODE(&rq->rb_node)); + return rq; +} + static struct request *bfq_find_next_rq(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct request *last) { struct rb_node *rbnext = rb_next(&last->rb_node); struct rb_node *rbprev = rb_prev(&last->rb_node); - struct request *next = NULL, *prev = NULL; + struct request *next, *prev = NULL; + + BUG_ON(list_empty(&bfqq->fifo)); + + /* Follow expired path, else get first next available. */ + next = bfq_check_fifo(bfqq, last); + if (next) { + BUG_ON(next == last); + return next; + } BUG_ON(RB_EMPTY_NODE(&last->rb_node)); @@ -533,9 +570,19 @@ static struct request *bfq_find_next_rq(struct bfq_data *bfqd, static unsigned long bfq_serv_to_charge(struct request *rq, struct bfq_queue *bfqq) { - return blk_rq_sectors(rq) * - (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->wr_coeff == 1) * - bfq_async_charge_factor)); + if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1) + return blk_rq_sectors(rq); + + /* + * If there are no weight-raised queues, then amplify service + * by just the async charge factor; otherwise amplify service + * by twice the async charge factor, to further reduce latency + * for weight-raised queues. 
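+	 * For example, with the async charge factor of 10 defined
+	 * above, a 100-sector async request is charged as 1000
+	 * sectors, or as 2000 sectors if some weight-raised queue is
+	 * currently busy.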
+	 */
+	if (bfqq->bfqd->wr_busy_queues == 0)
+		return blk_rq_sectors(rq) * bfq_async_charge_factor;
+
+	return blk_rq_sectors(rq) * 2 * bfq_async_charge_factor;
 }
 
 /**
@@ -576,7 +623,7 @@ static void bfq_updated_next_req(struct bfq_data *bfqd,
 		entity->budget = new_budget;
 		bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
 					 new_budget);
-		bfq_activate_bfqq(bfqd, bfqq);
+		bfq_requeue_bfqq(bfqd, bfqq);
 	}
 }
 
@@ -590,12 +637,23 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd)
 	dur = bfqd->RT_prod;
 	do_div(dur, bfqd->peak_rate);
 
-	return dur;
-}
+	/*
+	 * Limit duration between 3 and 13 seconds. Tests show that
+	 * higher values than 13 seconds often yield the opposite of
+	 * the desired result, i.e., worsen responsiveness by letting
+	 * non-interactive and non-soft-real-time applications
+	 * preserve weight raising for too long a time interval.
+	 *
+	 * On the other hand, lower values than 3 seconds make it
+	 * difficult for most interactive tasks to complete their jobs
+	 * before weight-raising finishes.
+	 */
+	if (dur > msecs_to_jiffies(13000))
+		dur = msecs_to_jiffies(13000);
+	else if (dur < msecs_to_jiffies(3000))
+		dur = msecs_to_jiffies(3000);
 
-static unsigned int bfq_bfqq_cooperations(struct bfq_queue *bfqq)
-{
-	return bfqq->bic ? bfqq->bic->cooperations : 0;
+	return dur;
 }
 
 static void
@@ -605,31 +663,31 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
 		bfq_mark_bfqq_idle_window(bfqq);
 	else
 		bfq_clear_bfqq_idle_window(bfqq);
+
 	if (bic->saved_IO_bound)
 		bfq_mark_bfqq_IO_bound(bfqq);
 	else
 		bfq_clear_bfqq_IO_bound(bfqq);
 
-	/* Assuming that the flag in_large_burst is already correctly set */
-	if (bic->wr_time_left && bfqq->bfqd->low_latency &&
-	    !bfq_bfqq_in_large_burst(bfqq) &&
-	    bic->cooperations < bfqq->bfqd->bfq_coop_thresh) {
-		/*
-		 * Start a weight raising period with the duration given by
-		 * the raising_time_left snapshot.
-		 */
-		if (bfq_bfqq_busy(bfqq))
-			bfqq->bfqd->wr_busy_queues++;
-		bfqq->wr_coeff = bfqq->bfqd->bfq_wr_coeff;
-		bfqq->wr_cur_max_time = bic->wr_time_left;
-		bfqq->last_wr_start_finish = jiffies;
-		bfqq->entity.prio_changed = 1;
+
+	bfqq->wr_coeff = bic->saved_wr_coeff;
+	bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt;
+	BUG_ON(time_is_after_jiffies(bfqq->wr_start_at_switch_to_srt));
+	bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish;
+	bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time;
+	BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish));
+
+	if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) ||
+	    time_is_before_jiffies(bfqq->last_wr_start_finish +
+				   bfqq->wr_cur_max_time))) {
+		bfq_log_bfqq(bfqq->bfqd, bfqq,
+			"resume state: switching off wr (%lu + %lu < %lu)",
+			bfqq->last_wr_start_finish, bfqq->wr_cur_max_time,
+			jiffies);
+
+		bfqq->wr_coeff = 1;
 	}
-	/*
-	 * Clear wr_time_left to prevent bfq_bfqq_save_state() from
-	 * getting confused about the queue's need of a weight-raising
-	 * period.
- */ - bic->wr_time_left = 0; + /* make sure weight will be updated, however we got here */ + bfqq->entity.prio_changed = 1; } static int bfqq_process_refs(struct bfq_queue *bfqq) @@ -639,7 +697,7 @@ static int bfqq_process_refs(struct bfq_queue *bfqq) lockdep_assert_held(bfqq->bfqd->queue->queue_lock); io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; - process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st; + process_refs = bfqq->ref - io_refs - bfqq->entity.on_st; BUG_ON(process_refs < 0); return process_refs; } @@ -654,6 +712,7 @@ static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq) hlist_del_init(&item->burst_list_node); hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); bfqd->burst_size = 1; + bfqd->burst_parent_entity = bfqq->entity.parent; } /* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */ @@ -662,6 +721,10 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) /* Increment burst size to take into account also bfqq */ bfqd->burst_size++; + bfq_log_bfqq(bfqd, bfqq, "add_to_burst %d", bfqd->burst_size); + + BUG_ON(bfqd->burst_size > bfqd->bfq_large_burst_thresh); + if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) { struct bfq_queue *pos, *bfqq_item; struct hlist_node *n; @@ -671,15 +734,19 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) * other to consider this burst as large. */ bfqd->large_burst = true; + bfq_log_bfqq(bfqd, bfqq, "add_to_burst: large burst started"); /* * We can now mark all queues in the burst list as * belonging to a large burst. */ hlist_for_each_entry(bfqq_item, &bfqd->burst_list, - burst_list_node) + burst_list_node) { bfq_mark_bfqq_in_large_burst(bfqq_item); + bfq_log_bfqq(bfqd, bfqq_item, "marked in large burst"); + } bfq_mark_bfqq_in_large_burst(bfqq); + bfq_log_bfqq(bfqd, bfqq, "marked in large burst"); /* * From now on, and until the current burst finishes, any @@ -691,67 +758,79 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) hlist_for_each_entry_safe(pos, n, &bfqd->burst_list, burst_list_node) hlist_del_init(&pos->burst_list_node); - } else /* burst not yet large: add bfqq to the burst list */ + } else /* + * Burst not yet large: add bfqq to the burst list. Do + * not increment the ref counter for bfqq, because bfqq + * is removed from the burst list before freeing bfqq + * in put_queue. + */ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); } /* - * If many queues happen to become active shortly after each other, then, - * to help the processes associated to these queues get their job done as - * soon as possible, it is usually better to not grant either weight-raising - * or device idling to these queues. In this comment we describe, firstly, - * the reasons why this fact holds, and, secondly, the next function, which - * implements the main steps needed to properly mark these queues so that - * they can then be treated in a different way. + * If many queues belonging to the same group happen to be created + * shortly after each other, then the processes associated with these + * queues have typically a common goal. In particular, bursts of queue + * creations are usually caused by services or applications that spawn + * many parallel threads/processes. Examples are systemd during boot, + * or git grep. To help these processes get their job done as soon as + * possible, it is usually better to not grant either weight-raising + * or device idling to their queues. 
* - * As for the terminology, we say that a queue becomes active, i.e., - * switches from idle to backlogged, either when it is created (as a - * consequence of the arrival of an I/O request), or, if already existing, - * when a new request for the queue arrives while the queue is idle. - * Bursts of activations, i.e., activations of different queues occurring - * shortly after each other, are typically caused by services or applications - * that spawn or reactivate many parallel threads/processes. Examples are - * systemd during boot or git grep. + * In this comment we describe, firstly, the reasons why this fact + * holds, and, secondly, the next function, which implements the main + * steps needed to properly mark these queues so that they can then be + * treated in a different way. * - * These services or applications benefit mostly from a high throughput: - * the quicker the requests of the activated queues are cumulatively served, - * the sooner the target job of these queues gets completed. As a consequence, - * weight-raising any of these queues, which also implies idling the device - * for it, is almost always counterproductive: in most cases it just lowers - * throughput. + * The above services or applications benefit mostly from a high + * throughput: the quicker the requests of the activated queues are + * cumulatively served, the sooner the target job of these queues gets + * completed. As a consequence, weight-raising any of these queues, + * which also implies idling the device for it, is almost always + * counterproductive. In most cases it just lowers throughput. * - * On the other hand, a burst of activations may be also caused by the start - * of an application that does not consist in a lot of parallel I/O-bound - * threads. In fact, with a complex application, the burst may be just a - * consequence of the fact that several processes need to be executed to - * start-up the application. To start an application as quickly as possible, - * the best thing to do is to privilege the I/O related to the application - * with respect to all other I/O. Therefore, the best strategy to start as - * quickly as possible an application that causes a burst of activations is - * to weight-raise all the queues activated during the burst. This is the + * On the other hand, a burst of queue creations may be caused also by + * the start of an application that does not consist of a lot of + * parallel I/O-bound threads. In fact, with a complex application, + * several short processes may need to be executed to start-up the + * application. In this respect, to start an application as quickly as + * possible, the best thing to do is in any case to privilege the I/O + * related to the application with respect to all other + * I/O. Therefore, the best strategy to start as quickly as possible + * an application that causes a burst of queue creations is to + * weight-raise all the queues created during the burst. This is the * exact opposite of the best strategy for the other type of bursts. * - * In the end, to take the best action for each of the two cases, the two - * types of bursts need to be distinguished. Fortunately, this seems - * relatively easy to do, by looking at the sizes of the bursts. In - * particular, we found a threshold such that bursts with a larger size - * than that threshold are apparently caused only by services or commands - * such as systemd or git grep. For brevity, hereafter we call just 'large' - * these bursts. 
BFQ *does not* weight-raise queues whose activations occur - * in a large burst. In addition, for each of these queues BFQ performs or - * does not perform idling depending on which choice boosts the throughput - * most. The exact choice depends on the device and request pattern at + * In the end, to take the best action for each of the two cases, the + * two types of bursts need to be distinguished. Fortunately, this + * seems relatively easy, by looking at the sizes of the bursts. In + * particular, we found a threshold such that only bursts with a + * larger size than that threshold are apparently caused by + * services or commands such as systemd or git grep. For brevity, + * hereafter we call just 'large' these bursts. BFQ *does not* + * weight-raise queues whose creation occurs in a large burst. In + * addition, for each of these queues BFQ performs or does not perform + * idling depending on which choice boosts the throughput more. The + * exact choice depends on the device and request pattern at * hand. * - * Turning back to the next function, it implements all the steps needed - * to detect the occurrence of a large burst and to properly mark all the - * queues belonging to it (so that they can then be treated in a different - * way). This goal is achieved by maintaining a special "burst list" that - * holds, temporarily, the queues that belong to the burst in progress. The - * list is then used to mark these queues as belonging to a large burst if - * the burst does become large. The main steps are the following. + * Unfortunately, false positives may occur while an interactive task + * is starting (e.g., an application is being started). The + * consequence is that the queues associated with the task do not + * enjoy weight raising as expected. Fortunately these false positives + * are very rare. They typically occur if some service happens to + * start doing I/O exactly when the interactive task starts. * - * . when the very first queue is activated, the queue is inserted into the + * Turning back to the next function, it implements all the steps + * needed to detect the occurrence of a large burst and to properly + * mark all the queues belonging to it (so that they can then be + * treated in a different way). This goal is achieved by maintaining a + * "burst list" that holds, temporarily, the queues that belong to the + * burst in progress. The list is then used to mark these queues as + * belonging to a large burst if the burst does become large. The main + * steps are the following. + * + * . when the very first queue is created, the queue is inserted into the * list (as it could be the first queue in a possible burst) * * . if the current burst has not yet become large, and a queue Q that does @@ -772,13 +851,13 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) * * . the device enters a large-burst mode * - * . if a queue Q that does not belong to the burst is activated while + * . if a queue Q that does not belong to the burst is created while * the device is in large-burst mode and shortly after the last time * at which a queue either entered the burst list or was marked as * belonging to the current large burst, then Q is immediately marked * as belonging to a large burst. * - * . if a queue Q that does not belong to the burst is activated a while + * . 
if a queue Q that does not belong to the burst is created a while * later, i.e., not shortly after, than the last time at which a queue * either entered the burst list or was marked as belonging to the * current large burst, then the current burst is deemed as finished and: @@ -791,52 +870,44 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) * in a possible new burst (then the burst list contains just Q * after this step). */ -static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bool idle_for_long_time) +static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) { /* - * If bfqq happened to be activated in a burst, but has been idle - * for at least as long as an interactive queue, then we assume - * that, in the overall I/O initiated in the burst, the I/O - * associated to bfqq is finished. So bfqq does not need to be - * treated as a queue belonging to a burst anymore. Accordingly, - * we reset bfqq's in_large_burst flag if set, and remove bfqq - * from the burst list if it's there. We do not decrement instead - * burst_size, because the fact that bfqq does not need to belong - * to the burst list any more does not invalidate the fact that - * bfqq may have been activated during the current burst. - */ - if (idle_for_long_time) { - hlist_del_init(&bfqq->burst_list_node); - bfq_clear_bfqq_in_large_burst(bfqq); - } - - /* * If bfqq is already in the burst list or is part of a large - * burst, then there is nothing else to do. + * burst, or finally has just been split, then there is + * nothing else to do. */ if (!hlist_unhashed(&bfqq->burst_list_node) || - bfq_bfqq_in_large_burst(bfqq)) + bfq_bfqq_in_large_burst(bfqq) || + time_is_after_eq_jiffies(bfqq->split_time + + msecs_to_jiffies(10))) return; /* - * If bfqq's activation happens late enough, then the current - * burst is finished, and related data structures must be reset. + * If bfqq's creation happens late enough, or bfqq belongs to + * a different group than the burst group, then the current + * burst is finished, and related data structures must be + * reset. * - * In this respect, consider the special case where bfqq is the very - * first queue being activated. In this case, last_ins_in_burst is - * not yet significant when we get here. But it is easy to verify - * that, whether or not the following condition is true, bfqq will - * end up being inserted into the burst list. In particular the - * list will happen to contain only bfqq. And this is exactly what - * has to happen, as bfqq may be the first queue in a possible + * In this respect, consider the special case where bfqq is + * the very first queue created after BFQ is selected for this + * device. In this case, last_ins_in_burst and + * burst_parent_entity are not yet significant when we get + * here. But it is easy to verify that, whether or not the + * following condition is true, bfqq will end up being + * inserted into the burst list. In particular the list will + * happen to contain only bfqq. And this is exactly what has + * to happen, as bfqq may be the first queue of the first * burst. 
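+	 * (burst_parent_entity is the parent entity of the queue that
+	 * last (re)started the burst list; it is set in
+	 * bfq_reset_burst_list.)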
*/ if (time_is_before_jiffies(bfqd->last_ins_in_burst + - bfqd->bfq_burst_interval)) { + bfqd->bfq_burst_interval) || + bfqq->entity.parent != bfqd->burst_parent_entity) { bfqd->large_burst = false; bfq_reset_burst_list(bfqd, bfqq); - return; + bfq_log_bfqq(bfqd, bfqq, + "handle_burst: late activation or different group"); + goto end; } /* @@ -845,8 +916,9 @@ static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq, * bfqq as belonging to this large burst immediately. */ if (bfqd->large_burst) { + bfq_log_bfqq(bfqd, bfqq, "handle_burst: marked in burst"); bfq_mark_bfqq_in_large_burst(bfqq); - return; + goto end; } /* @@ -855,25 +927,490 @@ static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq, * queue. Then we add bfqq to the burst. */ bfq_add_to_burst(bfqd, bfqq); +end: + /* + * At this point, bfqq either has been added to the current + * burst or has caused the current burst to terminate and a + * possible new burst to start. In particular, in the second + * case, bfqq has become the first queue in the possible new + * burst. In both cases last_ins_in_burst needs to be moved + * forward. + */ + bfqd->last_ins_in_burst = jiffies; + +} + +static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + + return entity->budget - entity->service; +} + +/* + * If enough samples have been computed, return the current max budget + * stored in bfqd, which is dynamically updated according to the + * estimated disk peak rate; otherwise return the default max budget + */ +static int bfq_max_budget(struct bfq_data *bfqd) +{ + if (bfqd->budgets_assigned < bfq_stats_min_budgets) + return bfq_default_max_budget; + else + return bfqd->bfq_max_budget; +} + +/* + * Return min budget, which is a fraction of the current or default + * max budget (trying with 1/32) + */ +static int bfq_min_budget(struct bfq_data *bfqd) +{ + if (bfqd->budgets_assigned < bfq_stats_min_budgets) + return bfq_default_max_budget / 32; + else + return bfqd->bfq_max_budget / 32; +} + +static void bfq_bfqq_expire(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + bool compensate, + enum bfqq_expiration reason); + +/* + * The next function, invoked after the input queue bfqq switches from + * idle to busy, updates the budget of bfqq. The function also tells + * whether the in-service queue should be expired, by returning + * true. The purpose of expiring the in-service queue is to give bfqq + * the chance to possibly preempt the in-service queue, and the reason + * for preempting the in-service queue is to achieve one of the two + * goals below. + * + * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has + * expired because it has remained idle. In particular, bfqq may have + * expired for one of the following two reasons: + * + * - BFQ_BFQQ_NO_MORE_REQUEST bfqq did not enjoy any device idling and + * did not make it to issue a new request before its last request + * was served; + * + * - BFQ_BFQQ_TOO_IDLE bfqq did enjoy device idling, but did not issue + * a new request before the expiration of the idling-time. + * + * Even if bfqq has expired for one of the above reasons, the process + * associated with the queue may be however issuing requests greedily, + * and thus be sensitive to the bandwidth it receives (bfqq may have + * remained idle for other reasons: CPU high load, bfqq not enjoying + * idling, I/O throttling somewhere in the path from the process to + * the I/O scheduler, ...). 
But if, after every expiration for one of
+ * the above two reasons, bfqq has to wait for the service of at least
+ * one full budget of another queue before being served again, then
+ * bfqq is likely to get a much lower bandwidth or resource time than
+ * its reserved ones. To address this issue, two countermeasures need
+ * to be taken.
+ *
+ * First, the budget and the timestamps of bfqq need to be updated in
+ * a special way on bfqq reactivation: they need to be updated as if
+ * bfqq did not remain idle and did not expire. In fact, if they are
+ * computed as if bfqq expired and remained idle until reactivation,
+ * then the process associated with bfqq is treated as if, instead of
+ * being greedy, it stopped issuing requests when bfqq remained idle,
+ * and restarts issuing requests only on this reactivation. In other
+ * words, the scheduler does not help the process recover the "service
+ * hole" between bfqq expiration and reactivation. As a consequence,
+ * the process receives a lower bandwidth than its reserved one. In
+ * contrast, to recover this hole, the budget must be updated as if
+ * bfqq was not expired at all before this reactivation, i.e., it must
+ * be set to the value of the remaining budget when bfqq was
+ * expired. Along the same line, timestamps need to be assigned the
+ * value they had the last time bfqq was selected for service, i.e.,
+ * before last expiration. Thus timestamps need to be back-shifted
+ * with respect to their normal computation (see [1] for more details
+ * on this tricky aspect).
+ *
+ * Secondly, to allow the process to recover the hole, the in-service
+ * queue must be expired too, to give bfqq the chance to preempt it
+ * immediately. In fact, if bfqq has to wait for a full budget of the
+ * in-service queue to be completed, then it may become impossible to
+ * let the process recover the hole, even if the back-shifted
+ * timestamps of bfqq are lower than those of the in-service queue. If
+ * this happens for most or all of the holes, then the process may not
+ * receive its reserved bandwidth. In this respect, it is worth noting
+ * that, since the service of outstanding requests is not preemptible,
+ * a small fraction of the holes may however be unrecoverable, thereby
+ * causing a small loss of bandwidth.
+ *
+ * The last important point is detecting whether bfqq does need this
+ * bandwidth recovery. In this respect, the next function deems the
+ * process associated with bfqq greedy, and thus allows it to recover
+ * the hole, if: 1) the process is waiting for the arrival of a new
+ * request (which implies that bfqq expired for one of the above two
+ * reasons), and 2) such a request has arrived soon. The first
+ * condition is controlled through the flag non_blocking_wait_rq,
+ * while the second through the flag arrived_in_time. If both
+ * conditions hold, then the function computes the budget in the
+ * above-described special way, and signals that the in-service queue
+ * should be expired. Timestamp back-shifting is done later in
+ * __bfq_activate_entity.
+ *
+ * 2. Reduce latency. Even if timestamps are not backshifted to let
+ * the process associated with bfqq recover a service hole, bfqq may
+ * however happen to have, after being (re)activated, a lower finish
+ * timestamp than the in-service queue. That is, the next budget of
+ * bfqq may have to be completed before the one of the in-service
+ * queue.
If this is the case, then preempting the in-service queue + * allows this goal to be achieved, apart from the unpreemptible, + * outstanding requests mentioned above. + * + * Unfortunately, regardless of which of the above two goals one wants + * to achieve, service trees need first to be updated to know whether + * the in-service queue must be preempted. To have service trees + * correctly updated, the in-service queue must be expired and + * rescheduled, and bfqq must be scheduled too. This is one of the + * most costly operations (in future versions, the scheduling + * mechanism may be re-designed in such a way to make it possible to + * know whether preemption is needed without needing to update service + * trees). In addition, queue preemptions almost always cause random + * I/O, and thus loss of throughput. Because of these facts, the next + * function adopts the following simple scheme to avoid both costly + * operations and too frequent preemptions: it requests the expiration + * of the in-service queue (unconditionally) only for queues that need + * to recover a hole, or that either are weight-raised or deserve to + * be weight-raised. + */ +static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + bool arrived_in_time, + bool wr_or_deserves_wr) +{ + struct bfq_entity *entity = &bfqq->entity; + + if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) { + /* + * We do not clear the flag non_blocking_wait_rq here, as + * the latter is used in bfq_activate_bfqq to signal + * that timestamps need to be back-shifted (and is + * cleared right after). + */ + + /* + * In next assignment we rely on that either + * entity->service or entity->budget are not updated + * on expiration if bfqq is empty (see + * __bfq_bfqq_recalc_budget). Thus both quantities + * remain unchanged after such an expiration, and the + * following statement therefore assigns to + * entity->budget the remaining budget on such an + * expiration. For clarity, entity->service is not + * updated on expiration in any case, and, in normal + * operation, is reset only when bfqq is selected for + * service (see bfq_get_next_queue). + */ + BUG_ON(bfqq->max_budget < 0); + entity->budget = min_t(unsigned long, + bfq_bfqq_budget_left(bfqq), + bfqq->max_budget); + + BUG_ON(entity->budget < 0); + return true; + } + + BUG_ON(bfqq->max_budget < 0); + entity->budget = max_t(unsigned long, bfqq->max_budget, + bfq_serv_to_charge(bfqq->next_rq, bfqq)); + BUG_ON(entity->budget < 0); + + bfq_clear_bfqq_non_blocking_wait_rq(bfqq); + return wr_or_deserves_wr; +} + +static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + unsigned int old_wr_coeff, + bool wr_or_deserves_wr, + bool interactive, + bool in_burst, + bool soft_rt) +{ + if (old_wr_coeff == 1 && wr_or_deserves_wr) { + /* start a weight-raising period */ + if (interactive) { + bfqq->wr_coeff = bfqd->bfq_wr_coeff; + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + } else { + bfqq->wr_start_at_switch_to_srt = jiffies; + bfqq->wr_coeff = bfqd->bfq_wr_coeff * + BFQ_SOFTRT_WEIGHT_FACTOR; + bfqq->wr_cur_max_time = + bfqd->bfq_wr_rt_max_time; + } + /* + * If needed, further reduce budget to make sure it is + * close to bfqq's backlog, so as to reduce the + * scheduling-error component due to a too large + * budget. Do not care about throughput consequences, + * but only about latency. Finally, do not assign a + * too small budget either, to avoid increasing + * latency by causing too frequent expirations. 
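+		 * The cap applied below is twice the minimum budget,
+		 * i.e., one sixteenth of the current maximum budget.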
+ */ + bfqq->entity.budget = min_t(unsigned long, + bfqq->entity.budget, + 2 * bfq_min_budget(bfqd)); + + bfq_log_bfqq(bfqd, bfqq, + "wrais starting at %lu, rais_max_time %u", + jiffies, + jiffies_to_msecs(bfqq->wr_cur_max_time)); + } else if (old_wr_coeff > 1) { + if (interactive) { /* update wr coeff and duration */ + bfqq->wr_coeff = bfqd->bfq_wr_coeff; + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + } else if (in_burst) { + bfqq->wr_coeff = 1; + bfq_log_bfqq(bfqd, bfqq, + "wrais ending at %lu, rais_max_time %u", + jiffies, + jiffies_to_msecs(bfqq-> + wr_cur_max_time)); + } else if (soft_rt) { + /* + * The application is now or still meeting the + * requirements for being deemed soft rt. We + * can then correctly and safely (re)charge + * the weight-raising duration for the + * application with the weight-raising + * duration for soft rt applications. + * + * In particular, doing this recharge now, i.e., + * before the weight-raising period for the + * application finishes, reduces the probability + * of the following negative scenario: + * 1) the weight of a soft rt application is + * raised at startup (as for any newly + * created application), + * 2) since the application is not interactive, + * at a certain time weight-raising is + * stopped for the application, + * 3) at that time the application happens to + * still have pending requests, and hence + * is destined to not have a chance to be + * deemed soft rt before these requests are + * completed (see the comments to the + * function bfq_bfqq_softrt_next_start() + * for details on soft rt detection), + * 4) these pending requests experience a high + * latency because the application is not + * weight-raised while they are pending. + */ + if (bfqq->wr_cur_max_time != + bfqd->bfq_wr_rt_max_time) { + bfqq->wr_start_at_switch_to_srt = + bfqq->last_wr_start_finish; + BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); + + bfqq->wr_cur_max_time = + bfqd->bfq_wr_rt_max_time; + bfqq->wr_coeff = bfqd->bfq_wr_coeff * + BFQ_SOFTRT_WEIGHT_FACTOR; + bfq_log_bfqq(bfqd, bfqq, + "switching to soft_rt wr"); + } else + bfq_log_bfqq(bfqd, bfqq, + "moving forward soft_rt wr duration"); + bfqq->last_wr_start_finish = jiffies; + } + } +} + +static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + return bfqq->dispatched == 0 && + time_is_before_jiffies( + bfqq->budget_timeout + + bfqd->bfq_wr_min_idle_time); +} + +static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + int old_wr_coeff, + struct request *rq, + bool *interactive) +{ + bool soft_rt, in_burst, wr_or_deserves_wr, + bfqq_wants_to_preempt, + idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq), + /* + * See the comments on + * bfq_bfqq_update_budg_for_activation for + * details on the usage of the next variable. 
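+		 * In short, it records whether the new request
+		 * arrived within three idling slices of the
+		 * completion of the process' last request.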
+ */ + arrived_in_time = ktime_get_ns() <= + RQ_BIC(rq)->ttime.last_end_request + + bfqd->bfq_slice_idle * 3; + + bfq_log_bfqq(bfqd, bfqq, + "bfq_add_request non-busy: " + "jiffies %lu, in_time %d, idle_long %d busyw %d " + "wr_coeff %u", + jiffies, arrived_in_time, + idle_for_long_time, + bfq_bfqq_non_blocking_wait_rq(bfqq), + old_wr_coeff); + + BUG_ON(bfqq->entity.budget < bfqq->entity.service); + + BUG_ON(bfqq == bfqd->in_service_queue); + bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, rq->cmd_flags); + + /* + * bfqq deserves to be weight-raised if: + * - it is sync, + * - it does not belong to a large burst, + * - it has been idle for enough time or is soft real-time, + * - is linked to a bfq_io_cq (it is not shared in any sense) + */ + in_burst = bfq_bfqq_in_large_burst(bfqq); + soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && + !in_burst && + time_is_before_jiffies(bfqq->soft_rt_next_start); + *interactive = + !in_burst && + idle_for_long_time; + wr_or_deserves_wr = bfqd->low_latency && + (bfqq->wr_coeff > 1 || + (bfq_bfqq_sync(bfqq) && + bfqq->bic && (*interactive || soft_rt))); + + bfq_log_bfqq(bfqd, bfqq, + "bfq_add_request: " + "in_burst %d, " + "soft_rt %d (next %lu), inter %d, bic %p", + bfq_bfqq_in_large_burst(bfqq), soft_rt, + bfqq->soft_rt_next_start, + *interactive, + bfqq->bic); + + /* + * Using the last flag, update budget and check whether bfqq + * may want to preempt the in-service queue. + */ + bfqq_wants_to_preempt = + bfq_bfqq_update_budg_for_activation(bfqd, bfqq, + arrived_in_time, + wr_or_deserves_wr); + + /* + * If bfqq happened to be activated in a burst, but has been + * idle for much more than an interactive queue, then we + * assume that, in the overall I/O initiated in the burst, the + * I/O associated with bfqq is finished. So bfqq does not need + * to be treated as a queue belonging to a burst + * anymore. Accordingly, we reset bfqq's in_large_burst flag + * if set, and remove bfqq from the burst list if it's + * there. We do not decrement burst_size, because the fact + * that bfqq does not need to belong to the burst list any + * more does not invalidate the fact that bfqq was created in + * a burst. + */ + if (likely(!bfq_bfqq_just_created(bfqq)) && + idle_for_long_time && + time_is_before_jiffies( + bfqq->budget_timeout + + msecs_to_jiffies(10000))) { + hlist_del_init(&bfqq->burst_list_node); + bfq_clear_bfqq_in_large_burst(bfqq); + } + + bfq_clear_bfqq_just_created(bfqq); + + if (!bfq_bfqq_IO_bound(bfqq)) { + if (arrived_in_time) { + bfqq->requests_within_timer++; + if (bfqq->requests_within_timer >= + bfqd->bfq_requests_within_timer) + bfq_mark_bfqq_IO_bound(bfqq); + } else + bfqq->requests_within_timer = 0; + bfq_log_bfqq(bfqd, bfqq, "requests in time %d", + bfqq->requests_within_timer); + } + + if (bfqd->low_latency) { + if (unlikely(time_is_after_jiffies(bfqq->split_time))) + /* wraparound */ + bfqq->split_time = + jiffies - bfqd->bfq_wr_min_idle_time - 1; + + if (time_is_before_jiffies(bfqq->split_time + + bfqd->bfq_wr_min_idle_time)) { + bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq, + old_wr_coeff, + wr_or_deserves_wr, + *interactive, + in_burst, + soft_rt); + + if (old_wr_coeff != bfqq->wr_coeff) + bfqq->entity.prio_changed = 1; + } + } + + bfqq->last_idle_bklogged = jiffies; + bfqq->service_from_backlogged = 0; + bfq_clear_bfqq_softrt_update(bfqq); + + bfq_add_bfqq_busy(bfqd, bfqq); + + /* + * Expire in-service queue only if preemption may be needed + * for guarantees. 
In this respect, the function + * next_queue_may_preempt just checks a simple, necessary + * condition, and not a sufficient condition based on + * timestamps. In fact, for the latter condition to be + * evaluated, timestamps would need first to be updated, and + * this operation is quite costly (see the comments on the + * function bfq_bfqq_update_budg_for_activation). + */ + if (bfqd->in_service_queue && bfqq_wants_to_preempt && + bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff && + next_queue_may_preempt(bfqd)) { + struct bfq_queue *in_serv = + bfqd->in_service_queue; + BUG_ON(in_serv == bfqq); + + bfq_bfqq_expire(bfqd, bfqd->in_service_queue, + false, BFQ_BFQQ_PREEMPTED); + BUG_ON(in_serv->entity.budget < 0); + } } static void bfq_add_request(struct request *rq) { struct bfq_queue *bfqq = RQ_BFQQ(rq); - struct bfq_entity *entity = &bfqq->entity; struct bfq_data *bfqd = bfqq->bfqd; struct request *next_rq, *prev; - unsigned long old_wr_coeff = bfqq->wr_coeff; + unsigned int old_wr_coeff = bfqq->wr_coeff; bool interactive = false; - bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq)); + bfq_log_bfqq(bfqd, bfqq, "add_request: size %u %s", + blk_rq_sectors(rq), rq_is_sync(rq) ? "S" : "A"); + + if (bfqq->wr_coeff > 1) /* queue is being weight-raised */ + bfq_log_bfqq(bfqd, bfqq, + "raising period dur %u/%u msec, old coeff %u, w %d(%d)", + jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), + jiffies_to_msecs(bfqq->wr_cur_max_time), + bfqq->wr_coeff, + bfqq->entity.weight, bfqq->entity.orig_weight); + bfqq->queued[rq_is_sync(rq)]++; bfqd->queued++; elv_rb_add(&bfqq->sort_list, rq); /* - * Check if this request is a better next-serve candidate. + * Check if this request is a better next-to-serve candidate. */ prev = bfqq->next_rq; next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); @@ -886,160 +1423,10 @@ static void bfq_add_request(struct request *rq) if (prev != bfqq->next_rq) bfq_pos_tree_add_move(bfqd, bfqq); - if (!bfq_bfqq_busy(bfqq)) { - bool soft_rt, coop_or_in_burst, - idle_for_long_time = time_is_before_jiffies( - bfqq->budget_timeout + - bfqd->bfq_wr_min_idle_time); - -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, - rq->cmd_flags); -#endif - if (bfq_bfqq_sync(bfqq)) { - bool already_in_burst = - !hlist_unhashed(&bfqq->burst_list_node) || - bfq_bfqq_in_large_burst(bfqq); - bfq_handle_burst(bfqd, bfqq, idle_for_long_time); - /* - * If bfqq was not already in the current burst, - * then, at this point, bfqq either has been - * added to the current burst or has caused the - * current burst to terminate. In particular, in - * the second case, bfqq has become the first - * queue in a possible new burst. - * In both cases last_ins_in_burst needs to be - * moved forward. 
- */ - if (!already_in_burst) - bfqd->last_ins_in_burst = jiffies; - } - - coop_or_in_burst = bfq_bfqq_in_large_burst(bfqq) || - bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh; - soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && - !coop_or_in_burst && - time_is_before_jiffies(bfqq->soft_rt_next_start); - interactive = !coop_or_in_burst && idle_for_long_time; - entity->budget = max_t(unsigned long, bfqq->max_budget, - bfq_serv_to_charge(next_rq, bfqq)); - - if (!bfq_bfqq_IO_bound(bfqq)) { - if (time_before(jiffies, - RQ_BIC(rq)->ttime.last_end_request + - bfqd->bfq_slice_idle)) { - bfqq->requests_within_timer++; - if (bfqq->requests_within_timer >= - bfqd->bfq_requests_within_timer) - bfq_mark_bfqq_IO_bound(bfqq); - } else - bfqq->requests_within_timer = 0; - } - - if (!bfqd->low_latency) - goto add_bfqq_busy; - - if (bfq_bfqq_just_split(bfqq)) - goto set_prio_changed; - - /* - * If the queue: - * - is not being boosted, - * - has been idle for enough time, - * - is not a sync queue or is linked to a bfq_io_cq (it is - * shared "for its nature" or it is not shared and its - * requests have not been redirected to a shared queue) - * start a weight-raising period. - */ - if (old_wr_coeff == 1 && (interactive || soft_rt) && - (!bfq_bfqq_sync(bfqq) || bfqq->bic)) { - bfqq->wr_coeff = bfqd->bfq_wr_coeff; - if (interactive) - bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); - else - bfqq->wr_cur_max_time = - bfqd->bfq_wr_rt_max_time; - bfq_log_bfqq(bfqd, bfqq, - "wrais starting at %lu, rais_max_time %u", - jiffies, - jiffies_to_msecs(bfqq->wr_cur_max_time)); - } else if (old_wr_coeff > 1) { - if (interactive) - bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); - else if (coop_or_in_burst || - (bfqq->wr_cur_max_time == - bfqd->bfq_wr_rt_max_time && - !soft_rt)) { - bfqq->wr_coeff = 1; - bfq_log_bfqq(bfqd, bfqq, - "wrais ending at %lu, rais_max_time %u", - jiffies, - jiffies_to_msecs(bfqq-> - wr_cur_max_time)); - } else if (time_before( - bfqq->last_wr_start_finish + - bfqq->wr_cur_max_time, - jiffies + - bfqd->bfq_wr_rt_max_time) && - soft_rt) { - /* - * - * The remaining weight-raising time is lower - * than bfqd->bfq_wr_rt_max_time, which means - * that the application is enjoying weight - * raising either because deemed soft-rt in - * the near past, or because deemed interactive - * a long ago. - * In both cases, resetting now the current - * remaining weight-raising time for the - * application to the weight-raising duration - * for soft rt applications would not cause any - * latency increase for the application (as the - * new duration would be higher than the - * remaining time). - * - * In addition, the application is now meeting - * the requirements for being deemed soft rt. - * In the end we can correctly and safely - * (re)charge the weight-raising duration for - * the application with the weight-raising - * duration for soft rt applications. 
- * - * In particular, doing this recharge now, i.e., - * before the weight-raising period for the - * application finishes, reduces the probability - * of the following negative scenario: - * 1) the weight of a soft rt application is - * raised at startup (as for any newly - * created application), - * 2) since the application is not interactive, - * at a certain time weight-raising is - * stopped for the application, - * 3) at that time the application happens to - * still have pending requests, and hence - * is destined to not have a chance to be - * deemed soft rt before these requests are - * completed (see the comments to the - * function bfq_bfqq_softrt_next_start() - * for details on soft rt detection), - * 4) these pending requests experience a high - * latency because the application is not - * weight-raised while they are pending. - */ - bfqq->last_wr_start_finish = jiffies; - bfqq->wr_cur_max_time = - bfqd->bfq_wr_rt_max_time; - } - } -set_prio_changed: - if (old_wr_coeff != bfqq->wr_coeff) - entity->prio_changed = 1; -add_bfqq_busy: - bfqq->last_idle_bklogged = jiffies; - bfqq->service_from_backlogged = 0; - bfq_clear_bfqq_softrt_update(bfqq); - bfq_add_bfqq_busy(bfqd, bfqq); - } else { + if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */ + bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff, + rq, &interactive); + else { if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) && time_is_before_jiffies( bfqq->last_wr_start_finish + @@ -1048,16 +1435,43 @@ static void bfq_add_request(struct request *rq) bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); bfqd->wr_busy_queues++; - entity->prio_changed = 1; + bfqq->entity.prio_changed = 1; bfq_log_bfqq(bfqd, bfqq, - "non-idle wrais starting at %lu, rais_max_time %u", - jiffies, - jiffies_to_msecs(bfqq->wr_cur_max_time)); + "non-idle wrais starting, " + "wr_max_time %u wr_busy %d", + jiffies_to_msecs(bfqq->wr_cur_max_time), + bfqd->wr_busy_queues); } if (prev != bfqq->next_rq) bfq_updated_next_req(bfqd, bfqq); } + /* + * Assign jiffies to last_wr_start_finish in the following + * cases: + * + * . if bfqq is not going to be weight-raised, because, for + * non weight-raised queues, last_wr_start_finish stores the + * arrival time of the last request; as of now, this piece + * of information is used only for deciding whether to + * weight-raise async queues + * + * . if bfqq is not weight-raised, because, if bfqq is now + * switching to weight-raised, then last_wr_start_finish + * stores the time when weight-raising starts + * + * . if bfqq is interactive, because, regardless of whether + * bfqq is currently weight-raised, the weight-raising + * period must start or restart (this case is considered + * separately because it is not detected by the above + * conditions, if bfqq is already weight-raised) + * + * last_wr_start_finish has to be updated also if bfqq is soft + * real-time, because the weight-raising period is constantly + * restarted on idle-to-busy transitions for these queues, but + * this is already done in bfq_bfqq_handle_idle_busy_switch if + * needed. 
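+	 * In short, the update below is skipped only if low_latency
+	 * is off, or if bfqq was already weight-raised, is still
+	 * weight-raised, and is not interactive.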
+ */ if (bfqd->low_latency && (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive)) bfqq->last_wr_start_finish = jiffies; @@ -1074,22 +1488,32 @@ static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, if (!bic) return NULL; - bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); + bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); if (bfqq) return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); return NULL; } -static void bfq_activate_request(struct request_queue *q, struct request *rq) +static sector_t get_sdist(sector_t last_pos, struct request *rq) { - struct bfq_data *bfqd = q->elevator->elevator_data; - - bfqd->rq_in_driver++; - bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); - bfq_log(bfqd, "activate_request: new bfqd->last_position %llu", - (unsigned long long) bfqd->last_position); -} + sector_t sdist = 0; + + if (last_pos) { + if (last_pos < blk_rq_pos(rq)) + sdist = blk_rq_pos(rq) - last_pos; + else + sdist = last_pos - blk_rq_pos(rq); + } + + return sdist; +} + +static void bfq_activate_request(struct request_queue *q, struct request *rq) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + bfqd->rq_in_driver++; +} static void bfq_deactivate_request(struct request_queue *q, struct request *rq) { @@ -1105,6 +1529,9 @@ static void bfq_remove_request(struct request *rq) struct bfq_data *bfqd = bfqq->bfqd; const int sync = rq_is_sync(rq); + BUG_ON(bfqq->entity.service > bfqq->entity.budget && + bfqq == bfqd->in_service_queue); + if (bfqq->next_rq == rq) { bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); bfq_updated_next_req(bfqd, bfqq); @@ -1118,8 +1545,26 @@ static void bfq_remove_request(struct request *rq) elv_rb_del(&bfqq->sort_list, rq); if (RB_EMPTY_ROOT(&bfqq->sort_list)) { - if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) - bfq_del_bfqq_busy(bfqd, bfqq, 1); + bfqq->next_rq = NULL; + + BUG_ON(bfqq->entity.budget < 0); + + if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) { + bfq_del_bfqq_busy(bfqd, bfqq, false); + /* bfqq emptied. In normal operation, when + * bfqq is empty, bfqq->entity.service and + * bfqq->entity.budget must contain, + * respectively, the service received and the + * budget used last time bfqq emptied. These + * facts do not hold in this case, as at least + * this last removal occurred while bfqq is + * not in service. To avoid inconsistencies, + * reset both bfqq->entity.service and + * bfqq->entity.budget. + */ + bfqq->entity.budget = bfqq->entity.service = 0; + } + /* * Remove queue from request-position tree as it is empty. 
*/ @@ -1133,9 +1578,7 @@ static void bfq_remove_request(struct request *rq) BUG_ON(bfqq->meta_pending == 0); bfqq->meta_pending--; } -#ifdef CONFIG_BFQ_GROUP_IOSCHED bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); -#endif } static int bfq_merge(struct request_queue *q, struct request **req, @@ -1145,7 +1588,7 @@ static int bfq_merge(struct request_queue *q, struct request **req, struct request *__rq; __rq = bfq_find_rq_fmerge(bfqd, bio); - if (__rq && elv_rq_merge_ok(__rq, bio)) { + if (__rq && elv_bio_merge_ok(__rq, bio)) { *req = __rq; return ELEVATOR_FRONT_MERGE; } @@ -1190,7 +1633,7 @@ static void bfq_merged_request(struct request_queue *q, struct request *req, static void bfq_bio_merged(struct request_queue *q, struct request *req, struct bio *bio) { - bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio->bi_rw); + bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio->bi_opf); } #endif @@ -1210,7 +1653,7 @@ static void bfq_merged_requests(struct request_queue *q, struct request *rq, */ if (bfqq == next_bfqq && !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && - time_before(next->fifo_time, rq->fifo_time)) { + next->fifo_time < rq->fifo_time) { list_del_init(&rq->queuelist); list_replace_init(&next->queuelist, &rq->queuelist); rq->fifo_time = next->fifo_time; @@ -1220,21 +1663,30 @@ static void bfq_merged_requests(struct request_queue *q, struct request *rq, bfqq->next_rq = rq; bfq_remove_request(next); -#ifdef CONFIG_BFQ_GROUP_IOSCHED bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); -#endif } /* Must be called with bfqq != NULL */ static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) { BUG_ON(!bfqq); + if (bfq_bfqq_busy(bfqq)) bfqq->bfqd->wr_busy_queues--; bfqq->wr_coeff = 1; bfqq->wr_cur_max_time = 0; - /* Trigger a weight change on the next activation of the queue */ + bfqq->last_wr_start_finish = jiffies; + /* + * Trigger a weight change on the next invocation of + * __bfq_entity_update_weight_prio. + */ bfqq->entity.prio_changed = 1; + bfq_log_bfqq(bfqq->bfqd, bfqq, + "end_wr: wrais ending at %lu, rais_max_time %u", + bfqq->last_wr_start_finish, + jiffies_to_msecs(bfqq->wr_cur_max_time)); + bfq_log_bfqq(bfqq->bfqd, bfqq, "end_wr: wr_busy %d", + bfqq->bfqd->wr_busy_queues); } static void bfq_end_wr_async_queues(struct bfq_data *bfqd, @@ -1277,7 +1729,7 @@ static int bfq_rq_close_to_sector(void *io_struct, bool request, sector_t sector) { return abs(bfq_io_struct_pos(io_struct, request) - sector) <= - BFQQ_SEEK_THR; + BFQQ_CLOSE_THR; } static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, @@ -1399,7 +1851,7 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) * throughput. */ bfqq->new_bfqq = new_bfqq; - atomic_add(process_refs, &new_bfqq->ref); + new_bfqq->ref += process_refs; return new_bfqq; } @@ -1430,9 +1882,23 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, } /* - * Attempt to schedule a merge of bfqq with the currently in-service queue - * or with a close queue among the scheduled queues. - * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue + * If this function returns true, then bfqq cannot be merged. The idea + * is that true cooperation happens very early after processes start + * to do I/O. Usually, late cooperations are just accidental false + * positives. In case bfqq is weight-raised, such false positives + * would evidently degrade latency guarantees for bfqq. 
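+ * (Here "very early" is quantified by the 100 ms threshold used
+ * below: a weight-raised queue whose weight-raising period started
+ * more than 100 ms ago is not merged.)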
+ */ +static bool wr_from_too_long(struct bfq_queue *bfqq) +{ + return bfqq->wr_coeff > 1 && + time_is_before_jiffies(bfqq->last_wr_start_finish + + msecs_to_jiffies(100)); +} + +/* + * Attempt to schedule a merge of bfqq with the currently in-service + * queue or with a close queue among the scheduled queues. Return + * NULL if no merge was scheduled, a pointer to the shared bfq_queue * structure otherwise. * * The OOM queue is not allowed to participate to cooperation: in fact, since @@ -1441,6 +1907,18 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, * handle merging with the OOM queue would be quite complex and expensive * to maintain. Besides, in such a critical condition as an out of memory, * the benefits of queue merging may be little relevant, or even negligible. + * + * Weight-raised queues can be merged only if their weight-raising + * period has just started. In fact cooperating processes are usually + * started together. Thus, with this filter we avoid false positives + * that would jeopardize low-latency guarantees. + * + * WARNING: queue merging may impair fairness among non-weight raised + * queues, for at least two reasons: 1) the original weight of a + * merged queue may change during the merged state, 2) even being the + * weight the same, a merged queue may be bloated with many more + * requests than the ones produced by its originally-associated + * process. */ static struct bfq_queue * bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, @@ -1450,16 +1928,32 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (bfqq->new_bfqq) return bfqq->new_bfqq; - if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) + + if (io_struct && wr_from_too_long(bfqq) && + likely(bfqq != &bfqd->oom_bfqq)) + bfq_log_bfqq(bfqd, bfqq, + "would have looked for coop, but bfq%d wr", + bfqq->pid); + + if (!io_struct || + wr_from_too_long(bfqq) || + unlikely(bfqq == &bfqd->oom_bfqq)) return NULL; - /* If device has only one backlogged bfq_queue, don't search. */ + + /* If there is only one backlogged queue, don't search. 
*/ if (bfqd->busy_queues == 1) return NULL; in_service_bfqq = bfqd->in_service_queue; + if (in_service_bfqq && in_service_bfqq != bfqq && + bfqd->in_service_bic && wr_from_too_long(in_service_bfqq) + && likely(in_service_bfqq == &bfqd->oom_bfqq)) + bfq_log_bfqq(bfqd, bfqq, + "would have tried merge with in-service-queue, but wr"); + if (!in_service_bfqq || in_service_bfqq == bfqq || - !bfqd->in_service_bic || + !bfqd->in_service_bic || wr_from_too_long(in_service_bfqq) || unlikely(in_service_bfqq == &bfqd->oom_bfqq)) goto check_scheduled; @@ -1481,7 +1975,15 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent); - if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) && + if (new_bfqq && wr_from_too_long(new_bfqq) && + likely(new_bfqq != &bfqd->oom_bfqq) && + bfq_may_be_close_cooperator(bfqq, new_bfqq)) + bfq_log_bfqq(bfqd, bfqq, + "would have merged with bfq%d, but wr", + new_bfqq->pid); + + if (new_bfqq && !wr_from_too_long(new_bfqq) && + likely(new_bfqq != &bfqd->oom_bfqq) && bfq_may_be_close_cooperator(bfqq, new_bfqq)) return bfq_setup_merge(bfqq, new_bfqq); @@ -1490,53 +1992,25 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, static void bfq_bfqq_save_state(struct bfq_queue *bfqq) { + struct bfq_io_cq *bic = bfqq->bic; + /* * If !bfqq->bic, the queue is already shared or its requests * have already been redirected to a shared queue; both idle window * and weight raising state have already been saved. Do nothing. */ - if (!bfqq->bic) + if (!bic) return; - if (bfqq->bic->wr_time_left) - /* - * This is the queue of a just-started process, and would - * deserve weight raising: we set wr_time_left to the full - * weight-raising duration to trigger weight-raising when - * and if the queue is split and the first request of the - * queue is enqueued. - */ - bfqq->bic->wr_time_left = bfq_wr_duration(bfqq->bfqd); - else if (bfqq->wr_coeff > 1) { - unsigned long wr_duration = - jiffies - bfqq->last_wr_start_finish; - /* - * It may happen that a queue's weight raising period lasts - * longer than its wr_cur_max_time, as weight raising is - * handled only when a request is enqueued or dispatched (it - * does not use any timer). If the weight raising period is - * about to end, don't save it. - */ - if (bfqq->wr_cur_max_time <= wr_duration) - bfqq->bic->wr_time_left = 0; - else - bfqq->bic->wr_time_left = - bfqq->wr_cur_max_time - wr_duration; - /* - * The bfq_queue is becoming shared or the requests of the - * process owning the queue are being redirected to a shared - * queue. Stop the weight raising period of the queue, as in - * both cases it should not be owned by an interactive or - * soft real-time application. 
- */ - bfq_bfqq_end_wr(bfqq); - } else - bfqq->bic->wr_time_left = 0; - bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq); - bfqq->bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); - bfqq->bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); - bfqq->bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); - bfqq->bic->cooperations++; - bfqq->bic->failed_cooperations = 0; + + bic->saved_idle_window = bfq_bfqq_idle_window(bfqq); + bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); + bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); + bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); + bic->saved_wr_coeff = bfqq->wr_coeff; + bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; + bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; + bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; + BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); } static void bfq_get_bic_reference(struct bfq_queue *bfqq) @@ -1561,6 +2035,40 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, if (bfq_bfqq_IO_bound(bfqq)) bfq_mark_bfqq_IO_bound(new_bfqq); bfq_clear_bfqq_IO_bound(bfqq); + + /* + * If bfqq is weight-raised, then let new_bfqq inherit + * weight-raising. To reduce false positives, neglect the case + * where bfqq has just been created, but has not yet made it + * to be weight-raised (which may happen because EQM may merge + * bfqq even before bfq_add_request is executed for the first + * time for bfqq). Handling this case would however be very + * easy, thanks to the flag just_created. + */ + if (new_bfqq->wr_coeff == 1 && bfqq->wr_coeff > 1) { + new_bfqq->wr_coeff = bfqq->wr_coeff; + new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time; + new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish; + new_bfqq->wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; + if (bfq_bfqq_busy(new_bfqq)) + bfqd->wr_busy_queues++; + new_bfqq->entity.prio_changed = 1; + bfq_log_bfqq(bfqd, new_bfqq, + "wr start after merge with %d, rais_max_time %u", + bfqq->pid, + jiffies_to_msecs(bfqq->wr_cur_max_time)); + } + + if (bfqq->wr_coeff > 1) { /* bfqq has given its wr to new_bfqq */ + bfqq->wr_coeff = 1; + bfqq->entity.prio_changed = 1; + if (bfq_bfqq_busy(bfqq)) + bfqd->wr_busy_queues--; + } + + bfq_log_bfqq(bfqd, new_bfqq, "merge_bfqqs: wr_busy %d", + bfqd->wr_busy_queues); + /* * Grab a reference to the bic, to prevent it from being destroyed * before being possibly touched by a bfq_split_bfqq(). @@ -1587,30 +2095,19 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, bfq_put_queue(bfqq); } -static void bfq_bfqq_increase_failed_cooperations(struct bfq_queue *bfqq) -{ - struct bfq_io_cq *bic = bfqq->bic; - struct bfq_data *bfqd = bfqq->bfqd; - - if (bic && bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh) { - bic->failed_cooperations++; - if (bic->failed_cooperations >= bfqd->bfq_failed_cooperations) - bic->cooperations = 0; - } -} - -static int bfq_allow_merge(struct request_queue *q, struct request *rq, - struct bio *bio) +static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq, + struct bio *bio) { struct bfq_data *bfqd = q->elevator->elevator_data; + bool is_sync = op_is_sync(bio->bi_opf); struct bfq_io_cq *bic; struct bfq_queue *bfqq, *new_bfqq; /* * Disallow merge of a sync bio into an async request. */ - if (bfq_bio_sync(bio) && !rq_is_sync(rq)) - return 0; + if (is_sync && !rq_is_sync(rq)) + return false; /* * Lookup the bfqq that this bio will be queued with. 
Allow @@ -1619,9 +2116,9 @@ static int bfq_allow_merge(struct request_queue *q, struct request *rq, */ bic = bfq_bic_lookup(bfqd, current->io_context); if (!bic) - return 0; + return false; - bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); + bfqq = bic_to_bfqq(bic, is_sync); /* * We take advantage of this function to perform an early merge * of the queues of possible cooperating processes. @@ -1636,30 +2133,111 @@ static int bfq_allow_merge(struct request_queue *q, struct request *rq, * to decide whether bio and rq can be merged. */ bfqq = new_bfqq; - } else - bfq_bfqq_increase_failed_cooperations(bfqq); + } } return bfqq == RQ_BFQQ(rq); } +static int bfq_allow_rq_merge(struct request_queue *q, struct request *rq, + struct request *next) +{ + return RQ_BFQQ(rq) == RQ_BFQQ(next); +} + +/* + * Set the maximum time for the in-service queue to consume its + * budget. This prevents seeky processes from lowering the throughput. + * In practice, a time-slice service scheme is used with seeky + * processes. + */ +static void bfq_set_budget_timeout(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + unsigned int timeout_coeff; + + if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time) + timeout_coeff = 1; + else + timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; + + bfqd->last_budget_start = ktime_get(); + + bfqq->budget_timeout = jiffies + + bfqd->bfq_timeout * timeout_coeff; + + bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", + jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff)); +} + static void __bfq_set_in_service_queue(struct bfq_data *bfqd, struct bfq_queue *bfqq) { if (bfqq) { -#ifdef CONFIG_BFQ_GROUP_IOSCHED bfqg_stats_update_avg_queue_size(bfqq_group(bfqq)); -#endif bfq_mark_bfqq_must_alloc(bfqq); - bfq_mark_bfqq_budget_new(bfqq); bfq_clear_bfqq_fifo_expire(bfqq); bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; + BUG_ON(bfqq == bfqd->in_service_queue); + BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); + + if (time_is_before_jiffies(bfqq->last_wr_start_finish) && + bfqq->wr_coeff > 1 && + bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && + time_is_before_jiffies(bfqq->budget_timeout)) { + /* + * For soft real-time queues, move the start + * of the weight-raising period forward by the + * time the queue has not received any + * service. Otherwise, a relatively long + * service delay is likely to cause the + * weight-raising period of the queue to end, + * because of the short duration of the + * weight-raising period of a soft real-time + * queue. It is worth noting that this move + * is not so dangerous for the other queues, + * because soft real-time queues are not + * greedy. + * + * To not add a further variable, we use the + * overloaded field budget_timeout to + * determine for how long the queue has not + * received service, i.e., how much time has + * elapsed since the queue expired. However, + * this is a little imprecise, because + * budget_timeout is set to jiffies if bfqq + * not only expires, but also remains with no + * request. 
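The forward shift of last_wr_start_finish described in the comment above (and implemented right below) only has to add the idle gap to the start of the weight-raising period, so that the time in which the queue received no service does not consume its raising time. A rough userspace model of the arithmetic, with plain unsigned longs instead of jiffies helpers and made-up example values:

#include <stdio.h>

static unsigned long toy_shift_wr_start(unsigned long now,
					unsigned long expired_at, /* budget_timeout, overloaded */
					unsigned long wr_start)
{
	if (expired_at > wr_start)	/* queue expired after wr started */
		return wr_start + (now - expired_at);	/* skip the idle gap */
	return now;			/* fallback, as in the patch */
}

int main(void)
{
	/*
	 * wr started at t=100, queue went idle at t=150, now t=400:
	 * the 250 ticks of inactivity do not count against the wr period
	 */
	printf("new wr start: %lu\n", toy_shift_wr_start(400, 150, 100));
	return 0;
}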
+ */ + if (time_after(bfqq->budget_timeout, + bfqq->last_wr_start_finish)) + bfqq->last_wr_start_finish += + jiffies - bfqq->budget_timeout; + else + bfqq->last_wr_start_finish = jiffies; + + if (time_is_after_jiffies(bfqq->last_wr_start_finish)) { + pr_crit( + "BFQ WARNING:last %lu budget %lu jiffies %lu", + bfqq->last_wr_start_finish, + bfqq->budget_timeout, + jiffies); + pr_crit("diff %lu", jiffies - + max_t(unsigned long, + bfqq->last_wr_start_finish, + bfqq->budget_timeout)); + bfqq->last_wr_start_finish = jiffies; + } + } + + bfq_set_budget_timeout(bfqd, bfqq); bfq_log_bfqq(bfqd, bfqq, "set_in_service_queue, cur-budget = %d", bfqq->entity.budget); - } + } else + bfq_log(bfqd, "set_in_service_queue: NULL"); bfqd->in_service_queue = bfqq; } @@ -1675,36 +2253,11 @@ static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) return bfqq; } -/* - * If enough samples have been computed, return the current max budget - * stored in bfqd, which is dynamically updated according to the - * estimated disk peak rate; otherwise return the default max budget - */ -static int bfq_max_budget(struct bfq_data *bfqd) -{ - if (bfqd->budgets_assigned < bfq_stats_min_budgets) - return bfq_default_max_budget; - else - return bfqd->bfq_max_budget; -} - -/* - * Return min budget, which is a fraction of the current or default - * max budget (trying with 1/32) - */ -static int bfq_min_budget(struct bfq_data *bfqd) -{ - if (bfqd->budgets_assigned < bfq_stats_min_budgets) - return bfq_default_max_budget / 32; - else - return bfqd->bfq_max_budget / 32; -} - static void bfq_arm_slice_timer(struct bfq_data *bfqd) { struct bfq_queue *bfqq = bfqd->in_service_queue; struct bfq_io_cq *bic; - unsigned long sl; + u32 sl; BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); @@ -1728,119 +2281,366 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd) sl = bfqd->bfq_slice_idle; /* * Unless the queue is being weight-raised or the scenario is - * asymmetric, grant only minimum idle time if the queue either - * has been seeky for long enough or has already proved to be - * constantly seeky. + * asymmetric, grant only minimum idle time if the queue + * is seeky. A long idling is preserved for a weight-raised + * queue, or, more in general, in an asymemtric scenario, + * because a long idling is needed for guaranteeing to a queue + * its reserved share of the throughput (in particular, it is + * needed if the queue has a higher weight than some other + * queue). */ - if (bfq_sample_valid(bfqq->seek_samples) && - ((BFQQ_SEEKY(bfqq) && bfqq->entity.service > - bfq_max_budget(bfqq->bfqd) / 8) || - bfq_bfqq_constantly_seeky(bfqq)) && bfqq->wr_coeff == 1 && + if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 && bfq_symmetric_scenario(bfqd)) - sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT)); - else if (bfqq->wr_coeff > 1) - sl = sl * 3; + sl = min_t(u32, sl, BFQ_MIN_TT); + bfqd->last_idling_start = ktime_get(); - mod_timer(&bfqd->idle_slice_timer, jiffies + sl); -#ifdef CONFIG_BFQ_GROUP_IOSCHED + hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl), + HRTIMER_MODE_REL); bfqg_stats_set_start_idle_time(bfqq_group(bfqq)); -#endif - bfq_log(bfqd, "arm idle: %u/%u ms", - jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle)); + bfq_log(bfqd, "arm idle: %ld/%ld ms", + sl / NSEC_PER_MSEC, bfqd->bfq_slice_idle / NSEC_PER_MSEC); } /* - * Set the maximum time for the in-service queue to consume its - * budget. This prevents seeky processes from lowering the disk - * throughput (always guaranteed with a time slice scheme as in CFQ). 
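The new bfq_set_budget_timeout() above scales the base timeout by the ratio between the current and the original weight of the queue, while soft real-time queues keep a coefficient of 1. A toy version of the computation (the base timeout value below is made up for the example):

#include <stdio.h>

#define TOY_TIMEOUT 125	/* base timeout, e.g. in jiffies; example value */

static unsigned long toy_budget_timeout(unsigned long now,
					unsigned int weight,
					unsigned int orig_weight,
					int soft_rt_wr)
{
	unsigned int coeff = soft_rt_wr ? 1 : weight / orig_weight;

	return now + TOY_TIMEOUT * coeff;
}

int main(void)
{
	/*
	 * an interactively weight-raised queue (weight boosted 30x)
	 * gets a proportionally longer slice than a plain queue
	 */
	printf("plain:  %lu\n", toy_budget_timeout(0, 100, 100, 0));
	printf("raised: %lu\n", toy_budget_timeout(0, 3000, 100, 0));
	printf("softrt: %lu\n", toy_budget_timeout(0, 3000, 100, 1));
	return 0;
}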
+ * In autotuning mode, max_budget is dynamically recomputed as the + * amount of sectors transferred in timeout at the estimated peak + * rate. This enables BFQ to utilize a full timeslice with a full + * budget, even if the in-service queue is served at peak rate. And + * this maximises throughput with sequential workloads. */ -static void bfq_set_budget_timeout(struct bfq_data *bfqd) +static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd) { - struct bfq_queue *bfqq = bfqd->in_service_queue; - unsigned int timeout_coeff; + return (u64)bfqd->peak_rate * USEC_PER_MSEC * + jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT; +} - if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time) - timeout_coeff = 1; - else - timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; +/* + * Update parameters related to throughput and responsiveness, as a + * function of the estimated peak rate. See comments on + * bfq_calc_max_budget(), and on T_slow and T_fast arrays. + */ +static void update_thr_responsiveness_params(struct bfq_data *bfqd) +{ + int dev_type = blk_queue_nonrot(bfqd->queue); + + if (bfqd->bfq_user_max_budget == 0) { + bfqd->bfq_max_budget = + bfq_calc_max_budget(bfqd); + BUG_ON(bfqd->bfq_max_budget < 0); + bfq_log(bfqd, "new max_budget = %d", + bfqd->bfq_max_budget); + } - bfqd->last_budget_start = ktime_get(); + if (bfqd->device_speed == BFQ_BFQD_FAST && + bfqd->peak_rate < device_speed_thresh[dev_type]) { + bfqd->device_speed = BFQ_BFQD_SLOW; + bfqd->RT_prod = R_slow[dev_type] * + T_slow[dev_type]; + } else if (bfqd->device_speed == BFQ_BFQD_SLOW && + bfqd->peak_rate > device_speed_thresh[dev_type]) { + bfqd->device_speed = BFQ_BFQD_FAST; + bfqd->RT_prod = R_fast[dev_type] * + T_fast[dev_type]; + } - bfq_clear_bfqq_budget_new(bfqq); - bfqq->budget_timeout = jiffies + - bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff; + bfq_log(bfqd, +"dev_type %s dev_speed_class = %s (%llu sects/sec), thresh %llu setcs/sec", + dev_type == 0 ? "ROT" : "NONROT", + bfqd->device_speed == BFQ_BFQD_FAST ? "FAST" : "SLOW", + bfqd->device_speed == BFQ_BFQD_FAST ? + (USEC_PER_SEC*(u64)R_fast[dev_type])>>BFQ_RATE_SHIFT : + (USEC_PER_SEC*(u64)R_slow[dev_type])>>BFQ_RATE_SHIFT, + (USEC_PER_SEC*(u64)device_speed_thresh[dev_type])>> + BFQ_RATE_SHIFT); +} - bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", - jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * - timeout_coeff)); +static void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq) +{ + if (rq != NULL) { /* new rq dispatch now, reset accordingly */ + bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns() ; + bfqd->peak_rate_samples = 1; + bfqd->sequential_samples = 0; + bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size = + blk_rq_sectors(rq); + } else /* no new rq dispatched, just reset the number of samples */ + bfqd->peak_rate_samples = 0; /* full re-init on next disp. */ + + bfq_log(bfqd, + "reset_rate_computation at end, sample %u/%u tot_sects %llu", + bfqd->peak_rate_samples, bfqd->sequential_samples, + bfqd->tot_sectors_dispatched); } -/* - * Move request from internal lists to the request queue dispatch list. 
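The autotuned max_budget computed by bfq_calc_max_budget() above is simply the number of sectors transferable at the estimated peak rate within one budget timeout. The sketch below models that product; the fixed-point shift and the example rate are assumptions for the illustration, not values taken from the patch:

#include <stdio.h>
#include <stdint.h>

#define TOY_RATE_SHIFT	16	/* assumed fixed-point shift for the rate */

/* peak_rate_fp is (sectors/usec) << TOY_RATE_SHIFT, timeout in usec */
static uint64_t toy_max_budget(uint64_t peak_rate_fp, uint64_t timeout_usec)
{
	return (peak_rate_fp * timeout_usec) >> TOY_RATE_SHIFT;
}

int main(void)
{
	/* ~100 MB/s = ~200k sectors/s = 0.2 sectors/usec, timeout 125 ms */
	uint64_t rate_fp = (uint64_t)(0.2 * (1 << TOY_RATE_SHIFT));

	printf("max budget: %llu sectors\n",
	       (unsigned long long)toy_max_budget(rate_fp, 125 * 1000));
	return 0;
}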
- */ -static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) +static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) { - struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_queue *bfqq = RQ_BFQQ(rq); + u32 rate, weight, divisor; /* - * For consistency, the next instruction should have been executed - * after removing the request from the queue and dispatching it. - * We execute instead this instruction before bfq_remove_request() - * (and hence introduce a temporary inconsistency), for efficiency. - * In fact, in a forced_dispatch, this prevents two counters related - * to bfqq->dispatched to risk to be uselessly decremented if bfqq - * is not in service, and then to be incremented again after - * incrementing bfqq->dispatched. + * For the convergence property to hold (see comments on + * bfq_update_peak_rate()) and for the assessment to be + * reliable, a minimum number of samples must be present, and + * a minimum amount of time must have elapsed. If not so, do + * not compute new rate. Just reset parameters, to get ready + * for a new evaluation attempt. */ - bfqq->dispatched++; - bfq_remove_request(rq); - elv_dispatch_sort(q, rq); + if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES || + bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL) { + bfq_log(bfqd, + "update_rate_reset: only resetting, delta_first %lluus samples %d", + bfqd->delta_from_first>>10, bfqd->peak_rate_samples); + goto reset_computation; + } - if (bfq_bfqq_sync(bfqq)) - bfqd->sync_flight++; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_update_dispatch(bfqq_group(bfqq), blk_rq_bytes(rq), - rq->cmd_flags); -#endif + /* + * If a new request completion has occurred after last + * dispatch, then, to approximate the rate at which requests + * have been served by the device, it is more precise to + * extend the observation interval to the last completion. + */ + bfqd->delta_from_first = + max_t(u64, bfqd->delta_from_first, + bfqd->last_completion - bfqd->first_dispatch); + + BUG_ON(bfqd->delta_from_first == 0); + /* + * Rate computed in sects/usec, and not sects/nsec, for + * precision issues. + */ + rate = div64_ul(bfqd->tot_sectors_dispatched<delta_from_first, NSEC_PER_USEC)); + + bfq_log(bfqd, +"update_rate_reset: tot_sects %llu delta_first %lluus rate %llu sects/s (%d)", + bfqd->tot_sectors_dispatched, bfqd->delta_from_first>>10, + ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), + rate > 20< 20M sectors/sec) + */ + if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 && + rate <= bfqd->peak_rate) || + rate > 20<peak_rate_samples, bfqd->sequential_samples, + ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), + ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); + goto reset_computation; + } else { + bfq_log(bfqd, + "update_rate_reset: do update, samples %u/%u rate/peak %llu/%llu", + bfqd->peak_rate_samples, bfqd->sequential_samples, + ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), + ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); + } + + /* + * We have to update the peak rate, at last! To this purpose, + * we use a low-pass filter. We compute the smoothing constant + * of the filter as a function of the 'weight' of the new + * measured rate. + * + * As can be seen in next formulas, we define this weight as a + * quantity proportional to how sequential the workload is, + * and to how long the observation time interval is. + * + * The weight runs from 0 to 8. The maximum value of the + * weight, 8, yields the minimum value for the smoothing + * constant. 
At this minimum value for the smoothing constant, + * the measured rate contributes for half of the next value of + * the estimated peak rate. + * + * So, the first step is to compute the weight as a function + * of how sequential the workload is. Note that the weight + * cannot reach 9, because bfqd->sequential_samples cannot + * become equal to bfqd->peak_rate_samples, which, in its + * turn, holds true because bfqd->sequential_samples is not + * incremented for the first sample. + */ + weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples; + + /* + * Second step: further refine the weight as a function of the + * duration of the observation interval. + */ + weight = min_t(u32, 8, + div_u64(weight * bfqd->delta_from_first, + BFQ_RATE_REF_INTERVAL)); + + /* + * Divisor ranging from 10, for minimum weight, to 2, for + * maximum weight. + */ + divisor = 10 - weight; + BUG_ON(divisor == 0); + + /* + * Finally, update peak rate: + * + * peak_rate = peak_rate * (divisor-1) / divisor + rate / divisor + */ + bfqd->peak_rate *= divisor-1; + bfqd->peak_rate /= divisor; + rate /= divisor; /* smoothing constant alpha = 1/divisor */ + + bfq_log(bfqd, + "update_rate_reset: divisor %d tmp_peak_rate %llu tmp_rate %u", + divisor, + ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT), + (u32)((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT)); + + BUG_ON(bfqd->peak_rate == 0); + BUG_ON(bfqd->peak_rate > 20<peak_rate += rate; + update_thr_responsiveness_params(bfqd); + BUG_ON(bfqd->peak_rate > 20<peak_rate_samples == 0) { /* first dispatch */ + bfq_log(bfqd, + "update_peak_rate: goto reset, samples %d", + bfqd->peak_rate_samples) ; + bfq_reset_rate_computation(bfqd, rq); + goto update_last_values; /* will add one sample */ + } - if (bfq_bfqq_fifo_expire(bfqq)) - return NULL; + /* + * Device idle for very long: the observation interval lasting + * up to this dispatch cannot be a valid observation interval + * for computing a new peak rate (similarly to the late- + * completion event in bfq_completed_request()). 
Go to + * update_rate_and_reset to have the following three steps + * taken: + * - close the observation interval at the last (previous) + * request dispatch or completion + * - compute rate, if possible, for that observation interval + * - start a new observation interval with this dispatch + */ + if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC && + bfqd->rq_in_driver == 0) { + bfq_log(bfqd, +"update_peak_rate: jumping to updating&resetting delta_last %lluus samples %d", + (now_ns - bfqd->last_dispatch)>>10, + bfqd->peak_rate_samples) ; + goto update_rate_and_reset; + } - bfq_mark_bfqq_fifo_expire(bfqq); + /* Update sampling information */ + bfqd->peak_rate_samples++; - if (list_empty(&bfqq->fifo)) - return NULL; + if ((bfqd->rq_in_driver > 0 || + now_ns - bfqd->last_completion < BFQ_MIN_TT) + && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR) + bfqd->sequential_samples++; - rq = rq_entry_fifo(bfqq->fifo.next); + bfqd->tot_sectors_dispatched += blk_rq_sectors(rq); - if (time_before(jiffies, rq->fifo_time)) - return NULL; + /* Reset max observed rq size every 32 dispatches */ + if (likely(bfqd->peak_rate_samples % 32)) + bfqd->last_rq_max_size = + max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size); + else + bfqd->last_rq_max_size = blk_rq_sectors(rq); - return rq; + bfqd->delta_from_first = now_ns - bfqd->first_dispatch; + + bfq_log(bfqd, + "update_peak_rate: added samples %u/%u tot_sects %llu delta_first %lluus", + bfqd->peak_rate_samples, bfqd->sequential_samples, + bfqd->tot_sectors_dispatched, + bfqd->delta_from_first>>10); + + /* Target observation interval not yet reached, go on sampling */ + if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL) + goto update_last_values; + +update_rate_and_reset: + bfq_update_rate_reset(bfqd, rq); +update_last_values: + bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); + bfqd->last_dispatch = now_ns; + + bfq_log(bfqd, + "update_peak_rate: delta_first %lluus last_pos %llu peak_rate %llu", + (now_ns - bfqd->first_dispatch)>>10, + (unsigned long long) bfqd->last_position, + ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); + bfq_log(bfqd, + "update_peak_rate: samples at end %d", bfqd->peak_rate_samples); } -static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) +/* + * Move request from internal lists to the dispatch list of the request queue + */ +static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) { - struct bfq_entity *entity = &bfqq->entity; + struct bfq_queue *bfqq = RQ_BFQQ(rq); - return entity->budget - entity->service; + /* + * For consistency, the next instruction should have been executed + * after removing the request from the queue and dispatching it. + * We execute instead this instruction before bfq_remove_request() + * (and hence introduce a temporary inconsistency), for efficiency. + * In fact, in a forced_dispatch, this prevents two counters related + * to bfqq->dispatched to risk to be uselessly decremented if bfqq + * is not in service, and then to be incremented again after + * incrementing bfqq->dispatched. 
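The low-pass filter described above can be exercised in isolation. In the sketch below, the reference-interval constant and the sample values are invented; only the weight/divisor arithmetic mirrors the patch:

#include <stdio.h>
#include <stdint.h>

#define TOY_REF_INTERVAL_NS	(3ULL * 1000 * 1000 * 1000)	/* assumed value */

static uint64_t toy_filter_peak_rate(uint64_t peak_rate, uint64_t sample_rate,
				     unsigned int seq_samples,
				     unsigned int tot_samples,
				     uint64_t interval_ns)
{
	unsigned int weight, divisor;

	/* 0..8: how sequential the observed dispatches were */
	weight = (9 * seq_samples) / tot_samples;
	/* scale down for short observation intervals, cap at 8 */
	weight = weight * interval_ns / TOY_REF_INTERVAL_NS;
	if (weight > 8)
		weight = 8;
	divisor = 10 - weight;	/* 10 (weak sample) .. 2 (strong sample) */

	/* peak_rate = peak_rate * (divisor-1) / divisor + rate / divisor */
	return peak_rate * (divisor - 1) / divisor + sample_rate / divisor;
}

int main(void)
{
	/*
	 * a fully sequential, full-length observation pulls the estimate
	 * halfway towards the new sample ...
	 */
	printf("%llu\n", (unsigned long long)
	       toy_filter_peak_rate(1000, 2000, 90, 100, TOY_REF_INTERVAL_NS));
	/* ... while a mostly random, short one moves it much less */
	printf("%llu\n", (unsigned long long)
	       toy_filter_peak_rate(1000, 2000, 10, 100, TOY_REF_INTERVAL_NS / 10));
	return 0;
}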
+ */ + bfqq->dispatched++; + bfq_update_peak_rate(q->elevator->elevator_data, rq); + + bfq_remove_request(rq); + elv_dispatch_sort(q, rq); } static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) { BUG_ON(bfqq != bfqd->in_service_queue); - __bfq_bfqd_reset_in_service(bfqd); - /* * If this bfqq is shared between multiple processes, check * to make sure that those processes are still issuing I/Os @@ -1851,20 +2651,30 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_mark_bfqq_split_coop(bfqq); if (RB_EMPTY_ROOT(&bfqq->sort_list)) { - /* - * Overloading budget_timeout field to store the time - * at which the queue remains with no backlog; used by - * the weight-raising mechanism. - */ - bfqq->budget_timeout = jiffies; - bfq_del_bfqq_busy(bfqd, bfqq, 1); + if (bfqq->dispatched == 0) + /* + * Overloading budget_timeout field to store + * the time at which the queue remains with no + * backlog and no outstanding request; used by + * the weight-raising mechanism. + */ + bfqq->budget_timeout = jiffies; + + bfq_del_bfqq_busy(bfqd, bfqq, true); } else { - bfq_activate_bfqq(bfqd, bfqq); + bfq_requeue_bfqq(bfqd, bfqq); /* * Resort priority tree of potential close cooperators. */ bfq_pos_tree_add_move(bfqd, bfqq); } + + /* + * All in-service entities must have been properly deactivated + * or requeued before executing the next function, which + * resets all in-service entites as no more in service. + */ + __bfq_bfqd_reset_in_service(bfqd); } /** @@ -1883,10 +2693,19 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, struct request *next_rq; int budget, min_budget; - budget = bfqq->max_budget; + BUG_ON(bfqq != bfqd->in_service_queue); + min_budget = bfq_min_budget(bfqd); - BUG_ON(bfqq != bfqd->in_service_queue); + if (bfqq->wr_coeff == 1) + budget = bfqq->max_budget; + else /* + * Use a constant, low budget for weight-raised queues, + * to help achieve a low latency. Keep it slightly higher + * than the minimum possible budget, to cause a little + * bit fewer expirations. + */ + budget = 2 * min_budget; bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d", bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); @@ -1895,7 +2714,7 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); - if (bfq_bfqq_sync(bfqq)) { + if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) { switch (reason) { /* * Caveat: in all the following cases we trade latency @@ -1937,14 +2756,10 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, break; case BFQ_BFQQ_BUDGET_TIMEOUT: /* - * We double the budget here because: 1) it - * gives the chance to boost the throughput if - * this is not a seeky process (which may have - * bumped into this timeout because of, e.g., - * ZBR), 2) together with charge_full_budget - * it helps give seeky processes higher - * timestamps, and hence be served less - * frequently. + * We double the budget here because it gives + * the chance to boost the throughput if this + * is not a seeky process (and has bumped into + * this timeout because of, e.g., ZBR). */ budget = min(budget * 2, bfqd->bfq_max_budget); break; @@ -1961,17 +2776,49 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, budget = min(budget * 4, bfqd->bfq_max_budget); break; case BFQ_BFQQ_NO_MORE_REQUESTS: - /* - * Leave the budget unchanged. 
- */ + /* + * For queues that expire for this reason, it + * is particularly important to keep the + * budget close to the actual service they + * need. Doing so reduces the timestamp + * misalignment problem described in the + * comments in the body of + * __bfq_activate_entity. In fact, suppose + * that a queue systematically expires for + * BFQ_BFQQ_NO_MORE_REQUESTS and presents a + * new request in time to enjoy timestamp + * back-shifting. The larger the budget of the + * queue is with respect to the service the + * queue actually requests in each service + * slot, the more times the queue can be + * reactivated with the same virtual finish + * time. It follows that, even if this finish + * time is pushed to the system virtual time + * to reduce the consequent timestamp + * misalignment, the queue unjustly enjoys for + * many re-activations a lower finish time + * than all newly activated queues. + * + * The service needed by bfqq is measured + * quite precisely by bfqq->entity.service. + * Since bfqq does not enjoy device idling, + * bfqq->entity.service is equal to the number + * of sectors that the process associated with + * bfqq requested to read/write before waiting + * for request completions, or blocking for + * other reasons. + */ + budget = max_t(int, bfqq->entity.service, min_budget); + break; default: return; } - } else + } else if (!bfq_bfqq_sync(bfqq)) /* - * Async queues get always the maximum possible budget - * (their ability to dispatch is limited by - * @bfqd->bfq_max_budget_async_rq). + * Async queues get always the maximum possible + * budget, as for them we do not care about latency + * (in addition, their ability to dispatch is limited + * by the charging factor). */ budget = bfqd->bfq_max_budget; @@ -1982,160 +2829,120 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget); /* - * Make sure that we have enough budget for the next request. - * Since the finish time of the bfqq must be kept in sync with - * the budget, be sure to call __bfq_bfqq_expire() after the + * If there is still backlog, then assign a new budget, making + * sure that it is large enough for the next request. Since + * the finish time of bfqq must be kept in sync with the + * budget, be sure to call __bfq_bfqq_expire() *after* this * update. + * + * If there is no backlog, then no need to update the budget; + * it will be updated on the arrival of a new request. */ next_rq = bfqq->next_rq; - if (next_rq) + if (next_rq) { + BUG_ON(reason == BFQ_BFQQ_TOO_IDLE || + reason == BFQ_BFQQ_NO_MORE_REQUESTS); bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, bfq_serv_to_charge(next_rq, bfqq)); - else - bfqq->entity.budget = bfqq->max_budget; + BUG_ON(!bfq_bfqq_busy(bfqq)); + BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); + } bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d", next_rq ? blk_rq_sectors(next_rq) : 0, bfqq->entity.budget); } -static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout) -{ - unsigned long max_budget; - - /* - * The max_budget calculated when autotuning is equal to the - * amount of sectors transfered in timeout_sync at the - * estimated peak rate. - */ - max_budget = (unsigned long)(peak_rate * 1000 * - timeout >> BFQ_RATE_SHIFT); - - return max_budget; -} - /* - * In addition to updating the peak rate, checks whether the process - * is "slow", and returns 1 if so. 
This slow flag is used, in addition - * to the budget timeout, to reduce the amount of service provided to - * seeky processes, and hence reduce their chances to lower the - * throughput. See the code for more details. + * Return true if the process associated with bfqq is "slow". The slow + * flag is used, in addition to the budget timeout, to reduce the + * amount of service provided to seeky processes, and thus reduce + * their chances to lower the throughput. More details in the comments + * on the function bfq_bfqq_expire(). + * + * An important observation is in order: as discussed in the comments + * on the function bfq_update_peak_rate(), with devices with internal + * queues, it is hard if ever possible to know when and for how long + * an I/O request is processed by the device (apart from the trivial + * I/O pattern where a new request is dispatched only after the + * previous one has been completed). This makes it hard to evaluate + * the real rate at which the I/O requests of each bfq_queue are + * served. In fact, for an I/O scheduler like BFQ, serving a + * bfq_queue means just dispatching its requests during its service + * slot (i.e., until the budget of the queue is exhausted, or the + * queue remains idle, or, finally, a timeout fires). But, during the + * service slot of a bfq_queue, around 100 ms at most, the device may + * be even still processing requests of bfq_queues served in previous + * service slots. On the opposite end, the requests of the in-service + * bfq_queue may be completed after the service slot of the queue + * finishes. + * + * Anyway, unless more sophisticated solutions are used + * (where possible), the sum of the sizes of the requests dispatched + * during the service slot of a bfq_queue is probably the only + * approximation available for the service received by the bfq_queue + * during its service slot. And this sum is the quantity used in this + * function to evaluate the I/O speed of a process. */ -static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bool compensate, enum bfqq_expiration reason) +static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, + bool compensate, enum bfqq_expiration reason, + unsigned long *delta_ms) { - u64 bw, usecs, expected, timeout; - ktime_t delta; - int update = 0; + ktime_t delta_ktime; + u32 delta_usecs; + bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */ - if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq)) + if (!bfq_bfqq_sync(bfqq)) return false; if (compensate) - delta = bfqd->last_idling_start; + delta_ktime = bfqd->last_idling_start; else - delta = ktime_get(); - delta = ktime_sub(delta, bfqd->last_budget_start); - usecs = ktime_to_us(delta); - - /* Don't trust short/unrealistic values. */ - if (usecs < 100 || usecs >= LONG_MAX) - return false; - - /* - * Calculate the bandwidth for the last slice. We use a 64 bit - * value to store the peak rate, in sectors per usec in fixed - * point math. We do so to have enough precision in the estimate - * and to avoid overflows. - */ - bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT; - do_div(bw, (unsigned long)usecs); + delta_ktime = ktime_get(); + delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start); + delta_usecs = ktime_to_us(delta_ktime); + + /* don't trust short/unrealistic values. 
*/ + if (delta_usecs < 1000 || delta_usecs >= LONG_MAX) { + if (blk_queue_nonrot(bfqd->queue)) + /* + * give same worst-case guarantees as idling + * for seeky + */ + *delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC; + else /* charge at least one seek */ + *delta_ms = bfq_slice_idle / NSEC_PER_MSEC; + + bfq_log(bfqd, "bfq_bfqq_is_slow: unrealistic %u", delta_usecs); + + return slow; + } - timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); + *delta_ms = delta_usecs / USEC_PER_MSEC; /* - * Use only long (> 20ms) intervals to filter out spikes for - * the peak rate estimation. + * Use only long (> 20ms) intervals to filter out excessive + * spikes in service rate estimation. */ - if (usecs > 20000) { - if (bw > bfqd->peak_rate || - (!BFQQ_SEEKY(bfqq) && - reason == BFQ_BFQQ_BUDGET_TIMEOUT)) { - bfq_log(bfqd, "measured bw =%llu", bw); - /* - * To smooth oscillations use a low-pass filter with - * alpha=7/8, i.e., - * new_rate = (7/8) * old_rate + (1/8) * bw - */ - do_div(bw, 8); - if (bw == 0) - return 0; - bfqd->peak_rate *= 7; - do_div(bfqd->peak_rate, 8); - bfqd->peak_rate += bw; - update = 1; - bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate); - } - - update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1; - - if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES) - bfqd->peak_rate_samples++; - - if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES && - update) { - int dev_type = blk_queue_nonrot(bfqd->queue); - - if (bfqd->bfq_user_max_budget == 0) { - bfqd->bfq_max_budget = - bfq_calc_max_budget(bfqd->peak_rate, - timeout); - bfq_log(bfqd, "new max_budget=%d", - bfqd->bfq_max_budget); - } - if (bfqd->device_speed == BFQ_BFQD_FAST && - bfqd->peak_rate < device_speed_thresh[dev_type]) { - bfqd->device_speed = BFQ_BFQD_SLOW; - bfqd->RT_prod = R_slow[dev_type] * - T_slow[dev_type]; - } else if (bfqd->device_speed == BFQ_BFQD_SLOW && - bfqd->peak_rate > device_speed_thresh[dev_type]) { - bfqd->device_speed = BFQ_BFQD_FAST; - bfqd->RT_prod = R_fast[dev_type] * - T_fast[dev_type]; - } - } + if (delta_usecs > 20000) { + /* + * Caveat for rotational devices: processes doing I/O + * in the slower disk zones tend to be slow(er) even + * if not seeky. In this respect, the estimated peak + * rate is likely to be an average over the disk + * surface. Accordingly, to not be too harsh with + * unlucky processes, a process is deemed slow only if + * its rate has been lower than half of the estimated + * peak rate. + */ + slow = bfqq->entity.service < bfqd->bfq_max_budget / 2; + bfq_log(bfqd, "bfq_bfqq_is_slow: relative rate %d/%d", + bfqq->entity.service, bfqd->bfq_max_budget); } - /* - * If the process has been served for a too short time - * interval to let its possible sequential accesses prevail on - * the initial seek time needed to move the disk head on the - * first sector it requested, then give the process a chance - * and for the moment return false. - */ - if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8) - return false; - - /* - * A process is considered ``slow'' (i.e., seeky, so that we - * cannot treat it fairly in the service domain, as it would - * slow down too much the other processes) if, when a slice - * ends for whatever reason, it has received service at a - * rate that would not be high enough to complete the budget - * before the budget timeout expiration. 
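Condensed model of the new slow-queue test in bfq_bfqq_is_slow() above: intervals shorter than 1 ms are not trusted, intervals longer than 20 ms compare the service received with half of the maximum budget, and anything in between falls back to the seekiness of the queue. The code below is illustrative only (the thresholds are the patch's, everything else is simplified):

#include <stdbool.h>
#include <stdio.h>

static bool toy_is_slow(unsigned int delta_usecs, unsigned int service_sectors,
			unsigned int max_budget_sectors, bool seeky)
{
	if (delta_usecs < 1000)		/* too short to judge: fall back */
		return seeky;		/* to the seekiness of the queue */

	if (delta_usecs > 20000)	/* long enough: compare the service */
		return service_sectors < max_budget_sectors / 2;

	return seeky;			/* in-between: keep the default */
}

int main(void)
{
	printf("%d\n", toy_is_slow(50000, 4000, 16384, false));	/* 1: slow */
	printf("%d\n", toy_is_slow(50000, 12000, 16384, false));	/* 0 */
	printf("%d\n", toy_is_slow(500, 0, 16384, true));		/* 1 */
	return 0;
}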
- */ - expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT; + bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow); - /* - * Caveat: processes doing IO in the slower disk zones will - * tend to be slow(er) even if not seeky. And the estimated - * peak rate will actually be an average over the disk - * surface. Hence, to not be too harsh with unlucky processes, - * we keep a budget/3 margin of safety before declaring a - * process slow. - */ - return expected > (4 * bfqq->entity.budget) / 3; + return slow; } /* @@ -2193,20 +3000,35 @@ static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, struct bfq_queue *bfqq) { + bfq_log_bfqq(bfqd, bfqq, +"softrt_next_start: service_blkg %lu soft_rate %u sects/sec interval %u", + bfqq->service_from_backlogged, + bfqd->bfq_wr_max_softrt_rate, + jiffies_to_msecs(HZ * bfqq->service_from_backlogged / + bfqd->bfq_wr_max_softrt_rate)); + return max(bfqq->last_idle_bklogged + HZ * bfqq->service_from_backlogged / bfqd->bfq_wr_max_softrt_rate, - jiffies + bfqq->bfqd->bfq_slice_idle + 4); + jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); } /* - * Return the largest-possible time instant such that, for as long as possible, - * the current time will be lower than this time instant according to the macro - * time_is_before_jiffies(). + * Return the farthest future time instant according to jiffies + * macros. */ -static unsigned long bfq_infinity_from_now(unsigned long now) +static unsigned long bfq_greatest_from_now(void) { - return now + ULONG_MAX / 2; + return jiffies + MAX_JIFFY_OFFSET; +} + +/* + * Return the farthest past time instant according to jiffies + * macros. + */ +static unsigned long bfq_smallest_from_now(void) +{ + return jiffies - MAX_JIFFY_OFFSET; } /** @@ -2216,28 +3038,24 @@ static unsigned long bfq_infinity_from_now(unsigned long now) * @compensate: if true, compensate for the time spent idling. * @reason: the reason causing the expiration. * + * If the process associated with bfqq does slow I/O (e.g., because it + * issues random requests), we charge bfqq with the time it has been + * in service instead of the service it has received (see + * bfq_bfqq_charge_time for details on how this goal is achieved). As + * a consequence, bfqq will typically get higher timestamps upon + * reactivation, and hence it will be rescheduled as if it had + * received more service than what it has actually received. In the + * end, bfqq receives less service in proportion to how slowly its + * associated process consumes its budgets (and hence how seriously it + * tends to lower the throughput). In addition, this time-charging + * strategy guarantees time fairness among slow processes. In + * contrast, if the process associated with bfqq is not slow, we + * charge bfqq exactly with the service it has received. * - * If the process associated to the queue is slow (i.e., seeky), or in - * case of budget timeout, or, finally, if it is async, we - * artificially charge it an entire budget (independently of the - * actual service it received). As a consequence, the queue will get - * higher timestamps than the correct ones upon reactivation, and - * hence it will be rescheduled as if it had received more service - * than what it actually received. In the end, this class of processes - * will receive less service in proportion to how slowly they consume - * their budgets (and hence how seriously they tend to lower the - * throughput). 
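The bound returned by bfq_bfqq_softrt_next_start() above can be modelled in userspace as follows; units are reduced to plain ticks and the HZ value is just an example:

#include <stdio.h>

#define TOY_HZ 250	/* example tick rate */

static unsigned long toy_softrt_next_start(unsigned long last_idle_bklogged,
					   unsigned long service_from_backlogged,
					   unsigned long max_softrt_rate, /* sectors/sec */
					   unsigned long now,
					   unsigned long slice_idle_ticks)
{
	/* time needed to generate the backlogged service at the soft-rt cap */
	unsigned long a = last_idle_bklogged +
		TOY_HZ * service_from_backlogged / max_softrt_rate;
	/* never earlier than a little after the current instant */
	unsigned long b = now + slice_idle_ticks + 4;

	return a > b ? a : b;
}

int main(void)
{
	/*
	 * 7000 sectors of backlogged service at a 7000 sectors/s cap:
	 * not soft real-time again before one second (TOY_HZ ticks)
	 * past the last point at which the queue emptied
	 */
	printf("%lu\n", toy_softrt_next_start(1000, 7000, 7000, 1100, 2));
	return 0;
}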
- * - * In contrast, when a queue expires because it has been idling for - * too much or because it exhausted its budget, we do not touch the - * amount of service it has received. Hence when the queue will be - * reactivated and its timestamps updated, the latter will be in sync - * with the actual service received by the queue until expiration. - * - * Charging a full budget to the first type of queues and the exact - * service to the others has the effect of using the WF2Q+ policy to - * schedule the former on a timeslice basis, without violating the - * service domain guarantees of the latter. + * Charging time to the first type of queues and the exact service to + * the other has the effect of using the WF2Q+ policy to schedule the + * former on a timeslice basis, without violating service domain + * guarantees among the latter. */ static void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq, @@ -2245,41 +3063,52 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, enum bfqq_expiration reason) { bool slow; + unsigned long delta = 0; + struct bfq_entity *entity = &bfqq->entity; BUG_ON(bfqq != bfqd->in_service_queue); /* - * Update disk peak rate for autotuning and check whether the - * process is slow (see bfq_update_peak_rate). + * Check whether the process is slow (see bfq_bfqq_is_slow). + */ + slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); + + /* + * Increase service_from_backlogged before next statement, + * because the possible next invocation of + * bfq_bfqq_charge_time would likely inflate + * entity->service. In contrast, service_from_backlogged must + * contain real service, to enable the soft real-time + * heuristic to correctly compute the bandwidth consumed by + * bfqq. */ - slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason); + bfqq->service_from_backlogged += entity->service; /* - * As above explained, 'punish' slow (i.e., seeky), timed-out - * and async queues, to favor sequential sync workloads. + * As above explained, charge slow (typically seeky) and + * timed-out queues with the time and not the service + * received, to favor sequential workloads. * - * Processes doing I/O in the slower disk zones will tend to be - * slow(er) even if not seeky. Hence, since the estimated peak - * rate is actually an average over the disk surface, these - * processes may timeout just for bad luck. To avoid punishing - * them we do not charge a full budget to a process that - * succeeded in consuming at least 2/3 of its budget. + * Processes doing I/O in the slower disk zones will tend to + * be slow(er) even if not seeky. Therefore, since the + * estimated peak rate is actually an average over the disk + * surface, these processes may timeout just for bad luck. To + * avoid punishing them, do not charge time to processes that + * succeeded in consuming at least 2/3 of their budget. This + * allows BFQ to preserve enough elasticity to still perform + * bandwidth, and not time, distribution with little unlucky + * or quasi-sequential processes. 
*/ - if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT && - bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)) - bfq_bfqq_charge_full_budget(bfqq); - - bfqq->service_from_backlogged += bfqq->entity.service; + if (bfqq->wr_coeff == 1 && + (slow || + (reason == BFQ_BFQQ_BUDGET_TIMEOUT && + bfq_bfqq_budget_left(bfqq) >= entity->budget / 3))) + bfq_bfqq_charge_time(bfqd, bfqq, delta); - if (BFQQ_SEEKY(bfqq) && reason == BFQ_BFQQ_BUDGET_TIMEOUT && - !bfq_bfqq_constantly_seeky(bfqq)) { - bfq_mark_bfqq_constantly_seeky(bfqq); - if (!blk_queue_nonrot(bfqd->queue)) - bfqd->const_seeky_busy_in_flight_queues++; - } + BUG_ON(bfqq->entity.budget < bfqq->entity.service); if (reason == BFQ_BFQQ_TOO_IDLE && - bfqq->entity.service <= 2 * bfqq->entity.budget / 10) + entity->service <= 2 * entity->budget / 10) bfq_clear_bfqq_IO_bound(bfqq); if (bfqd->low_latency && bfqq->wr_coeff == 1) @@ -2288,19 +3117,23 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 && RB_EMPTY_ROOT(&bfqq->sort_list)) { /* - * If we get here, and there are no outstanding requests, - * then the request pattern is isochronous (see the comments - * to the function bfq_bfqq_softrt_next_start()). Hence we - * can compute soft_rt_next_start. If, instead, the queue - * still has outstanding requests, then we have to wait - * for the completion of all the outstanding requests to + * If we get here, and there are no outstanding + * requests, then the request pattern is isochronous + * (see the comments on the function + * bfq_bfqq_softrt_next_start()). Thus we can compute + * soft_rt_next_start. If, instead, the queue still + * has outstanding requests, then we have to wait for + * the completion of all the outstanding requests to * discover whether the request pattern is actually * isochronous. */ - if (bfqq->dispatched == 0) + BUG_ON(bfqd->busy_queues < 1); + if (bfqq->dispatched == 0) { bfqq->soft_rt_next_start = bfq_bfqq_softrt_next_start(bfqd, bfqq); - else { + bfq_log_bfqq(bfqd, bfqq, "new soft_rt_next %lu", + bfqq->soft_rt_next_start); + } else { /* * The application is still waiting for the * completion of one or more requests: @@ -2317,7 +3150,7 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, * happened to be in the past. */ bfqq->soft_rt_next_start = - bfq_infinity_from_now(jiffies); + bfq_greatest_from_now(); /* * Schedule an update of soft_rt_next_start to when * the task may be discovered to be isochronous. @@ -2327,15 +3160,27 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, } bfq_log_bfqq(bfqd, bfqq, - "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, - slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq)); + "expire (%d, slow %d, num_disp %d, idle_win %d, weight %d)", + reason, slow, bfqq->dispatched, + bfq_bfqq_idle_window(bfqq), entity->weight); /* * Increase, decrease or leave budget unchanged according to * reason. 
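The condition guarding bfq_bfqq_charge_time() above, i.e., when a queue gets charged time rather than service, reduces to a small predicate. The enum and names below are simplified stand-ins for the patch's types, used only for illustration:

#include <stdbool.h>
#include <stdio.h>

enum toy_reason { TOO_IDLE, BUDGET_TIMEOUT, BUDGET_EXHAUSTED, NO_MORE_REQUESTS };

static bool toy_charge_time(unsigned int wr_coeff, bool slow,
			    enum toy_reason reason, int budget, int service)
{
	int budget_left = budget - service;

	/* weight-raised queues are never punished this way */
	if (wr_coeff > 1)
		return false;

	/*
	 * slow queues always are; timed-out queues only if they used
	 * less than 2/3 of their budget
	 */
	return slow ||
	       (reason == BUDGET_TIMEOUT && budget_left >= budget / 3);
}

int main(void)
{
	printf("%d\n", toy_charge_time(1, false, BUDGET_TIMEOUT, 9000, 2000));	/* 1 */
	printf("%d\n", toy_charge_time(1, false, BUDGET_TIMEOUT, 9000, 7000));	/* 0 */
	printf("%d\n", toy_charge_time(30, true, BUDGET_EXHAUSTED, 9000, 100));	/* 0 */
	return 0;
}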
*/ + BUG_ON(bfqq->entity.budget < bfqq->entity.service); __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); + BUG_ON(bfqq->next_rq == NULL && + bfqq->entity.budget < bfqq->entity.service); __bfq_bfqq_expire(bfqd, bfqq); + + BUG_ON(!bfq_bfqq_busy(bfqq) && reason == BFQ_BFQQ_BUDGET_EXHAUSTED && + !bfq_class_idle(bfqq)); + + if (!bfq_bfqq_busy(bfqq) && + reason != BFQ_BFQQ_BUDGET_TIMEOUT && + reason != BFQ_BFQQ_BUDGET_EXHAUSTED) + bfq_mark_bfqq_non_blocking_wait_rq(bfqq); } /* @@ -2345,20 +3190,17 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, */ static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) { - if (bfq_bfqq_budget_new(bfqq) || - time_before(jiffies, bfqq->budget_timeout)) - return false; - return true; + return time_is_before_eq_jiffies(bfqq->budget_timeout); } /* - * If we expire a queue that is waiting for the arrival of a new - * request, we may prevent the fictitious timestamp back-shifting that - * allows the guarantees of the queue to be preserved (see [1] for - * this tricky aspect). Hence we return true only if this condition - * does not hold, or if the queue is slow enough to deserve only to be - * kicked off for preserving a high throughput. -*/ + * If we expire a queue that is actively waiting (i.e., with the + * device idled) for the arrival of a new request, then we may incur + * the timestamp misalignment problem described in the body of the + * function __bfq_activate_entity. Hence we return true only if this + * condition does not hold, or if the queue is slow enough to deserve + * only to be kicked off for preserving a high throughput. + */ static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) { bfq_log_bfqq(bfqq->bfqd, bfqq, @@ -2400,10 +3242,12 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) { struct bfq_data *bfqd = bfqq->bfqd; bool idling_boosts_thr, idling_boosts_thr_without_issues, - all_queues_seeky, on_hdd_and_not_all_queues_seeky, idling_needed_for_service_guarantees, asymmetric_scenario; + if (bfqd->strict_guarantees) + return true; + /* * The next variable takes into account the cases where idling * boosts the throughput. @@ -2466,74 +3310,27 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) bfqd->wr_busy_queues == 0; /* - * There are then two cases where idling must be performed not + * There is then a case where idling must be performed not * for throughput concerns, but to preserve service - * guarantees. In the description of these cases, we say, for - * short, that a queue is sequential/random if the process - * associated to the queue issues sequential/random requests - * (in the second case the queue may be tagged as seeky or - * even constantly_seeky). - * - * To introduce the first case, we note that, since - * bfq_bfqq_idle_window(bfqq) is false if the device is - * NCQ-capable and bfqq is random (see - * bfq_update_idle_window()), then, from the above two - * assignments it follows that - * idling_boosts_thr_without_issues is false if the device is - * NCQ-capable and bfqq is random. Therefore, for this case, - * device idling would never be allowed if we used just - * idling_boosts_thr_without_issues to decide whether to allow - * it. And, beneficially, this would imply that throughput - * would always be boosted also with random I/O on NCQ-capable - * HDDs. + * guarantees. * - * But we must be careful on this point, to avoid an unfair - * treatment for bfqq. 
In fact, because of the same above - * assignments, idling_boosts_thr_without_issues is, on the - * other hand, true if 1) the device is an HDD and bfqq is - * sequential, and 2) there are no busy weight-raised - * queues. As a consequence, if we used just - * idling_boosts_thr_without_issues to decide whether to idle - * the device, then with an HDD we might easily bump into a - * scenario where queues that are sequential and I/O-bound - * would enjoy idling, whereas random queues would not. The - * latter might then get a low share of the device throughput, - * simply because the former would get many requests served - * after being set as in service, while the latter would not. - * - * To address this issue, we start by setting to true a - * sentinel variable, on_hdd_and_not_all_queues_seeky, if the - * device is rotational and not all queues with pending or - * in-flight requests are constantly seeky (i.e., there are - * active sequential queues, and bfqq might then be mistreated - * if it does not enjoy idling because it is random). - */ - all_queues_seeky = bfq_bfqq_constantly_seeky(bfqq) && - bfqd->busy_in_flight_queues == - bfqd->const_seeky_busy_in_flight_queues; - - on_hdd_and_not_all_queues_seeky = - !blk_queue_nonrot(bfqd->queue) && !all_queues_seeky; - - /* - * To introduce the second case where idling needs to be - * performed to preserve service guarantees, we can note that - * allowing the drive to enqueue more than one request at a - * time, and hence delegating de facto final scheduling - * decisions to the drive's internal scheduler, causes loss of - * control on the actual request service order. In particular, - * the critical situation is when requests from different - * processes happens to be present, at the same time, in the - * internal queue(s) of the drive. In such a situation, the - * drive, by deciding the service order of the - * internally-queued requests, does determine also the actual - * throughput distribution among these processes. But the - * drive typically has no notion or concern about per-process - * throughput distribution, and makes its decisions only on a - * per-request basis. Therefore, the service distribution - * enforced by the drive's internal scheduler is likely to - * coincide with the desired device-throughput distribution - * only in a completely symmetric scenario where: + * To introduce this case, we can note that allowing the drive + * to enqueue more than one request at a time, and hence + * delegating de facto final scheduling decisions to the + * drive's internal scheduler, entails loss of control on the + * actual request service order. In particular, the critical + * situation is when requests from different processes happen + * to be present, at the same time, in the internal queue(s) + * of the drive. In such a situation, the drive, by deciding + * the service order of the internally-queued requests, does + * determine also the actual throughput distribution among + * these processes. But the drive typically has no notion or + * concern about per-process throughput distribution, and + * makes its decisions only on a per-request basis. 
Therefore, + * the service distribution enforced by the drive's internal + * scheduler is likely to coincide with the desired + * device-throughput distribution only in a completely + * symmetric scenario where: * (i) each of these processes must get the same throughput as * the others; * (ii) all these processes have the same I/O pattern @@ -2555,26 +3352,53 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) * words, only if sub-condition (i) holds, then idling is * allowed, and the device tends to be prevented from queueing * many requests, possibly of several processes. The reason - * for not controlling also sub-condition (ii) is that, first, - * in the case of an HDD, the asymmetry in terms of types of - * I/O patterns is already taken in to account in the above - * sentinel variable - * on_hdd_and_not_all_queues_seeky. Secondly, in the case of a - * flash-based device, we prefer however to privilege - * throughput (and idling lowers throughput for this type of - * devices), for the following reasons: - * 1) differently from HDDs, the service time of random - * requests is not orders of magnitudes lower than the service - * time of sequential requests; thus, even if processes doing - * sequential I/O get a preferential treatment with respect to - * others doing random I/O, the consequences are not as - * dramatic as with HDDs; - * 2) if a process doing random I/O does need strong - * throughput guarantees, it is hopefully already being - * weight-raised, or the user is likely to have assigned it a - * higher weight than the other processes (and thus - * sub-condition (i) is likely to be false, which triggers - * idling). + * for not controlling also sub-condition (ii) is that we + * exploit preemption to preserve guarantees in case of + * symmetric scenarios, even if (ii) does not hold, as + * explained in the next two paragraphs. + * + * Even if a queue, say Q, is expired when it remains idle, Q + * can still preempt the new in-service queue if the next + * request of Q arrives soon (see the comments on + * bfq_bfqq_update_budg_for_activation). If all queues and + * groups have the same weight, this form of preemption, + * combined with the hole-recovery heuristic described in the + * comments on function bfq_bfqq_update_budg_for_activation, + * are enough to preserve a correct bandwidth distribution in + * the mid term, even without idling. In fact, even if not + * idling allows the internal queues of the device to contain + * many requests, and thus to reorder requests, we can rather + * safely assume that the internal scheduler still preserves a + * minimum of mid-term fairness. The motivation for using + * preemption instead of idling is that, by not idling, + * service guarantees are preserved without minimally + * sacrificing throughput. In other words, both a high + * throughput and its desired distribution are obtained. + * + * More precisely, this preemption-based, idleless approach + * provides fairness in terms of IOPS, and not sectors per + * second. This can be seen with a simple example. Suppose + * that there are two queues with the same weight, but that + * the first queue receives requests of 8 sectors, while the + * second queue receives requests of 1024 sectors. In + * addition, suppose that each of the two queues contains at + * most one request at a time, which implies that each queue + * always remains idle after it is served. Finally, after + * remaining idle, each queue receives very quickly a new + * request. 
It follows that the two queues are served + * alternatively, preempting each other if needed. This + * implies that, although both queues have the same weight, + * the queue with large requests receives a service that is + * 1024/8 times as high as the service received by the other + * queue. + * + * On the other hand, device idling is performed, and thus + * pure sector-domain guarantees are provided, for the + * following queues, which are likely to need stronger + * throughput guarantees: weight-raised queues, and queues + * with a higher weight than other queues. When such queues + * are active, sub-condition (i) is false, which triggers + * device idling. * * According to the above considerations, the next variable is * true (only) if sub-condition (i) holds. To compute the @@ -2582,7 +3406,7 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) * the function bfq_symmetric_scenario(), but also check * whether bfqq is being weight-raised, because * bfq_symmetric_scenario() does not take into account also - * weight-raised queues (see comments to + * weight-raised queues (see comments on * bfq_weights_tree_add()). * * As a side note, it is worth considering that the above @@ -2604,17 +3428,16 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) * bfqq. Such a case is when bfqq became active in a burst of * queue activations. Queues that became active during a large * burst benefit only from throughput, as discussed in the - * comments to bfq_handle_burst. Thus, if bfqq became active + * comments on bfq_handle_burst. Thus, if bfqq became active * in a burst and not idling the device maximizes throughput, * then the device must no be idled, because not idling the * device provides bfqq and all other queues in the burst with - * maximum benefit. Combining this and the two cases above, we - * can now establish when idling is actually needed to - * preserve service guarantees. + * maximum benefit. Combining this and the above case, we can + * now establish when idling is actually needed to preserve + * service guarantees. */ idling_needed_for_service_guarantees = - (on_hdd_and_not_all_queues_seeky || asymmetric_scenario) && - !bfq_bfqq_in_large_burst(bfqq); + asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq); /* * We have now all the components we need to compute the return @@ -2624,6 +3447,16 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) * 2) idling either boosts the throughput (without issues), or * is necessary to preserve service guarantees. */ + bfq_log_bfqq(bfqd, bfqq, "may_idle: sync %d idling_boosts_thr %d", + bfq_bfqq_sync(bfqq), idling_boosts_thr); + + bfq_log_bfqq(bfqd, bfqq, + "may_idle: wr_busy %d boosts %d IO-bound %d guar %d", + bfqd->wr_busy_queues, + idling_boosts_thr_without_issues, + bfq_bfqq_IO_bound(bfqq), + idling_needed_for_service_guarantees); + return bfq_bfqq_sync(bfqq) && (idling_boosts_thr_without_issues || idling_needed_for_service_guarantees); @@ -2635,7 +3468,7 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) * 1) the queue must remain in service and cannot be expired, and * 2) the device must be idled to wait for the possible arrival of a new * request for the queue. - * See the comments to the function bfq_bfqq_may_idle for the reasons + * See the comments on the function bfq_bfqq_may_idle for the reasons * why performing device idling is the best choice to boost the throughput * and preserve service guarantees when bfq_bfqq_may_idle itself * returns true. 
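Putting the pieces of this discussion together, the final idling decision reduces to the predicate sketched below; the component booleans are taken as inputs here, whereas bfq_bfqq_may_idle() derives them from the queue and device state:

#include <stdbool.h>
#include <stdio.h>

static bool toy_may_idle(bool strict_guarantees, bool sync,
			 bool idling_boosts_thr, int wr_busy_queues,
			 bool asymmetric_scenario, bool in_large_burst)
{
	bool boosts_thr_without_issues, needed_for_guarantees;

	if (strict_guarantees)
		return true;

	/*
	 * idling for throughput is pointless if weight-raised queues
	 * are waiting: it would defer their latency-sensitive I/O
	 */
	boosts_thr_without_issues = idling_boosts_thr && wr_busy_queues == 0;

	/*
	 * idling for service guarantees only matters if weights or I/O
	 * patterns differ, and never for queues born in a large burst
	 */
	needed_for_guarantees = asymmetric_scenario && !in_large_burst;

	return sync && (boosts_thr_without_issues || needed_for_guarantees);
}

int main(void)
{
	printf("%d\n", toy_may_idle(false, true, true, 0, false, false));	/* 1 */
	printf("%d\n", toy_may_idle(false, true, false, 1, true, false));	/* 1 */
	printf("%d\n", toy_may_idle(false, true, true, 1, false, false));	/* 0 */
	return 0;
}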
@@ -2665,18 +3498,33 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); if (bfq_may_expire_for_budg_timeout(bfqq) && - !timer_pending(&bfqd->idle_slice_timer) && + !hrtimer_active(&bfqd->idle_slice_timer) && !bfq_bfqq_must_idle(bfqq)) goto expire; +check_queue: + /* + * This loop is rarely executed more than once. Even when it + * happens, it is much more convenient to re-execute this loop + * than to return NULL and trigger a new dispatch to get a + * request served. + */ next_rq = bfqq->next_rq; /* * If bfqq has requests queued and it has enough budget left to * serve them, keep the queue, otherwise expire it. */ if (next_rq) { + BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); + if (bfq_serv_to_charge(next_rq, bfqq) > bfq_bfqq_budget_left(bfqq)) { + /* + * Expire the queue for budget exhaustion, + * which makes sure that the next budget is + * enough to serve the next request, even if + * it comes from the fifo expired path. + */ reason = BFQ_BFQQ_BUDGET_EXHAUSTED; goto expire; } else { @@ -2685,7 +3533,8 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) * not disable disk idling even when a new request * arrives. */ - if (timer_pending(&bfqd->idle_slice_timer)) { + if (bfq_bfqq_wait_request(bfqq)) { + BUG_ON(!hrtimer_active(&bfqd->idle_slice_timer)); /* * If we get here: 1) at least a new request * has arrived but we have not disabled the @@ -2700,10 +3549,8 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) * So we disable idling. */ bfq_clear_bfqq_wait_request(bfqq); - del_timer(&bfqd->idle_slice_timer); -#ifdef CONFIG_BFQ_GROUP_IOSCHED + hrtimer_try_to_cancel(&bfqd->idle_slice_timer); bfqg_stats_update_idle_time(bfqq_group(bfqq)); -#endif } goto keep_queue; } @@ -2714,7 +3561,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) * for a new request, or has requests waiting for a completion and * may idle after their completion, then keep it anyway. */ - if (timer_pending(&bfqd->idle_slice_timer) || + if (hrtimer_active(&bfqd->idle_slice_timer) || (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) { bfqq = NULL; goto keep_queue; @@ -2725,9 +3572,16 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) bfq_bfqq_expire(bfqd, bfqq, false, reason); new_queue: bfqq = bfq_set_in_service_queue(bfqd); - bfq_log(bfqd, "select_queue: new queue %d returned", - bfqq ? 
bfqq->pid : 0); + if (bfqq) { + bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue"); + goto check_queue; + } keep_queue: + if (bfqq) + bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue"); + else + bfq_log(bfqd, "select_queue: no queue returned"); + return bfqq; } @@ -2736,6 +3590,9 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) struct bfq_entity *entity = &bfqq->entity; if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */ + BUG_ON(bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && + time_is_after_jiffies(bfqq->last_wr_start_finish)); + bfq_log_bfqq(bfqd, bfqq, "raising period dur %u/%u msec, old coeff %u, w %d(%d)", jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), @@ -2749,22 +3606,30 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change"); /* - * If the queue was activated in a burst, or - * too much time has elapsed from the beginning - * of this weight-raising period, or the queue has - * exceeded the acceptable number of cooperations, - * then end weight raising. + * If the queue was activated in a burst, or too much + * time has elapsed from the beginning of this + * weight-raising period, then end weight raising. */ - if (bfq_bfqq_in_large_burst(bfqq) || - bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh || - time_is_before_jiffies(bfqq->last_wr_start_finish + - bfqq->wr_cur_max_time)) { - bfqq->last_wr_start_finish = jiffies; - bfq_log_bfqq(bfqd, bfqq, - "wrais ending at %lu, rais_max_time %u", - bfqq->last_wr_start_finish, - jiffies_to_msecs(bfqq->wr_cur_max_time)); + if (bfq_bfqq_in_large_burst(bfqq)) bfq_bfqq_end_wr(bfqq); + else if (time_is_before_jiffies(bfqq->last_wr_start_finish + + bfqq->wr_cur_max_time)) { + if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time || + time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt + + bfq_wr_duration(bfqd))) + bfq_bfqq_end_wr(bfqq); + else { + /* switch back to interactive wr */ + bfqq->wr_coeff = bfqd->bfq_wr_coeff; + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + bfqq->last_wr_start_finish = + bfqq->wr_start_at_switch_to_srt; + BUG_ON(time_is_after_jiffies( + bfqq->last_wr_start_finish)); + bfqq->entity.prio_changed = 1; + bfq_log_bfqq(bfqd, bfqq, + "back to interactive wr"); + } } } /* Update weight both if it must be raised and if it must be lowered */ @@ -2782,46 +3647,34 @@ static int bfq_dispatch_request(struct bfq_data *bfqd, struct bfq_queue *bfqq) { int dispatched = 0; - struct request *rq; + struct request *rq = bfqq->next_rq; unsigned long service_to_charge; BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); - - /* Follow expired path, else get first next available. */ - rq = bfq_check_fifo(bfqq); - if (!rq) - rq = bfqq->next_rq; + BUG_ON(!rq); service_to_charge = bfq_serv_to_charge(rq, bfqq); - if (service_to_charge > bfq_bfqq_budget_left(bfqq)) { - /* - * This may happen if the next rq is chosen in fifo order - * instead of sector order. The budget is properly - * dimensioned to be always sufficient to serve the next - * request only if it is chosen in sector order. The reason - * is that it would be quite inefficient and little useful - * to always make sure that the budget is large enough to - * serve even the possible next rq in fifo order. - * In fact, requests are seldom served in fifo order. - * - * Expire the queue for budget exhaustion, and make sure - * that the next act_budget is enough to serve the next - * request, even if it comes from the fifo expired path. 
- */ - bfqq->next_rq = rq; - /* - * Since this dispatch is failed, make sure that - * a new one will be performed - */ - if (!bfqd->rq_in_driver) - bfq_schedule_dispatch(bfqd); - goto expire; - } + BUG_ON(service_to_charge > bfq_bfqq_budget_left(bfqq)); + + BUG_ON(bfqq->entity.budget < bfqq->entity.service); - /* Finally, insert request into driver dispatch list. */ bfq_bfqq_served(bfqq, service_to_charge); + + BUG_ON(bfqq->entity.budget < bfqq->entity.service); + bfq_dispatch_insert(bfqd->queue, rq); + /* + * If weight raising has to terminate for bfqq, then next + * function causes an immediate update of bfqq's weight, + * without waiting for next activation. As a consequence, on + * expiration, bfqq will be timestamped as if has never been + * weight-raised during this service slot, even if it has + * received part or even most of the service as a + * weight-raised queue. This inflates bfqq's timestamps, which + * is beneficial, as bfqq is then more willing to leave the + * device immediately to possible other weight-raised queues. + */ bfq_update_wr_data(bfqd, bfqq); bfq_log_bfqq(bfqd, bfqq, @@ -2837,9 +3690,7 @@ static int bfq_dispatch_request(struct bfq_data *bfqd, bfqd->in_service_bic = RQ_BIC(rq); } - if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) && - dispatched >= bfqd->bfq_max_budget_async_rq) || - bfq_class_idle(bfqq))) + if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq)) goto expire; return dispatched; @@ -2885,8 +3736,8 @@ static int bfq_forced_dispatch(struct bfq_data *bfqd) st = bfq_entity_service_tree(&bfqq->entity); dispatched += __bfq_forced_dispatch_bfqq(bfqq); - bfqq->max_budget = bfq_max_budget(bfqd); + bfqq->max_budget = bfq_max_budget(bfqd); bfq_forget_idle(st); } @@ -2899,37 +3750,37 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) { struct bfq_data *bfqd = q->elevator->elevator_data; struct bfq_queue *bfqq; - int max_dispatch; bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); + if (bfqd->busy_queues == 0) return 0; if (unlikely(force)) return bfq_forced_dispatch(bfqd); + /* + * Force device to serve one request at a time if + * strict_guarantees is true. Forcing this service scheme is + * currently the ONLY way to guarantee that the request + * service order enforced by the scheduler is respected by a + * queueing device. Otherwise the device is free even to make + * some unlucky request wait for as long as the device + * wishes. + * + * Of course, serving one request at at time may cause loss of + * throughput. + */ + if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0) + return 0; + bfqq = bfq_select_queue(bfqd); if (!bfqq) return 0; - if (bfq_class_idle(bfqq)) - max_dispatch = 1; - - if (!bfq_bfqq_sync(bfqq)) - max_dispatch = bfqd->bfq_max_budget_async_rq; - - if (!bfq_bfqq_sync(bfqq) && bfqq->dispatched >= max_dispatch) { - if (bfqd->busy_queues > 1) - return 0; - if (bfqq->dispatched >= 4 * max_dispatch) - return 0; - } - - if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq)) - return 0; + BUG_ON(bfqq->entity.budget < bfqq->entity.service); - bfq_clear_bfqq_wait_request(bfqq); - BUG_ON(timer_pending(&bfqd->idle_slice_timer)); + BUG_ON(bfq_bfqq_wait_request(bfqq)); if (!bfq_dispatch_request(bfqd, bfqq)) return 0; @@ -2937,6 +3788,8 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) bfq_log_bfqq(bfqd, bfqq, "dispatched %s request", bfq_bfqq_sync(bfqq) ? 
"sync" : "async"); + BUG_ON(bfqq->next_rq == NULL && + bfqq->entity.budget < bfqq->entity.service); return 1; } @@ -2948,23 +3801,21 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) */ static void bfq_put_queue(struct bfq_queue *bfqq) { - struct bfq_data *bfqd = bfqq->bfqd; #ifdef CONFIG_BFQ_GROUP_IOSCHED struct bfq_group *bfqg = bfqq_group(bfqq); #endif - BUG_ON(atomic_read(&bfqq->ref) <= 0); + BUG_ON(bfqq->ref <= 0); - bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq, - atomic_read(&bfqq->ref)); - if (!atomic_dec_and_test(&bfqq->ref)) + bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref); + bfqq->ref--; + if (bfqq->ref) return; BUG_ON(rb_first(&bfqq->sort_list)); BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); BUG_ON(bfqq->entity.tree); BUG_ON(bfq_bfqq_busy(bfqq)); - BUG_ON(bfqd->in_service_queue == bfqq); if (bfq_bfqq_sync(bfqq)) /* @@ -2977,7 +3828,7 @@ static void bfq_put_queue(struct bfq_queue *bfqq) */ hlist_del_init(&bfqq->burst_list_node); - bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq); + bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); kmem_cache_free(bfq_pool, bfqq); #ifdef CONFIG_BFQ_GROUP_IOSCHED @@ -3011,8 +3862,7 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_schedule_dispatch(bfqd); } - bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, - atomic_read(&bfqq->ref)); + bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref); bfq_put_cooperator(bfqq); @@ -3021,28 +3871,7 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) static void bfq_init_icq(struct io_cq *icq) { - struct bfq_io_cq *bic = icq_to_bic(icq); - - bic->ttime.last_end_request = jiffies; - /* - * A newly created bic indicates that the process has just - * started doing I/O, and is probably mapping into memory its - * executable and libraries: it definitely needs weight raising. - * There is however the possibility that the process performs, - * for a while, I/O close to some other process. EQM intercepts - * this behavior and may merge the queue corresponding to the - * process with some other queue, BEFORE the weight of the queue - * is raised. Merged queues are not weight-raised (they are assumed - * to belong to processes that benefit only from high throughput). - * If the merge is basically the consequence of an accident, then - * the queue will be split soon and will get back its old weight. - * It is then important to write down somewhere that this queue - * does need weight raising, even if it did not make it to get its - * weight raised before being merged. To this purpose, we overload - * the field raising_time_left and assign 1 to it, to mark the queue - * as needing weight raising. - */ - bic->wr_time_left = 1; + icq_to_bic(icq)->ttime.last_end_request = ktime_get_ns() - (1ULL<<32); } static void bfq_exit_icq(struct io_cq *icq) @@ -3050,21 +3879,21 @@ static void bfq_exit_icq(struct io_cq *icq) struct bfq_io_cq *bic = icq_to_bic(icq); struct bfq_data *bfqd = bic_to_bfqd(bic); - if (bic->bfqq[BLK_RW_ASYNC]) { - bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]); - bic->bfqq[BLK_RW_ASYNC] = NULL; + if (bic_to_bfqq(bic, false)) { + bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, false)); + bic_set_bfqq(bic, NULL, false); } - if (bic->bfqq[BLK_RW_SYNC]) { + if (bic_to_bfqq(bic, true)) { /* * If the bic is using a shared queue, put the reference * taken on the io_context when the bic started using a * shared bfq_queue. 
*/ - if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC])) + if (bfq_bfqq_coop(bic_to_bfqq(bic, true))) put_io_context(icq->ioc); - bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]); - bic->bfqq[BLK_RW_SYNC] = NULL; + bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, true)); + bic_set_bfqq(bic, NULL, true); } } @@ -3072,8 +3901,8 @@ static void bfq_exit_icq(struct io_cq *icq) * Update the entity prio values; note that the new values will not * be used until the next (re)activation. */ -static void -bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) +static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, + struct bfq_io_cq *bic) { struct task_struct *tsk = current; int ioprio_class; @@ -3105,7 +3934,7 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) break; } - if (bfqq->new_ioprio < 0 || bfqq->new_ioprio >= IOPRIO_BE_NR) { + if (bfqq->new_ioprio >= IOPRIO_BE_NR) { pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n", bfqq->new_ioprio); BUG(); @@ -3113,45 +3942,40 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); bfqq->entity.prio_changed = 1; + bfq_log_bfqq(bfqq->bfqd, bfqq, + "set_next_ioprio_data: bic_class %d prio %d class %d", + ioprio_class, bfqq->new_ioprio, bfqq->new_ioprio_class); } static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) { - struct bfq_data *bfqd; - struct bfq_queue *bfqq, *new_bfqq; + struct bfq_data *bfqd = bic_to_bfqd(bic); + struct bfq_queue *bfqq; unsigned long uninitialized_var(flags); int ioprio = bic->icq.ioc->ioprio; - bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), - &flags); /* * This condition may trigger on a newly created bic, be sure to * drop the lock before returning. 
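The hunks above replace direct indexing of bic->bfqq[BLK_RW_SYNC/ASYNC] with the bool-keyed bic_to_bfqq()/bic_set_bfqq() accessors. A minimal sketch of such accessors, assuming the usual two-element array in struct bfq_io_cq (the actual definitions live in bfq.h, outside this section):

    static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic,
                                                bool is_sync)
    {
            return bic->bfqq[is_sync];
    }

    static inline void bic_set_bfqq(struct bfq_io_cq *bic,
                                    struct bfq_queue *bfqq, bool is_sync)
    {
            bic->bfqq[is_sync] = bfqq;
    }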
*/ if (unlikely(!bfqd) || likely(bic->ioprio == ioprio)) - goto out; + return; bic->ioprio = ioprio; - bfqq = bic->bfqq[BLK_RW_ASYNC]; + bfqq = bic_to_bfqq(bic, false); if (bfqq) { - new_bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic, - GFP_ATOMIC); - if (new_bfqq) { - bic->bfqq[BLK_RW_ASYNC] = new_bfqq; - bfq_log_bfqq(bfqd, bfqq, - "check_ioprio_change: bfqq %p %d", - bfqq, atomic_read(&bfqq->ref)); - bfq_put_queue(bfqq); - } + bfq_put_queue(bfqq); + bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); + bic_set_bfqq(bic, bfqq, false); + bfq_log_bfqq(bfqd, bfqq, + "check_ioprio_change: bfqq %p %d", + bfqq, bfqq->ref); } - bfqq = bic->bfqq[BLK_RW_SYNC]; + bfqq = bic_to_bfqq(bic, true); if (bfqq) bfq_set_next_ioprio_data(bfqq, bic); - -out: - bfq_put_bfqd_unlock(bfqd, &flags); } static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, @@ -3160,8 +3984,9 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, RB_CLEAR_NODE(&bfqq->entity.rb_node); INIT_LIST_HEAD(&bfqq->fifo); INIT_HLIST_NODE(&bfqq->burst_list_node); + BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); - atomic_set(&bfqq->ref, 0); + bfqq->ref = 0; bfqq->bfqd = bfqd; if (bic) @@ -3171,6 +3996,7 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (!bfq_class_idle(bfqq)) bfq_mark_bfqq_idle_window(bfqq); bfq_mark_bfqq_sync(bfqq); + bfq_mark_bfqq_just_created(bfqq); } else bfq_clear_bfqq_sync(bfqq); bfq_mark_bfqq_IO_bound(bfqq); @@ -3180,72 +4006,19 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfqq->pid = pid; bfqq->wr_coeff = 1; - bfqq->last_wr_start_finish = 0; + bfqq->last_wr_start_finish = jiffies; + bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now(); + bfqq->budget_timeout = bfq_smallest_from_now(); + bfqq->split_time = bfq_smallest_from_now(); + /* * Set to the value for which bfqq will not be deemed as * soft rt when it becomes backlogged. */ - bfqq->soft_rt_next_start = bfq_infinity_from_now(jiffies); -} - -static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd, - struct bio *bio, int is_sync, - struct bfq_io_cq *bic, - gfp_t gfp_mask) -{ - struct bfq_group *bfqg; - struct bfq_queue *bfqq, *new_bfqq = NULL; - struct blkcg *blkcg; - -retry: - rcu_read_lock(); - - blkcg = bio_blkcg(bio); - bfqg = bfq_find_alloc_group(bfqd, blkcg); - /* bic always exists here */ - bfqq = bic_to_bfqq(bic, is_sync); - - /* - * Always try a new alloc if we fall back to the OOM bfqq - * originally, since it should just be a temporary situation. 
- */ - if (!bfqq || bfqq == &bfqd->oom_bfqq) { - bfqq = NULL; - if (new_bfqq) { - bfqq = new_bfqq; - new_bfqq = NULL; - } else if (gfpflags_allow_blocking(gfp_mask)) { - rcu_read_unlock(); - spin_unlock_irq(bfqd->queue->queue_lock); - new_bfqq = kmem_cache_alloc_node(bfq_pool, - gfp_mask | __GFP_ZERO, - bfqd->queue->node); - spin_lock_irq(bfqd->queue->queue_lock); - if (new_bfqq) - goto retry; - } else { - bfqq = kmem_cache_alloc_node(bfq_pool, - gfp_mask | __GFP_ZERO, - bfqd->queue->node); - } - - if (bfqq) { - bfq_init_bfqq(bfqd, bfqq, bic, current->pid, - is_sync); - bfq_init_entity(&bfqq->entity, bfqg); - bfq_log_bfqq(bfqd, bfqq, "allocated"); - } else { - bfqq = &bfqd->oom_bfqq; - bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); - } - } - - if (new_bfqq) - kmem_cache_free(bfq_pool, new_bfqq); - - rcu_read_unlock(); + bfqq->soft_rt_next_start = bfq_greatest_from_now(); - return bfqq; + /* first request is almost certainly seeky */ + bfqq->seek_history = 1; } static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, @@ -3268,90 +4041,93 @@ static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, } static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, - struct bio *bio, int is_sync, - struct bfq_io_cq *bic, gfp_t gfp_mask) + struct bio *bio, bool is_sync, + struct bfq_io_cq *bic) { const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); struct bfq_queue **async_bfqq = NULL; - struct bfq_queue *bfqq = NULL; + struct bfq_queue *bfqq; + struct bfq_group *bfqg; - if (!is_sync) { - struct blkcg *blkcg; - struct bfq_group *bfqg; + rcu_read_lock(); + + bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio)); + if (!bfqg) { + bfqq = &bfqd->oom_bfqq; + goto out; + } - rcu_read_lock(); - blkcg = bio_blkcg(bio); - rcu_read_unlock(); - bfqg = bfq_find_alloc_group(bfqd, blkcg); + if (!is_sync) { async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, ioprio); bfqq = *async_bfqq; + if (bfqq) + goto out; } - if (!bfqq) - bfqq = bfq_find_alloc_queue(bfqd, bio, is_sync, bic, gfp_mask); + bfqq = kmem_cache_alloc_node(bfq_pool, + GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN, + bfqd->queue->node); + + if (bfqq) { + bfq_init_bfqq(bfqd, bfqq, bic, current->pid, + is_sync); + bfq_init_entity(&bfqq->entity, bfqg); + bfq_log_bfqq(bfqd, bfqq, "allocated"); + } else { + bfqq = &bfqd->oom_bfqq; + bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); + goto out; + } /* * Pin the queue now that it's allocated, scheduler exit will * prune it. */ - if (!is_sync && !(*async_bfqq)) { - atomic_inc(&bfqq->ref); + if (async_bfqq) { + bfqq->ref++; /* + * Extra group reference, w.r.t. sync + * queue. This extra reference is removed + * only if bfqq->bfqg disappears, to + * guarantee that this queue is not freed + * until its group goes away. 
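bfq_init_bfqq() above seeds several timestamps with bfq_smallest_from_now() and bfq_greatest_from_now(), helpers that are not shown in this section. A sketch of what they presumably look like, assumed to mirror the mainline versions: sentinels as far in the past/future as jiffies arithmetic allows, so the fields read as "long ago" or "never" despite wrap-around.

    static unsigned long bfq_smallest_from_now(void)
    {
            return jiffies - MAX_JIFFY_OFFSET;
    }

    static unsigned long bfq_greatest_from_now(void)
    {
            return jiffies + MAX_JIFFY_OFFSET;
    }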
+ */ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", - bfqq, atomic_read(&bfqq->ref)); + bfqq, bfqq->ref); *async_bfqq = bfqq; } - atomic_inc(&bfqq->ref); - bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, - atomic_read(&bfqq->ref)); +out: + bfqq->ref++; + bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref); + rcu_read_unlock(); return bfqq; } static void bfq_update_io_thinktime(struct bfq_data *bfqd, struct bfq_io_cq *bic) { - unsigned long elapsed = jiffies - bic->ttime.last_end_request; - unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle); + struct bfq_ttime *ttime = &bic->ttime; + u64 elapsed = ktime_get_ns() - bic->ttime.last_end_request; - bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; - bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8; - bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) / - bic->ttime.ttime_samples; + elapsed = min_t(u64, elapsed, 2 * bfqd->bfq_slice_idle); + + ttime->ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; + ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8); + ttime->ttime_mean = div64_ul(ttime->ttime_total + 128, + ttime->ttime_samples); } -static void bfq_update_io_seektime(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - struct request *rq) +static void +bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct request *rq) { - sector_t sdist; - u64 total; - - if (bfqq->last_request_pos < blk_rq_pos(rq)) - sdist = blk_rq_pos(rq) - bfqq->last_request_pos; - else - sdist = bfqq->last_request_pos - blk_rq_pos(rq); - - /* - * Don't allow the seek distance to get too large from the - * odd fragment, pagein, etc. - */ - if (bfqq->seek_samples == 0) /* first request, not really a seek */ - sdist = 0; - else if (bfqq->seek_samples <= 60) /* second & third seek */ - sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024); - else - sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64); - - bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8; - bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8; - total = bfqq->seek_total + (bfqq->seek_samples/2); - do_div(total, bfqq->seek_samples); - bfqq->seek_mean = (sector_t)total; - - bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist, - (u64)bfqq->seek_mean); + bfqq->seek_history <<= 1; + bfqq->seek_history |= + get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR && + (!blk_queue_nonrot(bfqd->queue) || + blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT); } /* @@ -3369,7 +4145,8 @@ static void bfq_update_idle_window(struct bfq_data *bfqd, return; /* Idle window just restored, statistics are meaningless. 
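The rewritten bfq_update_io_seektime() above keeps a 32-bit shift register: one bit per recent request, set when the request landed far from the previous one (or was small on a non-rotational device). Seekiness then becomes a population-count test on that window; a sketch with an illustrative threshold (the real BFQQ_SEEKY() definition lives in bfq.h, not in this hunk):

    static bool bfqq_looks_seeky(u32 seek_history)
    {
            /* illustrative threshold; bfq.h defines the real BFQQ_SEEKY() */
            return hweight32(seek_history) > 4;
    }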
*/ - if (bfq_bfqq_just_split(bfqq)) + if (time_is_after_eq_jiffies(bfqq->split_time + + bfqd->bfq_wr_min_idle_time)) return; enable_idle = bfq_bfqq_idle_window(bfqq); @@ -3409,22 +4186,13 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfq_update_io_thinktime(bfqd, bic); bfq_update_io_seektime(bfqd, bfqq, rq); - if (!BFQQ_SEEKY(bfqq) && bfq_bfqq_constantly_seeky(bfqq)) { - bfq_clear_bfqq_constantly_seeky(bfqq); - if (!blk_queue_nonrot(bfqd->queue)) { - BUG_ON(!bfqd->const_seeky_busy_in_flight_queues); - bfqd->const_seeky_busy_in_flight_queues--; - } - } if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || !BFQQ_SEEKY(bfqq)) bfq_update_idle_window(bfqd, bfqq, bic); - bfq_clear_bfqq_just_split(bfqq); bfq_log_bfqq(bfqd, bfqq, - "rq_enqueued: idle_window=%d (seeky %d, mean %llu)", - bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq), - (unsigned long long) bfqq->seek_mean); + "rq_enqueued: idle_window=%d (seeky %d)", + bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq)); bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); @@ -3438,14 +4206,15 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, * is small and the queue is not to be expired, then * just exit. * - * In this way, if the disk is being idled to wait for - * a new request from the in-service queue, we avoid - * unplugging the device and committing the disk to serve - * just a small request. On the contrary, we wait for - * the block layer to decide when to unplug the device: - * hopefully, new requests will be merged to this one - * quickly, then the device will be unplugged and - * larger requests will be dispatched. + * In this way, if the device is being idled to wait + * for a new request from the in-service queue, we + * avoid unplugging the device and committing the + * device to serve just a small request. On the + * contrary, we wait for the block layer to decide + * when to unplug the device: hopefully, new requests + * will be merged to this one quickly, then the device + * will be unplugged and larger requests will be + * dispatched. */ if (small_req && !budget_timeout) return; @@ -3457,10 +4226,8 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, * timer. */ bfq_clear_bfqq_wait_request(bfqq); - del_timer(&bfqd->idle_slice_timer); -#ifdef CONFIG_BFQ_GROUP_IOSCHED + hrtimer_try_to_cancel(&bfqd->idle_slice_timer); bfqg_stats_update_idle_time(bfqq_group(bfqq)); -#endif /* * The queue is not empty, because a new request just @@ -3504,28 +4271,20 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq) */ new_bfqq->allocated[rq_data_dir(rq)]++; bfqq->allocated[rq_data_dir(rq)]--; - atomic_inc(&new_bfqq->ref); + new_bfqq->ref++; + bfq_clear_bfqq_just_created(bfqq); bfq_put_queue(bfqq); if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) bfq_merge_bfqqs(bfqd, RQ_BIC(rq), bfqq, new_bfqq); rq->elv.priv[1] = new_bfqq; bfqq = new_bfqq; - } else - bfq_bfqq_increase_failed_cooperations(bfqq); + } } bfq_add_request(rq); - /* - * Here a newly-created bfq_queue has already started a weight-raising - * period: clear raising_time_left to prevent bfq_bfqq_save_state() - * from assigning it a full weight-raising period. See the detailed - * comments about this field in bfq_init_icq(). 
- */ - if (bfqq->bic) - bfqq->bic->wr_time_left = 0; - rq->fifo_time = jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; + rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; list_add_tail(&rq->queuelist, &bfqq->fifo); bfq_rq_enqueued(bfqd, bfqq, rq); @@ -3533,8 +4292,8 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq) static void bfq_update_hw_tag(struct bfq_data *bfqd) { - bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver, - bfqd->rq_in_driver); + bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver, + bfqd->rq_in_driver); if (bfqd->hw_tag == 1) return; @@ -3560,48 +4319,85 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) { struct bfq_queue *bfqq = RQ_BFQQ(rq); struct bfq_data *bfqd = bfqq->bfqd; - bool sync = bfq_bfqq_sync(bfqq); + u64 now_ns; + u32 delta_us; - bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left (%d)", - blk_rq_sectors(rq), sync); + bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left", + blk_rq_sectors(rq)); + assert_spin_locked(bfqd->queue->queue_lock); bfq_update_hw_tag(bfqd); BUG_ON(!bfqd->rq_in_driver); BUG_ON(!bfqq->dispatched); bfqd->rq_in_driver--; bfqq->dispatched--; -#ifdef CONFIG_BFQ_GROUP_IOSCHED bfqg_stats_update_completion(bfqq_group(bfqq), rq_start_time_ns(rq), - rq_io_start_time_ns(rq), rq->cmd_flags); -#endif + rq_io_start_time_ns(rq), + rq->cmd_flags); if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { + BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); + /* + * Set budget_timeout (which we overload to store the + * time at which the queue remains with no backlog and + * no outstanding request; used by the weight-raising + * mechanism). + */ + bfqq->budget_timeout = jiffies; + bfq_weights_tree_remove(bfqd, &bfqq->entity, &bfqd->queue_weights_tree); - if (!blk_queue_nonrot(bfqd->queue)) { - BUG_ON(!bfqd->busy_in_flight_queues); - bfqd->busy_in_flight_queues--; - if (bfq_bfqq_constantly_seeky(bfqq)) { - BUG_ON(!bfqd-> - const_seeky_busy_in_flight_queues); - bfqd->const_seeky_busy_in_flight_queues--; - } - } } - if (sync) { - bfqd->sync_flight--; - RQ_BIC(rq)->ttime.last_end_request = jiffies; - } + now_ns = ktime_get_ns(); + + RQ_BIC(rq)->ttime.last_end_request = now_ns; + + /* + * Using us instead of ns, to get a reasonable precision in + * computing rate in next check. + */ + delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC); + + bfq_log(bfqd, "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu", + delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size, + (USEC_PER_SEC* + (u64)((bfqd->last_rq_max_size<>BFQ_RATE_SHIFT, + (USEC_PER_SEC*(u64)(1UL<<(BFQ_RATE_SHIFT-10)))>>BFQ_RATE_SHIFT); + + /* + * If the request took rather long to complete, and, according + * to the maximum request size recorded, this completion latency + * implies that the request was certainly served at a very low + * rate (less than 1M sectors/sec), then the whole observation + * interval that lasts up to this time instant cannot be a + * valid time interval for computing a new peak rate. 
Invoke + * bfq_update_rate_reset to have the following three steps + * taken: + * - close the observation interval at the last (previous) + * request dispatch or completion + * - compute rate, if possible, for that observation interval + * - reset to zero samples, which will trigger a proper + * re-initialization of the observation interval on next + * dispatch + */ + if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC && + (bfqd->last_rq_max_size<last_completion = now_ns; /* - * If we are waiting to discover whether the request pattern of the - * task associated with the queue is actually isochronous, and - * both requisites for this condition to hold are satisfied, then - * compute soft_rt_next_start (see the comments to the function - * bfq_bfqq_softrt_next_start()). + * If we are waiting to discover whether the request pattern + * of the task associated with the queue is actually + * isochronous, and both requisites for this condition to hold + * are now satisfied, then compute soft_rt_next_start (see the + * comments on the function bfq_bfqq_softrt_next_start()). We + * schedule this delayed check when bfqq expires, if it still + * has in-flight requests. */ if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 && RB_EMPTY_ROOT(&bfqq->sort_list)) @@ -3613,10 +4409,7 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) * or if we want to idle in case it has no pending requests. */ if (bfqd->in_service_queue == bfqq) { - if (bfq_bfqq_budget_new(bfqq)) - bfq_set_budget_timeout(bfqd); - - if (bfq_bfqq_must_idle(bfqq)) { + if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) { bfq_arm_slice_timer(bfqd); goto out; } else if (bfq_may_expire_for_budg_timeout(bfqq)) @@ -3646,7 +4439,7 @@ static int __bfq_may_queue(struct bfq_queue *bfqq) return ELV_MQUEUE_MAY; } -static int bfq_may_queue(struct request_queue *q, int rw) +static int bfq_may_queue(struct request_queue *q, unsigned int op) { struct bfq_data *bfqd = q->elevator->elevator_data; struct task_struct *tsk = current; @@ -3663,7 +4456,7 @@ static int bfq_may_queue(struct request_queue *q, int rw) if (!bic) return ELV_MQUEUE_MAY; - bfqq = bic_to_bfqq(bic, rw_is_sync(rw)); + bfqq = bic_to_bfqq(bic, op_is_sync(op)); if (bfqq) return __bfq_may_queue(bfqq); @@ -3687,14 +4480,14 @@ static void bfq_put_request(struct request *rq) rq->elv.priv[1] = NULL; bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", - bfqq, atomic_read(&bfqq->ref)); + bfqq, bfqq->ref); bfq_put_queue(bfqq); } } /* * Returns NULL if a new bfqq should be allocated, or the old bfqq if this - * was the last process referring to said bfqq. + * was the last process referring to that bfqq. 
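In compact form, the rate-validity check described in the bfq_completed_request() comment above boils down to the following sketch. BFQ_MIN_TT, BFQ_RATE_SHIFT and bfq_update_rate_reset() are defined elsewhere in the patch, and the NULL second argument is an assumption borrowed from the mainline version of this function:

    /* If the completion latency implies a rate below ~1M sectors/sec for
     * the largest request seen, discard the current observation interval
     * and restart peak-rate sampling. */
    if (delta_us > BFQ_MIN_TT / NSEC_PER_USEC &&
        (bfqd->last_rq_max_size << BFQ_RATE_SHIFT) / delta_us <
            1UL << (BFQ_RATE_SHIFT - 10))
            bfq_update_rate_reset(bfqd, NULL);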
*/ static struct bfq_queue * bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) @@ -3732,11 +4525,8 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, unsigned long flags; bool split = false; - might_sleep_if(gfpflags_allow_blocking(gfp_mask)); - - bfq_check_ioprio_change(bic, bio); - spin_lock_irqsave(q->queue_lock, flags); + bfq_check_ioprio_change(bic, bio); if (!bic) goto queue_fail; @@ -3746,23 +4536,47 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, new_queue: bfqq = bic_to_bfqq(bic, is_sync); if (!bfqq || bfqq == &bfqd->oom_bfqq) { - bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, gfp_mask); + if (bfqq) + bfq_put_queue(bfqq); + bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); + BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); + bic_set_bfqq(bic, bfqq, is_sync); if (split && is_sync) { + bfq_log_bfqq(bfqd, bfqq, + "set_request: was_in_list %d " + "was_in_large_burst %d " + "large burst in progress %d", + bic->was_in_burst_list, + bic->saved_in_large_burst, + bfqd->large_burst); + if ((bic->was_in_burst_list && bfqd->large_burst) || - bic->saved_in_large_burst) + bic->saved_in_large_burst) { + bfq_log_bfqq(bfqd, bfqq, + "set_request: marking in " + "large burst"); bfq_mark_bfqq_in_large_burst(bfqq); - else { + } else { + bfq_log_bfqq(bfqd, bfqq, + "set_request: clearing in " + "large burst"); bfq_clear_bfqq_in_large_burst(bfqq); if (bic->was_in_burst_list) hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); } + bfqq->split_time = jiffies; } } else { /* If the queue was seeky for too long, break it apart. */ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); + + /* Update bic before losing reference to bfqq */ + if (bfq_bfqq_in_large_burst(bfqq)) + bic->saved_in_large_burst = true; + bfqq = bfq_split_bfqq(bic, bfqq); split = true; if (!bfqq) @@ -3771,9 +4585,8 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, } bfqq->allocated[rw]++; - atomic_inc(&bfqq->ref); - bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, - atomic_read(&bfqq->ref)); + bfqq->ref++; + bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, bfqq->ref); rq->elv.priv[0] = bic; rq->elv.priv[1] = bfqq; @@ -3788,7 +4601,6 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { bfqq->bic = bic; if (split) { - bfq_mark_bfqq_just_split(bfqq); /* * If the queue has just been split from a shared * queue, restore the idle window and the possible @@ -3798,6 +4610,9 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, } } + if (unlikely(bfq_bfqq_just_created(bfqq))) + bfq_handle_burst(bfqd, bfqq); + spin_unlock_irqrestore(q->queue_lock, flags); return 0; @@ -3824,9 +4639,10 @@ static void bfq_kick_queue(struct work_struct *work) * Handler of the expiration of the timer running if the in-service queue * is idling inside its time slice. 
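The timer-related hunks above and below replace del_timer()/timer_pending() with hrtimer_try_to_cancel()/hrtimer_active(). For context, arming the idle-slice timer then takes a relative ktime built from the now nanosecond-based bfq_slice_idle; a sketch, assuming bfq_arm_slice_timer() keeps the usual form, with sl being the idling duration in ns:

    hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl),
                  HRTIMER_MODE_REL);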
*/ -static void bfq_idle_slice_timer(unsigned long data) +static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) { - struct bfq_data *bfqd = (struct bfq_data *)data; + struct bfq_data *bfqd = container_of(timer, struct bfq_data, + idle_slice_timer); struct bfq_queue *bfqq; unsigned long flags; enum bfqq_expiration reason; @@ -3844,6 +4660,8 @@ static void bfq_idle_slice_timer(unsigned long data) */ if (bfqq) { bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); + bfq_clear_bfqq_wait_request(bfqq); + if (bfq_bfqq_budget_timeout(bfqq)) /* * Also here the queue can be safely expired @@ -3869,11 +4687,12 @@ static void bfq_idle_slice_timer(unsigned long data) bfq_schedule_dispatch(bfqd); spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); + return HRTIMER_NORESTART; } static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) { - del_timer_sync(&bfqd->idle_slice_timer); + hrtimer_cancel(&bfqd->idle_slice_timer); cancel_work_sync(&bfqd->unplug_work); } @@ -3885,9 +4704,9 @@ static void __bfq_put_async_bfqq(struct bfq_data *bfqd, bfq_log(bfqd, "put_async_bfqq: %p", bfqq); if (bfqq) { - bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group); + bfq_bfqq_move(bfqd, bfqq, root_group); bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", - bfqq, atomic_read(&bfqq->ref)); + bfqq, bfqq->ref); bfq_put_queue(bfqq); *bfqq_ptr = NULL; } @@ -3922,19 +4741,18 @@ static void bfq_exit_queue(struct elevator_queue *e) BUG_ON(bfqd->in_service_queue); list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) - bfq_deactivate_bfqq(bfqd, bfqq, 0); + bfq_deactivate_bfqq(bfqd, bfqq, false, false); spin_unlock_irq(q->queue_lock); bfq_shutdown_timer_wq(bfqd); - synchronize_rcu(); - - BUG_ON(timer_pending(&bfqd->idle_slice_timer)); + BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); #ifdef CONFIG_BFQ_GROUP_IOSCHED blkcg_deactivate_policy(q, &blkcg_policy_bfq); #else + bfq_put_async_queues(bfqd, bfqd->root_group); kfree(bfqd->root_group); #endif @@ -3954,6 +4772,7 @@ static void bfq_init_root_group(struct bfq_group *root_group, root_group->rq_pos_tree = RB_ROOT; for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; + root_group->sched_data.bfq_class_idle_last_service = jiffies; } static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) @@ -3978,11 +4797,14 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) * will not attempt to free it. */ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); - atomic_inc(&bfqd->oom_bfqq.ref); + bfqd->oom_bfqq.ref++; bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE; bfqd->oom_bfqq.entity.new_weight = bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio); + + /* oom_bfqq does not participate to bursts */ + bfq_clear_bfqq_just_created(&bfqd->oom_bfqq); /* * Trigger weight initialization, according to ioprio, at the * oom_bfqq's first activation. 
The oom_bfqq's ioprio and ioprio @@ -4001,13 +4823,10 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) goto out_free; bfq_init_root_group(bfqd->root_group, bfqd); bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqd->active_numerous_groups = 0; -#endif - init_timer(&bfqd->idle_slice_timer); + hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); bfqd->idle_slice_timer.function = bfq_idle_slice_timer; - bfqd->idle_slice_timer.data = (unsigned long)bfqd; bfqd->queue_weights_tree = RB_ROOT; bfqd->group_weights_tree = RB_ROOT; @@ -4027,21 +4846,19 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfqd->bfq_back_max = bfq_back_max; bfqd->bfq_back_penalty = bfq_back_penalty; bfqd->bfq_slice_idle = bfq_slice_idle; - bfqd->bfq_class_idle_last_service = 0; - bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq; - bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async; - bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync; + bfqd->bfq_timeout = bfq_timeout; - bfqd->bfq_coop_thresh = 2; - bfqd->bfq_failed_cooperations = 7000; bfqd->bfq_requests_within_timer = 120; - bfqd->bfq_large_burst_thresh = 11; - bfqd->bfq_burst_interval = msecs_to_jiffies(500); + bfqd->bfq_large_burst_thresh = 8; + bfqd->bfq_burst_interval = msecs_to_jiffies(180); bfqd->low_latency = true; - bfqd->bfq_wr_coeff = 20; + /* + * Trade-off between responsiveness and fairness. + */ + bfqd->bfq_wr_coeff = 30; bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300); bfqd->bfq_wr_max_time = 0; bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000); @@ -4053,16 +4870,15 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) * video. */ bfqd->wr_busy_queues = 0; - bfqd->busy_in_flight_queues = 0; - bfqd->const_seeky_busy_in_flight_queues = 0; /* - * Begin by assuming, optimistically, that the device peak rate is - * equal to the highest reference rate. + * Begin by assuming, optimistically, that the device is a + * high-speed one, and that its peak rate is equal to 2/3 of + * the highest reference rate. 
*/ bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] * T_fast[blk_queue_nonrot(bfqd->queue)]; - bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)]; + bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3; bfqd->device_speed = BFQ_BFQD_FAST; return 0; @@ -4088,7 +4904,7 @@ static int __init bfq_slab_setup(void) static ssize_t bfq_var_show(unsigned int var, char *page) { - return sprintf(page, "%d\n", var); + return sprintf(page, "%u\n", var); } static ssize_t bfq_var_store(unsigned long *var, const char *page, @@ -4159,21 +4975,21 @@ static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) static ssize_t __FUNC(struct elevator_queue *e, char *page) \ { \ struct bfq_data *bfqd = e->elevator_data; \ - unsigned int __data = __VAR; \ - if (__CONV) \ + u64 __data = __VAR; \ + if (__CONV == 1) \ __data = jiffies_to_msecs(__data); \ + else if (__CONV == 2) \ + __data = div_u64(__data, NSEC_PER_MSEC); \ return bfq_var_show(__data, (page)); \ } -SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1); -SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1); +SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2); +SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2); SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); -SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1); +SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2); SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); -SHOW_FUNCTION(bfq_max_budget_async_rq_show, - bfqd->bfq_max_budget_async_rq, 0); -SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1); -SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1); +SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1); +SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0); SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0); SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1); @@ -4183,6 +4999,17 @@ SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async, SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0); #undef SHOW_FUNCTION +#define USEC_SHOW_FUNCTION(__FUNC, __VAR) \ +static ssize_t __FUNC(struct elevator_queue *e, char *page) \ +{ \ + struct bfq_data *bfqd = e->elevator_data; \ + u64 __data = __VAR; \ + __data = div_u64(__data, NSEC_PER_USEC); \ + return bfq_var_show(__data, (page)); \ +} +USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle); +#undef USEC_SHOW_FUNCTION + #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ static ssize_t \ __FUNC(struct elevator_queue *e, const char *page, size_t count) \ @@ -4194,24 +5021,22 @@ __FUNC(struct elevator_queue *e, const char *page, size_t count) \ __data = (MIN); \ else if (__data > (MAX)) \ __data = (MAX); \ - if (__CONV) \ + if (__CONV == 1) \ *(__PTR) = msecs_to_jiffies(__data); \ + else if (__CONV == 2) \ + *(__PTR) = (u64)__data * NSEC_PER_MSEC; \ else \ *(__PTR) = __data; \ return ret; \ } STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, - INT_MAX, 1); + INT_MAX, 2); STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, - INT_MAX, 1); + INT_MAX, 2); STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, 
INT_MAX, 0); -STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1); -STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq, - 1, INT_MAX, 0); -STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0, - INT_MAX, 1); +STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2); STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0); STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1); STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX, @@ -4224,6 +5049,23 @@ STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0, INT_MAX, 0); #undef STORE_FUNCTION +#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ +static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\ +{ \ + struct bfq_data *bfqd = e->elevator_data; \ + unsigned long uninitialized_var(__data); \ + int ret = bfq_var_store(&__data, (page), count); \ + if (__data < (MIN)) \ + __data = (MIN); \ + else if (__data > (MAX)) \ + __data = (MAX); \ + *(__PTR) = (u64)__data * NSEC_PER_USEC; \ + return ret; \ +} +USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0, + UINT_MAX); +#undef USEC_STORE_FUNCTION + /* do nothing for the moment */ static ssize_t bfq_weights_store(struct elevator_queue *e, const char *page, size_t count) @@ -4231,16 +5073,6 @@ static ssize_t bfq_weights_store(struct elevator_queue *e, return count; } -static unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd) -{ - u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); - - if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES) - return bfq_calc_max_budget(bfqd->peak_rate, timeout); - else - return bfq_default_max_budget; -} - static ssize_t bfq_max_budget_store(struct elevator_queue *e, const char *page, size_t count) { @@ -4249,7 +5081,7 @@ static ssize_t bfq_max_budget_store(struct elevator_queue *e, int ret = bfq_var_store(&__data, (page), count); if (__data == 0) - bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); + bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); else { if (__data > INT_MAX) __data = INT_MAX; @@ -4261,6 +5093,10 @@ static ssize_t bfq_max_budget_store(struct elevator_queue *e, return ret; } +/* + * Leaving this name to preserve name compatibility with cfq + * parameters, but this timeout is used for both sync and async. 
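With the new __CONV codes above (1 = jiffies <-> ms, 2 = ns <-> ms) and the *_us show/store variants, fifo_expire_* and slice_idle are kept in nanoseconds internally and exposed in milliseconds (slice_idle also in microseconds). A sketch of the resulting round trip, using the 8 ms slice_idle value that the strict_guarantees store below enforces as a minimum:

    u64 slice_idle_ns = 8 * NSEC_PER_MSEC;                  /* stored internally     */
    u64 shown_ms = div_u64(slice_idle_ns, NSEC_PER_MSEC);   /* slice_idle    -> 8    */
    u64 shown_us = div_u64(slice_idle_ns, NSEC_PER_USEC);   /* slice_idle_us -> 8000 */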
+ */ static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, const char *page, size_t count) { @@ -4273,9 +5109,27 @@ static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, else if (__data > INT_MAX) __data = INT_MAX; - bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data); + bfqd->bfq_timeout = msecs_to_jiffies(__data); if (bfqd->bfq_user_max_budget == 0) - bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); + bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); + + return ret; +} + +static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e, + const char *page, size_t count) +{ + struct bfq_data *bfqd = e->elevator_data; + unsigned long uninitialized_var(__data); + int ret = bfq_var_store(&__data, (page), count); + + if (__data > 1) + __data = 1; + if (!bfqd->strict_guarantees && __data == 1 + && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC) + bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC; + + bfqd->strict_guarantees = __data; return ret; } @@ -4305,10 +5159,10 @@ static struct elv_fs_entry bfq_attrs[] = { BFQ_ATTR(back_seek_max), BFQ_ATTR(back_seek_penalty), BFQ_ATTR(slice_idle), + BFQ_ATTR(slice_idle_us), BFQ_ATTR(max_budget), - BFQ_ATTR(max_budget_async_rq), BFQ_ATTR(timeout_sync), - BFQ_ATTR(timeout_async), + BFQ_ATTR(strict_guarantees), BFQ_ATTR(low_latency), BFQ_ATTR(wr_coeff), BFQ_ATTR(wr_max_time), @@ -4328,7 +5182,8 @@ static struct elevator_type iosched_bfq = { #ifdef CONFIG_BFQ_GROUP_IOSCHED .elevator_bio_merged_fn = bfq_bio_merged, #endif - .elevator_allow_merge_fn = bfq_allow_merge, + .elevator_allow_bio_merge_fn = bfq_allow_bio_merge, + .elevator_allow_rq_merge_fn = bfq_allow_rq_merge, .elevator_dispatch_fn = bfq_dispatch_requests, .elevator_add_req_fn = bfq_insert_request, .elevator_activate_req_fn = bfq_activate_request, @@ -4351,18 +5206,28 @@ static struct elevator_type iosched_bfq = { .elevator_owner = THIS_MODULE, }; +#ifdef CONFIG_BFQ_GROUP_IOSCHED +static struct blkcg_policy blkcg_policy_bfq = { + .dfl_cftypes = bfq_blkg_files, + .legacy_cftypes = bfq_blkcg_legacy_files, + + .cpd_alloc_fn = bfq_cpd_alloc, + .cpd_init_fn = bfq_cpd_init, + .cpd_bind_fn = bfq_cpd_init, + .cpd_free_fn = bfq_cpd_free, + + .pd_alloc_fn = bfq_pd_alloc, + .pd_init_fn = bfq_pd_init, + .pd_offline_fn = bfq_pd_offline, + .pd_free_fn = bfq_pd_free, + .pd_reset_stats_fn = bfq_pd_reset_stats, +}; +#endif + static int __init bfq_init(void) { int ret; - - /* - * Can be 0 on HZ < 1000 setups. - */ - if (bfq_slice_idle == 0) - bfq_slice_idle = 1; - - if (bfq_timeout_async == 0) - bfq_timeout_async = 1; + char msg[60] = "BFQ I/O-scheduler: v8r8"; #ifdef CONFIG_BFQ_GROUP_IOSCHED ret = blkcg_policy_register(&blkcg_policy_bfq); @@ -4375,27 +5240,46 @@ static int __init bfq_init(void) goto err_pol_unreg; /* - * Times to load large popular applications for the typical systems - * installed on the reference devices (see the comments before the - * definitions of the two arrays). + * Times to load large popular applications for the typical + * systems installed on the reference devices (see the + * comments before the definitions of the next two + * arrays). Actually, we use slightly slower values, as the + * estimated peak rate tends to be smaller than the actual + * peak rate. The reason for this last fact is that estimates + * are computed over much shorter time intervals than the long + * intervals typically used for benchmarking. Why? First, to + * adapt more quickly to variations. 
Second, because an I/O + * scheduler cannot rely on a peak-rate-evaluation workload to + * be run for a long time. */ - T_slow[0] = msecs_to_jiffies(2600); - T_slow[1] = msecs_to_jiffies(1000); - T_fast[0] = msecs_to_jiffies(5500); - T_fast[1] = msecs_to_jiffies(2000); + T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */ + T_slow[1] = msecs_to_jiffies(6000); /* actually 6.5 sec */ + T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */ + T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */ /* - * Thresholds that determine the switch between speed classes (see - * the comments before the definition of the array). + * Thresholds that determine the switch between speed classes + * (see the comments before the definition of the array + * device_speed_thresh). These thresholds are biased towards + * transitions to the fast class. This is safer than the + * opposite bias. In fact, a wrong transition to the slow + * class results in short weight-raising periods, because the + * speed of the device then tends to be higher that the + * reference peak rate. On the opposite end, a wrong + * transition to the fast class tends to increase + * weight-raising periods, because of the opposite reason. */ - device_speed_thresh[0] = (R_fast[0] + R_slow[0]) / 2; - device_speed_thresh[1] = (R_fast[1] + R_slow[1]) / 2; + device_speed_thresh[0] = (4 * R_slow[0]) / 3; + device_speed_thresh[1] = (4 * R_slow[1]) / 3; ret = elv_register(&iosched_bfq); if (ret) goto err_pol_unreg; - pr_info("BFQ I/O-scheduler: v7r11"); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + strcat(msg, " (with cgroups support)"); +#endif + pr_info("%s", msg); return 0; diff --git a/block/bfq-sched.c b/block/bfq-sched.c index a5ed694..2e9dc59 100644 --- a/block/bfq-sched.c +++ b/block/bfq-sched.c @@ -7,28 +7,166 @@ * Copyright (C) 2008 Fabio Checconi * Paolo Valente * - * Copyright (C) 2010 Paolo Valente + * Copyright (C) 2015 Paolo Valente + * + * Copyright (C) 2016 Paolo Valente + */ + +static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); + +/** + * bfq_gt - compare two timestamps. + * @a: first ts. + * @b: second ts. + * + * Return @a > @b, dealing with wrapping correctly. + */ +static int bfq_gt(u64 a, u64 b) +{ + return (s64)(a - b) > 0; +} + +static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree) +{ + struct rb_node *node = tree->rb_node; + + return rb_entry(node, struct bfq_entity, rb_node); +} + +static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd); + +static bool bfq_update_parent_budget(struct bfq_entity *next_in_service); + +/** + * bfq_update_next_in_service - update sd->next_in_service + * @sd: sched_data for which to perform the update. + * @new_entity: if not NULL, pointer to the entity whose activation, + * requeueing or repositionig triggered the invocation of + * this function. + * + * This function is called to update sd->next_in_service, which, in + * its turn, may change as a consequence of the insertion or + * extraction of an entity into/from one of the active trees of + * sd. These insertions/extractions occur as a consequence of + * activations/deactivations of entities, with some activations being + * 'true' activations, and other activations being requeueings (i.e., + * implementing the second, requeueing phase of the mechanism used to + * reposition an entity in its active tree; see comments on + * __bfq_activate_entity and __bfq_requeue_entity for details). In + * both the last two activation sub-cases, new_entity points to the + * just activated or requeued entity. 
+ * + * Returns true if sd->next_in_service changes in such a way that + * entity->parent may become the next_in_service for its parent + * entity. */ +static bool bfq_update_next_in_service(struct bfq_sched_data *sd, + struct bfq_entity *new_entity) +{ + struct bfq_entity *next_in_service = sd->next_in_service; + struct bfq_queue *bfqq; + bool parent_sched_may_change = false; + + /* + * If this update is triggered by the activation, requeueing + * or repositiong of an entity that does not coincide with + * sd->next_in_service, then a full lookup in the active tree + * can be avoided. In fact, it is enough to check whether the + * just-modified entity has a higher priority than + * sd->next_in_service, or, even if it has the same priority + * as sd->next_in_service, is eligible and has a lower virtual + * finish time than sd->next_in_service. If this compound + * condition holds, then the new entity becomes the new + * next_in_service. Otherwise no change is needed. + */ + if (new_entity && new_entity != sd->next_in_service) { + /* + * Flag used to decide whether to replace + * sd->next_in_service with new_entity. Tentatively + * set to true, and left as true if + * sd->next_in_service is NULL. + */ + bool replace_next = true; + + /* + * If there is already a next_in_service candidate + * entity, then compare class priorities or timestamps + * to decide whether to replace sd->service_tree with + * new_entity. + */ + if (next_in_service) { + unsigned int new_entity_class_idx = + bfq_class_idx(new_entity); + struct bfq_service_tree *st = + sd->service_tree + new_entity_class_idx; + + /* + * For efficiency, evaluate the most likely + * sub-condition first. + */ + replace_next = + (new_entity_class_idx == + bfq_class_idx(next_in_service) + && + !bfq_gt(new_entity->start, st->vtime) + && + bfq_gt(next_in_service->finish, + new_entity->finish)) + || + new_entity_class_idx < + bfq_class_idx(next_in_service); + } + + if (replace_next) + next_in_service = new_entity; + } else /* invoked because of a deactivation: lookup needed */ + next_in_service = bfq_lookup_next_entity(sd); + + if (next_in_service) { + parent_sched_may_change = !sd->next_in_service || + bfq_update_parent_budget(next_in_service); + } + + sd->next_in_service = next_in_service; + + if (!next_in_service) + return parent_sched_may_change; + bfqq = bfq_entity_to_bfqq(next_in_service); + if (bfqq) + bfq_log_bfqq(bfqq->bfqd, bfqq, + "update_next_in_service: chosen this queue"); #ifdef CONFIG_BFQ_GROUP_IOSCHED -#define for_each_entity(entity) \ + else { + struct bfq_group *bfqg = + container_of(next_in_service, + struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, + "update_next_in_service: chosen this entity"); + } +#endif + return parent_sched_may_change; +} + +#ifdef CONFIG_BFQ_GROUP_IOSCHED +/* both next loops stop at one of the child entities of the root group */ +#define for_each_entity(entity) \ for (; entity ; entity = entity->parent) #define for_each_entity_safe(entity, parent) \ for (; entity && ({ parent = entity->parent; 1; }); entity = parent) - -static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, - int extract, - struct bfq_data *bfqd); - -static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); - -static void bfq_update_budget(struct bfq_entity *next_in_service) +/* + * Returns true if this budget changes may let next_in_service->parent + * become the next_in_service entity for its parent entity. 
+ */ +static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) { struct bfq_entity *bfqg_entity; struct bfq_group *bfqg; struct bfq_sched_data *group_sd; + bool ret = false; BUG_ON(!next_in_service); @@ -41,60 +179,68 @@ static void bfq_update_budget(struct bfq_entity *next_in_service) * as it must never become an in-service entity. */ bfqg_entity = bfqg->my_entity; - if (bfqg_entity) + if (bfqg_entity) { + if (bfqg_entity->budget > next_in_service->budget) + ret = true; bfqg_entity->budget = next_in_service->budget; + } + + return ret; } -static int bfq_update_next_in_service(struct bfq_sched_data *sd) +/* + * This function tells whether entity stops being a candidate for next + * service, according to the following logic. + * + * This function is invoked for an entity that is about to be set in + * service. If such an entity is a queue, then the entity is no longer + * a candidate for next service (i.e, a candidate entity to serve + * after the in-service entity is expired). The function then returns + * true. + * + * In contrast, the entity could stil be a candidate for next service + * if it is not a queue, and has more than one child. In fact, even if + * one of its children is about to be set in service, other children + * may still be the next to serve. As a consequence, a non-queue + * entity is not a candidate for next-service only if it has only one + * child. And only if this condition holds, then the function returns + * true for a non-queue entity. + */ +static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) { - struct bfq_entity *next_in_service; + struct bfq_group *bfqg; - if (sd->in_service_entity) - /* will update/requeue at the end of service */ - return 0; + if (bfq_entity_to_bfqq(entity)) + return true; - /* - * NOTE: this can be improved in many ways, such as returning - * 1 (and thus propagating upwards the update) only when the - * budget changes, or caching the bfqq that will be scheduled - * next from this subtree. By now we worry more about - * correctness than about performance... - */ - next_in_service = bfq_lookup_next_entity(sd, 0, NULL); - sd->next_in_service = next_in_service; + bfqg = container_of(entity, struct bfq_group, entity); - if (next_in_service) - bfq_update_budget(next_in_service); + BUG_ON(bfqg == ((struct bfq_data *)(bfqg->bfqd))->root_group); + BUG_ON(bfqg->active_entities == 0); + if (bfqg->active_entities == 1) + return true; - return 1; + return false; } -static void bfq_check_next_in_service(struct bfq_sched_data *sd, - struct bfq_entity *entity) -{ - BUG_ON(sd->next_in_service != entity); -} -#else +#else /* CONFIG_BFQ_GROUP_IOSCHED */ #define for_each_entity(entity) \ for (; entity ; entity = NULL) #define for_each_entity_safe(entity, parent) \ for (parent = NULL; entity ; entity = parent) -static int bfq_update_next_in_service(struct bfq_sched_data *sd) +static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) { - return 0; + return false; } -static void bfq_check_next_in_service(struct bfq_sched_data *sd, - struct bfq_entity *entity) +static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) { + return true; } -static void bfq_update_budget(struct bfq_entity *next_in_service) -{ -} -#endif +#endif /* CONFIG_BFQ_GROUP_IOSCHED */ /* * Shift for timestamp calculations. This actually limits the maximum @@ -105,18 +251,6 @@ static void bfq_update_budget(struct bfq_entity *next_in_service) */ #define WFQ_SERVICE_SHIFT 22 -/** - * bfq_gt - compare two timestamps. - * @a: first ts. 
- * @b: second ts. - * - * Return @a > @b, dealing with wrapping correctly. - */ -static int bfq_gt(u64 a, u64 b) -{ - return (s64)(a - b) > 0; -} - static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) { struct bfq_queue *bfqq = NULL; @@ -151,20 +285,36 @@ static u64 bfq_delta(unsigned long service, unsigned long weight) static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service) { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + unsigned long long start, finish, delta; BUG_ON(entity->weight == 0); entity->finish = entity->start + bfq_delta(service, entity->weight); + start = ((entity->start>>10)*1000)>>12; + finish = ((entity->finish>>10)*1000)>>12; + delta = ((bfq_delta(service, entity->weight)>>10)*1000)>>12; + if (bfqq) { bfq_log_bfqq(bfqq->bfqd, bfqq, "calc_finish: serv %lu, w %d", service, entity->weight); bfq_log_bfqq(bfqq->bfqd, bfqq, "calc_finish: start %llu, finish %llu, delta %llu", - entity->start, entity->finish, - bfq_delta(service, entity->weight)); + start, finish, delta); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + } else { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, + "calc_finish group: serv %lu, w %d", + service, entity->weight); + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, + "calc_finish group: start %llu, finish %llu, delta %llu", + start, finish, delta); +#endif } } @@ -293,10 +443,26 @@ static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node) static void bfq_update_active_node(struct rb_node *node) { struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node); + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); entity->min_start = entity->start; bfq_update_min(entity, node->rb_right); bfq_update_min(entity, node->rb_left); + + if (bfqq) { + bfq_log_bfqq(bfqq->bfqd, bfqq, + "update_active_node: new min_start %llu", + ((entity->min_start>>10)*1000)>>12); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + } else { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, + "update_active_node: new min_start %llu", + ((entity->min_start>>10)*1000)>>12); +#endif + } } /** @@ -386,8 +552,6 @@ static void bfq_active_insert(struct bfq_service_tree *st, BUG_ON(!bfqg); BUG_ON(!bfqd); bfqg->active_entities++; - if (bfqg->active_entities == 2) - bfqd->active_numerous_groups++; } #endif } @@ -399,7 +563,7 @@ static void bfq_active_insert(struct bfq_service_tree *st, static unsigned short bfq_ioprio_to_weight(int ioprio) { BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); - return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - ioprio; + return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF; } /** @@ -422,9 +586,9 @@ static void bfq_get_entity(struct bfq_entity *entity) struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); if (bfqq) { - atomic_inc(&bfqq->ref); + bfqq->ref++; bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", - bfqq, atomic_read(&bfqq->ref)); + bfqq, bfqq->ref); } } @@ -499,10 +663,6 @@ static void bfq_active_extract(struct bfq_service_tree *st, BUG_ON(!bfqd); BUG_ON(!bfqg->active_entities); bfqg->active_entities--; - if (bfqg->active_entities == 1) { - BUG_ON(!bfqd->active_numerous_groups); - bfqd->active_numerous_groups--; - } } #endif } @@ -547,12 +707,12 @@ static void bfq_forget_entity(struct bfq_service_tree *st, BUG_ON(!entity->on_st); - entity->on_st = 0; + entity->on_st = false; st->wsum -= entity->weight; if (bfqq) { sd = entity->sched_data; 
bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d", - bfqq, atomic_read(&bfqq->ref)); + bfqq, bfqq->ref); bfq_put_queue(bfqq); } } @@ -602,7 +762,7 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, if (entity->prio_changed) { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - unsigned short prev_weight, new_weight; + unsigned int prev_weight, new_weight; struct bfq_data *bfqd = NULL; struct rb_root *root; #ifdef CONFIG_BFQ_GROUP_IOSCHED @@ -630,7 +790,10 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, entity->new_weight > BFQ_MAX_WEIGHT) { pr_crit("update_weight_prio: new_weight %d\n", entity->new_weight); - BUG(); + if (entity->new_weight < BFQ_MIN_WEIGHT) + entity->new_weight = BFQ_MIN_WEIGHT; + else + entity->new_weight = BFQ_MAX_WEIGHT; } entity->orig_weight = entity->new_weight; if (bfqq) @@ -661,6 +824,13 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, * associated with its new weight. */ if (prev_weight != new_weight) { + if (bfqq) + bfq_log_bfqq(bfqq->bfqd, bfqq, + "weight changed %d %d(%d %d)", + prev_weight, new_weight, + entity->orig_weight, + bfqq->wr_coeff); + root = bfqq ? &bfqd->queue_weights_tree : &bfqd->group_weights_tree; bfq_weights_tree_remove(bfqd, entity, root); @@ -707,7 +877,7 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) st = bfq_entity_service_tree(entity); entity->service += served; - BUG_ON(entity->service > entity->budget); + BUG_ON(st->wsum == 0); st->vtime += bfq_delta(served, st->wsum); @@ -716,234 +886,574 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) #ifdef CONFIG_BFQ_GROUP_IOSCHED bfqg_stats_set_start_empty_time(bfqq_group(bfqq)); #endif - bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served); + st = bfq_entity_service_tree(&bfqq->entity); + bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs, vtime %llu on %p", + served, ((st->vtime>>10)*1000)>>12, st); } /** - * bfq_bfqq_charge_full_budget - set the service to the entity budget. + * bfq_bfqq_charge_time - charge an amount of service equivalent to the length + * of the time interval during which bfqq has been in + * service. + * @bfqd: the device * @bfqq: the queue that needs a service update. + * @time_ms: the amount of time during which the queue has received service + * + * If a queue does not consume its budget fast enough, then providing + * the queue with service fairness may impair throughput, more or less + * severely. For this reason, queues that consume their budget slowly + * are provided with time fairness instead of service fairness. This + * goal is achieved through the BFQ scheduling engine, even if such an + * engine works in the service, and not in the time domain. The trick + * is charging these queues with an inflated amount of service, equal + * to the amount of service that they would have received during their + * service slot if they had been fast, i.e., if their requests had + * been dispatched at a rate equal to the estimated peak rate. * - * When it's not possible to be fair in the service domain, because - * a queue is not consuming its budget fast enough (the meaning of - * fast depends on the timeout parameter), we charge it a full - * budget. In this way we should obtain a sort of time-domain - * fairness among all the seeky/slow queues. + * It is worth noting that time fairness can cause important + * distortions in terms of bandwidth distribution, on devices with + * internal queueing. 
The reason is that I/O requests dispatched + * during the service slot of a queue may be served after that service + * slot is finished, and may have a total processing time loosely + * correlated with the duration of the service slot. This is + * especially true for short service slots. */ -static void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq) +static void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq, + unsigned long time_ms) { struct bfq_entity *entity = &bfqq->entity; + int tot_serv_to_charge = entity->service; + unsigned int timeout_ms = jiffies_to_msecs(bfq_timeout); + + if (time_ms > 0 && time_ms < timeout_ms) + tot_serv_to_charge = + (bfqd->bfq_max_budget * time_ms) / timeout_ms; - bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget"); + if (tot_serv_to_charge < entity->service) + tot_serv_to_charge = entity->service; - bfq_bfqq_served(bfqq, entity->budget - entity->service); + bfq_log_bfqq(bfqq->bfqd, bfqq, + "charge_time: %lu/%u ms, %d/%d/%d sectors", + time_ms, timeout_ms, entity->service, + tot_serv_to_charge, entity->budget); + + /* Increase budget to avoid inconsistencies */ + if (tot_serv_to_charge > entity->budget) + entity->budget = tot_serv_to_charge; + + bfq_bfqq_served(bfqq, + max_t(int, 0, tot_serv_to_charge - entity->service)); +} + +static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, + struct bfq_service_tree *st, + bool backshifted) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct bfq_sched_data *sd = entity->sched_data; + + st = __bfq_entity_update_weight_prio(st, entity); + bfq_calc_finish(entity, entity->budget); + + /* + * If some queues enjoy backshifting for a while, then their + * (virtual) finish timestamps may happen to become lower and + * lower than the system virtual time. In particular, if + * these queues often happen to be idle for short time + * periods, and during such time periods other queues with + * higher timestamps happen to be busy, then the backshifted + * timestamps of the former queues can become much lower than + * the system virtual time. In fact, to serve the queues with + * higher timestamps while the ones with lower timestamps are + * idle, the system virtual time may be pushed-up to much + * higher values than the finish timestamps of the idle + * queues. As a consequence, the finish timestamps of all new + * or newly activated queues may end up being much larger than + * those of lucky queues with backshifted timestamps. The + * latter queues may then monopolize the device for a lot of + * time. This would simply break service guarantees. + * + * To reduce this problem, push up a little bit the + * backshifted timestamps of the queue associated with this + * entity (only a queue can happen to have the backshifted + * flag set): just enough to let the finish timestamp of the + * queue be equal to the current value of the system virtual + * time. This may introduce a little unfairness among queues + * with backshifted timestamps, but it does not break + * worst-case fairness guarantees. + * + * As a special case, if bfqq is weight-raised, push up + * timestamps much less, to keep very low the probability that + * this push up causes the backshifted finish timestamps of + * weight-raised queues to become higher than the backshifted + * finish timestamps of non weight-raised queues. 
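+ *
+ * A quick numerical sketch (values invented for illustration): with
+ * st->vtime = 1000 and entity->finish = 400, a non-weight-raised
+ * queue gets delta = 600, i.e., its finish timestamp is pushed up
+ * exactly to the virtual time. A queue weight-raised with, say,
+ * wr_coeff = 30 gets only delta = 600/30 = 20, which leaves its
+ * timestamps well below the virtual time and thus preserves most of
+ * its privilege.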
+ */ + if (backshifted && bfq_gt(st->vtime, entity->finish)) { + unsigned long delta = st->vtime - entity->finish; + + if (bfqq) + delta /= bfqq->wr_coeff; + + entity->start += delta; + entity->finish += delta; + + if (bfqq) { + bfq_log_bfqq(bfqq->bfqd, bfqq, + "__activate_entity: new queue finish %llu", + ((entity->finish>>10)*1000)>>12); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + } else { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, + "__activate_entity: new group finish %llu", + ((entity->finish>>10)*1000)>>12); +#endif + } + } + + bfq_active_insert(st, entity); + + if (bfqq) { + bfq_log_bfqq(bfqq->bfqd, bfqq, + "__activate_entity: queue %seligible in st %p", + entity->start <= st->vtime ? "" : "non ", st); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + } else { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, + "__activate_entity: group %seligible in st %p", + entity->start <= st->vtime ? "" : "non ", st); +#endif + } + BUG_ON(RB_EMPTY_ROOT(&st->active)); + BUG_ON(&st->active != &sd->service_tree->active && + &st->active != &(sd->service_tree+1)->active && + &st->active != &(sd->service_tree+2)->active); } /** - * __bfq_activate_entity - activate an entity. + * __bfq_activate_entity - handle activation of entity. * @entity: the entity being activated. + * @non_blocking_wait_rq: true if entity was waiting for a request + * + * Called for a 'true' activation, i.e., if entity is not active and + * one of its children receives a new request. * - * Called whenever an entity is activated, i.e., it is not active and one - * of its children receives a new request, or has to be reactivated due to - * budget exhaustion. It uses the current budget of the entity (and the - * service received if @entity is active) of the queue to calculate its - * timestamps. + * Basically, this function updates the timestamps of entity and + * inserts entity into its active tree, ater possible extracting it + * from its idle tree. */ -static void __bfq_activate_entity(struct bfq_entity *entity) +static void __bfq_activate_entity(struct bfq_entity *entity, + bool non_blocking_wait_rq) { struct bfq_sched_data *sd = entity->sched_data; struct bfq_service_tree *st = bfq_entity_service_tree(entity); + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + bool backshifted = false; + unsigned long long min_vstart; - if (entity == sd->in_service_entity) { - BUG_ON(entity->tree); - /* - * If we are requeueing the current entity we have - * to take care of not charging to it service it has - * not received. - */ - bfq_calc_finish(entity, entity->service); - entity->start = entity->finish; - sd->in_service_entity = NULL; - } else if (entity->tree == &st->active) { - /* - * Requeueing an entity due to a change of some - * next_in_service entity below it. We reuse the - * old start time. - */ - bfq_active_extract(st, entity); - } else if (entity->tree == &st->idle) { + BUG_ON(!sd); + BUG_ON(!st); + + /* See comments on bfq_fqq_update_budg_for_activation */ + if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) { + backshifted = true; + min_vstart = entity->finish; + } else + min_vstart = st->vtime; + + if (entity->tree == &st->idle) { /* * Must be on the idle tree, bfq_idle_extract() will * check for that. */ bfq_idle_extract(st, entity); - entity->start = bfq_gt(st->vtime, entity->finish) ? 
- st->vtime : entity->finish; + entity->start = bfq_gt(min_vstart, entity->finish) ? + min_vstart : entity->finish; } else { /* * The finish time of the entity may be invalid, and * it is in the past for sure, otherwise the queue * would have been on the idle tree. */ - entity->start = st->vtime; + entity->start = min_vstart; st->wsum += entity->weight; bfq_get_entity(entity); - BUG_ON(entity->on_st); - entity->on_st = 1; + BUG_ON(entity->on_st && bfqq); + +#ifdef CONFIG_BFQ_GROUP_IOSCHED + if (entity->on_st && !bfqq) { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, + entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, + bfqg, + "activate bug, class %d in_service %p", + bfq_class_idx(entity), sd->in_service_entity); + } +#endif + BUG_ON(entity->on_st && !bfqq); + entity->on_st = true; } - st = __bfq_entity_update_weight_prio(st, entity); - bfq_calc_finish(entity, entity->budget); - bfq_active_insert(st, entity); + bfq_update_fin_time_enqueue(entity, st, backshifted); } /** - * bfq_activate_entity - activate an entity and its ancestors if necessary. - * @entity: the entity to activate. + * __bfq_requeue_entity - handle requeueing or repositioning of an entity. + * @entity: the entity being requeued or repositioned. + * + * Requeueing is needed if this entity stops being served, which + * happens if a leaf descendant entity has expired. On the other hand, + * repositioning is needed if the next_inservice_entity for the child + * entity has changed. See the comments inside the function for + * details. * - * Activate @entity and all the entities on the path from it to the root. + * Basically, this function: 1) removes entity from its active tree if + * present there, 2) updates the timestamps of entity and 3) inserts + * entity back into its active tree (in the new, right position for + * the new values of the timestamps). */ -static void bfq_activate_entity(struct bfq_entity *entity) +static void __bfq_requeue_entity(struct bfq_entity *entity) +{ + struct bfq_sched_data *sd = entity->sched_data; + struct bfq_service_tree *st = bfq_entity_service_tree(entity); + + BUG_ON(!sd); + BUG_ON(!st); + + BUG_ON(entity != sd->in_service_entity && + entity->tree != &st->active); + + if (entity == sd->in_service_entity) { + /* + * We are requeueing the current in-service entity, + * which may have to be done for one of the following + * reasons: + * - entity represents the in-service queue, and the + * in-service queue is being requeued after an + * expiration; + * - entity represents a group, and its budget has + * changed because one of its child entities has + * just been either activated or requeued for some + * reason; the timestamps of the entity need then to + * be updated, and the entity needs to be enqueued + * or repositioned accordingly. + * + * In particular, before requeueing, the start time of + * the entity must be moved forward to account for the + * service that the entity has received while in + * service. This is done by the next instructions. The + * finish time will then be updated according to this + * new value of the start time, and to the budget of + * the entity. + */ + bfq_calc_finish(entity, entity->service); + entity->start = entity->finish; + BUG_ON(entity->tree && entity->tree != &st->active); + /* + * In addition, if the entity had more than one child + * when set in service, then was not extracted from + * the active tree. 
This implies that the position of + * the entity in the active tree may need to be + * changed now, because we have just updated the start + * time of the entity, and we will update its finish + * time in a moment (the requeueing is then, more + * precisely, a repositioning in this case). To + * implement this repositioning, we: 1) dequeue the + * entity here, 2) update the finish time and + * requeue the entity according to the new + * timestamps below. + */ + if (entity->tree) + bfq_active_extract(st, entity); + } else { /* The entity is already active, and not in service */ + /* + * In this case, this function gets called only if the + * next_in_service entity below this entity has + * changed, and this change has caused the budget of + * this entity to change, which, finally implies that + * the finish time of this entity must be + * updated. Such an update may cause the scheduling, + * i.e., the position in the active tree, of this + * entity to change. We handle this change by: 1) + * dequeueing the entity here, 2) updating the finish + * time and requeueing the entity according to the new + * timestamps below. This is the same approach as the + * non-extracted-entity sub-case above. + */ + bfq_active_extract(st, entity); + } + + bfq_update_fin_time_enqueue(entity, st, false); +} + +static void __bfq_activate_requeue_entity(struct bfq_entity *entity, + struct bfq_sched_data *sd, + bool non_blocking_wait_rq) +{ + struct bfq_service_tree *st = bfq_entity_service_tree(entity); + + if (sd->in_service_entity == entity || entity->tree == &st->active) + /* + * in service or already queued on the active tree, + * requeue or reposition + */ + __bfq_requeue_entity(entity); + else + /* + * Not in service and not queued on its active tree: + * the activity is idle and this is a true activation. + */ + __bfq_activate_entity(entity, non_blocking_wait_rq); +} + + +/** + * bfq_activate_entity - activate or requeue an entity representing a bfq_queue, + * and activate, requeue or reposition all ancestors + * for which such an update becomes necessary. + * @entity: the entity to activate. + * @non_blocking_wait_rq: true if this entity was waiting for a request + * @requeue: true if this is a requeue, which implies that bfqq is + * being expired; thus ALL its ancestors stop being served and must + * therefore be requeued + */ +static void bfq_activate_requeue_entity(struct bfq_entity *entity, + bool non_blocking_wait_rq, + bool requeue) { struct bfq_sched_data *sd; for_each_entity(entity) { - __bfq_activate_entity(entity); - + BUG_ON(!entity); sd = entity->sched_data; - if (!bfq_update_next_in_service(sd)) - /* - * No need to propagate the activation to the - * upper entities, as they will be updated when - * the in-service entity is rescheduled. - */ + __bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq); + + BUG_ON(RB_EMPTY_ROOT(&sd->service_tree->active) && + RB_EMPTY_ROOT(&(sd->service_tree+1)->active) && + RB_EMPTY_ROOT(&(sd->service_tree+2)->active)); + + if (!bfq_update_next_in_service(sd, entity) && !requeue) { + BUG_ON(!sd->next_in_service); break; + } + BUG_ON(!sd->next_in_service); } } /** * __bfq_deactivate_entity - deactivate an entity from its service tree. * @entity: the entity to deactivate. - * @requeue: if false, the entity will not be put into the idle tree. - * - * Deactivate an entity, independently from its previous state. 
If the - * entity was not on a service tree just return, otherwise if it is on - * any scheduler tree, extract it from that tree, and if necessary - * and if the caller did not specify @requeue, put it on the idle tree. + * @ins_into_idle_tree: if false, the entity will not be put into the + * idle tree. * - * Return %1 if the caller should update the entity hierarchy, i.e., - * if the entity was in service or if it was the next_in_service for - * its sched_data; return %0 otherwise. + * Deactivates an entity, independently from its previous state. Must + * be invoked only if entity is on a service tree. Extracts the entity + * from that tree, and if necessary and allowed, puts it on the idle + * tree. */ -static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue) +static bool __bfq_deactivate_entity(struct bfq_entity *entity, + bool ins_into_idle_tree) { struct bfq_sched_data *sd = entity->sched_data; - struct bfq_service_tree *st; - int was_in_service; - int ret = 0; - - if (sd == NULL || !entity->on_st) /* never activated, or inactive */ - return 0; + struct bfq_service_tree *st = bfq_entity_service_tree(entity); + bool was_in_service = entity == sd->in_service_entity; - st = bfq_entity_service_tree(entity); - was_in_service = entity == sd->in_service_entity; + if (!entity->on_st) { /* entity never activated, or already inactive */ + BUG_ON(entity == entity->sched_data->in_service_entity); + return false; + } - BUG_ON(was_in_service && entity->tree); + BUG_ON(was_in_service && entity->tree && entity->tree != &st->active); - if (was_in_service) { + if (was_in_service) bfq_calc_finish(entity, entity->service); - sd->in_service_entity = NULL; - } else if (entity->tree == &st->active) + + if (entity->tree == &st->active) bfq_active_extract(st, entity); - else if (entity->tree == &st->idle) + else if (!was_in_service && entity->tree == &st->idle) bfq_idle_extract(st, entity); else if (entity->tree) BUG(); - if (was_in_service || sd->next_in_service == entity) - ret = bfq_update_next_in_service(sd); - - if (!requeue || !bfq_gt(entity->finish, st->vtime)) + if (!ins_into_idle_tree || !bfq_gt(entity->finish, st->vtime)) bfq_forget_entity(st, entity); else bfq_idle_insert(st, entity); - BUG_ON(sd->in_service_entity == entity); - BUG_ON(sd->next_in_service == entity); - - return ret; + return true; } /** - * bfq_deactivate_entity - deactivate an entity. + * bfq_deactivate_entity - deactivate an entity representing a bfq_queue. * @entity: the entity to deactivate. - * @requeue: true if the entity can be put on the idle tree + * @ins_into_idle_tree: true if the entity can be put on the idle tree */ -static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue) +static void bfq_deactivate_entity(struct bfq_entity *entity, + bool ins_into_idle_tree, + bool expiration) { struct bfq_sched_data *sd; - struct bfq_entity *parent; + struct bfq_entity *parent = NULL; for_each_entity_safe(entity, parent) { sd = entity->sched_data; - if (!__bfq_deactivate_entity(entity, requeue)) + BUG_ON(sd == NULL); /* + * It would mean that this is the + * root group. + */ + + BUG_ON(expiration && entity != sd->in_service_entity); + + BUG_ON(entity != sd->in_service_entity && + entity->tree == + &bfq_entity_service_tree(entity)->active && + !sd->next_in_service); + + if (!__bfq_deactivate_entity(entity, ins_into_idle_tree)) { /* - * The parent entity is still backlogged, and - * we don't need to update it as it is still - * in service. 
+ * Entity is not any tree any more, so, this + * deactivation is a no-op, and there is + * nothing to change for upper-level entities + * (in case of expiration, this can never + * happen). */ - break; + BUG_ON(expiration); /* + * entity cannot be already out of + * any tree + */ + return; + } - if (sd->next_in_service) + if (sd->next_in_service == entity) /* - * The parent entity is still backlogged and - * the budgets on the path towards the root - * need to be updated. + * entity was the next_in_service entity, + * then, since entity has just been + * deactivated, a new one must be found. */ - goto update; + bfq_update_next_in_service(sd, NULL); + + if (sd->next_in_service) { + /* + * The parent entity is still backlogged, + * because next_in_service is not NULL. So, no + * further upwards deactivation must be + * performed. Yet, next_in_service has + * changed. Then the schedule does need to be + * updated upwards. + */ + BUG_ON(sd->next_in_service == entity); + break; + } /* - * If we reach there the parent is no more backlogged and - * we want to propagate the dequeue upwards. + * If we get here, then the parent is no more + * backlogged and we need to propagate the + * deactivation upwards. Thus let the loop go on. */ - requeue = 1; - } - return; + /* + * Also let parent be queued into the idle tree on + * deactivation, to preserve service guarantees, and + * assuming that who invoked this function does not + * need parent entities too to be removed completely. + */ + ins_into_idle_tree = true; + } -update: + /* + * If the deactivation loop is fully executed, then there are + * no more entities to touch and next loop is not executed at + * all. Otherwise, requeue remaining entities if they are + * about to stop receiving service, or reposition them if this + * is not the case. + */ entity = parent; for_each_entity(entity) { - __bfq_activate_entity(entity); + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + /* + * Invoke __bfq_requeue_entity on entity, even if + * already active, to requeue/reposition it in the + * active tree (because sd->next_in_service has + * changed) + */ + __bfq_requeue_entity(entity); sd = entity->sched_data; - if (!bfq_update_next_in_service(sd)) + BUG_ON(expiration && sd->in_service_entity != entity); + + if (bfqq) + bfq_log_bfqq(bfqq->bfqd, bfqq, + "invoking udpdate_next for this queue"); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + else { + struct bfq_group *bfqg = + container_of(entity, + struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, + "invoking udpdate_next for this entity"); + } +#endif + if (!bfq_update_next_in_service(sd, entity) && + !expiration) + /* + * next_in_service unchanged or not causing + * any change in entity->parent->sd, and no + * requeueing needed for expiration: stop + * here. + */ break; } } /** - * bfq_update_vtime - update vtime if necessary. + * bfq_calc_vtime_jump - compute the value to which the vtime should jump, + * if needed, to have at least one entity eligible. * @st: the service tree to act upon. * - * If necessary update the service tree vtime to have at least one - * eligible entity, skipping to its start time. Assumes that the - * active tree of the device is not empty. - * - * NOTE: this hierarchical implementation updates vtimes quite often, - * we may end up with reactivated processes getting timestamps after a - * vtime skip done because we needed a ->first_active entity on some - * intermediate node. + * Assumes that st is not empty. 
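+ *
+ * Example (invented numbers): if st->vtime is 100 while the smallest
+ * start time in the active tree, i.e., the min_start of its root
+ * entity, is 180, then no entity is eligible, and 180 is returned so
+ * that the caller can make the virtual time jump forward just enough
+ * for at least one entity to become eligible. If min_start is not
+ * higher than st->vtime, the current vtime is returned unchanged.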
*/ -static void bfq_update_vtime(struct bfq_service_tree *st) +static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st) { - struct bfq_entity *entry; - struct rb_node *node = st->active.rb_node; + struct bfq_entity *root_entity = bfq_root_active_entity(&st->active); + + if (bfq_gt(root_entity->min_start, st->vtime)) { + struct bfq_queue *bfqq = bfq_entity_to_bfqq(root_entity); - entry = rb_entry(node, struct bfq_entity, rb_node); - if (bfq_gt(entry->min_start, st->vtime)) { - st->vtime = entry->min_start; + if (bfqq) + bfq_log_bfqq(bfqq->bfqd, bfqq, + "calc_vtime_jump: new value %llu", + root_entity->min_start); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + else { + struct bfq_group *bfqg = + container_of(root_entity, struct bfq_group, + entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, + "calc_vtime_jump: new value %llu", + root_entity->min_start); + } +#endif + return root_entity->min_start; + } + return st->vtime; +} + +static void bfq_update_vtime(struct bfq_service_tree *st, u64 new_value) +{ + if (new_value > st->vtime) { + st->vtime = new_value; bfq_forget_idle(st); } } @@ -952,6 +1462,7 @@ static void bfq_update_vtime(struct bfq_service_tree *st) * bfq_first_active_entity - find the eligible entity with * the smallest finish time * @st: the service tree to select from. + * @vtime: the system virtual to use as a reference for eligibility * * This function searches the first schedulable entity, starting from the * root of the tree and going on the left every time on this side there is @@ -959,7 +1470,8 @@ static void bfq_update_vtime(struct bfq_service_tree *st) * the right is followed only if a) the left subtree contains no eligible * entities and b) no eligible entity has been found yet. */ -static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st) +static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st, + u64 vtime) { struct bfq_entity *entry, *first = NULL; struct rb_node *node = st->active.rb_node; @@ -967,15 +1479,15 @@ static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st) while (node) { entry = rb_entry(node, struct bfq_entity, rb_node); left: - if (!bfq_gt(entry->start, st->vtime)) + if (!bfq_gt(entry->start, vtime)) first = entry; - BUG_ON(bfq_gt(entry->min_start, st->vtime)); + BUG_ON(bfq_gt(entry->min_start, vtime)); if (node->rb_left) { entry = rb_entry(node->rb_left, struct bfq_entity, rb_node); - if (!bfq_gt(entry->min_start, st->vtime)) { + if (!bfq_gt(entry->min_start, vtime)) { node = node->rb_left; goto left; } @@ -993,31 +1505,84 @@ static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st) * __bfq_lookup_next_entity - return the first eligible entity in @st. * @st: the service tree. * - * Update the virtual time in @st and return the first eligible entity - * it contains. + * If there is no in-service entity for the sched_data st belongs to, + * then return the entity that will be set in service if: + * 1) the parent entity this st belongs to is set in service; + * 2) no entity belonging to such parent entity undergoes a state change + * that would influence the timestamps of the entity (e.g., becomes idle, + * becomes backlogged, changes its budget, ...). + * + * In this first case, update the virtual time in @st too (see the + * comments on this update inside the function). 
+ * + * In constrast, if there is an in-service entity, then return the + * entity that would be set in service if not only the above + * conditions, but also the next one held true: the currently + * in-service entity, on expiration, + * 1) gets a finish time equal to the current one, or + * 2) is not eligible any more, or + * 3) is idle. */ -static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st, - bool force) +static struct bfq_entity * +__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service +#if 0 + , bool force +#endif + ) { - struct bfq_entity *entity, *new_next_in_service = NULL; + struct bfq_entity *entity +#if 0 + , *new_next_in_service = NULL +#endif + ; + u64 new_vtime; + struct bfq_queue *bfqq; if (RB_EMPTY_ROOT(&st->active)) return NULL; - bfq_update_vtime(st); - entity = bfq_first_active_entity(st); - BUG_ON(bfq_gt(entity->start, st->vtime)); + /* + * Get the value of the system virtual time for which at + * least one entity is eligible. + */ + new_vtime = bfq_calc_vtime_jump(st); /* - * If the chosen entity does not match with the sched_data's - * next_in_service and we are forcedly serving the IDLE priority - * class tree, bubble up budget update. + * If there is no in-service entity for the sched_data this + * active tree belongs to, then push the system virtual time + * up to the value that guarantees that at least one entity is + * eligible. If, instead, there is an in-service entity, then + * do not make any such update, because there is already an + * eligible entity, namely the in-service one (even if the + * entity is not on st, because it was extracted when set in + * service). */ - if (unlikely(force && entity != entity->sched_data->next_in_service)) { - new_next_in_service = entity; - for_each_entity(new_next_in_service) - bfq_update_budget(new_next_in_service); + if (!in_service) + bfq_update_vtime(st, new_vtime); + + entity = bfq_first_active_entity(st, new_vtime); + BUG_ON(bfq_gt(entity->start, new_vtime)); + + /* Log some information */ + bfqq = bfq_entity_to_bfqq(entity); + if (bfqq) + bfq_log_bfqq(bfqq->bfqd, bfqq, + "__lookup_next: start %llu vtime %llu st %p", + ((entity->start>>10)*1000)>>12, + ((new_vtime>>10)*1000)>>12, st); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + else { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, + "__lookup_next: start %llu vtime %llu st %p", + ((entity->start>>10)*1000)>>12, + ((new_vtime>>10)*1000)>>12, st); } +#endif + + BUG_ON(!entity); return entity; } @@ -1025,50 +1590,81 @@ static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st, /** * bfq_lookup_next_entity - return the first eligible entity in @sd. * @sd: the sched_data. - * @extract: if true the returned entity will be also extracted from @sd. * - * NOTE: since we cache the next_in_service entity at each level of the - * hierarchy, the complexity of the lookup can be decreased with - * absolutely no effort just returning the cached next_in_service value; - * we prefer to do full lookups to test the consistency of * the data - * structures. + * This function is invoked when there has been a change in the trees + * for sd, and we need know what is the new next entity after this + * change. 
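+ *
+ * In short, and only as a sketch of the logic below: unless the
+ * CLASS_IDLE service guarantee has expired while some idle-class
+ * entity is backlogged, the service trees are scanned in priority
+ * order (RT, then BE, then IDLE), and the first tree that yields an
+ * eligible entity provides the next entity to serve; otherwise the
+ * scan starts, and therefore ends, directly at the IDLE tree.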
*/ -static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, - int extract, - struct bfq_data *bfqd) +static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd) { struct bfq_service_tree *st = sd->service_tree; - struct bfq_entity *entity; - int i = 0; - - BUG_ON(sd->in_service_entity); + struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1); + struct bfq_entity *entity = NULL; + struct bfq_queue *bfqq; + int class_idx = 0; - if (bfqd && - jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) { - entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1, - true); - if (entity) { - i = BFQ_IOPRIO_CLASSES - 1; - bfqd->bfq_class_idle_last_service = jiffies; - sd->next_in_service = entity; - } + BUG_ON(!sd); + BUG_ON(!st); + /* + * Choose from idle class, if needed to guarantee a minimum + * bandwidth to this class (and if there is some active entity + * in idle class). This should also mitigate + * priority-inversion problems in case a low priority task is + * holding file system resources. + */ + if (time_is_before_jiffies(sd->bfq_class_idle_last_service + + BFQ_CL_IDLE_TIMEOUT)) { + if (!RB_EMPTY_ROOT(&idle_class_st->active)) + class_idx = BFQ_IOPRIO_CLASSES - 1; + /* About to be served if backlogged, or not yet backlogged */ + sd->bfq_class_idle_last_service = jiffies; } - for (; i < BFQ_IOPRIO_CLASSES; i++) { - entity = __bfq_lookup_next_entity(st + i, false); - if (entity) { - if (extract) { - bfq_check_next_in_service(sd, entity); - bfq_active_extract(st + i, entity); - sd->in_service_entity = entity; - sd->next_in_service = NULL; - } + + /* + * Find the next entity to serve for the highest-priority + * class, unless the idle class needs to be served. + */ + for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) { + entity = __bfq_lookup_next_entity(st + class_idx, + sd->in_service_entity); + + if (entity) break; - } } + BUG_ON(!entity && + (!RB_EMPTY_ROOT(&st->active) || !RB_EMPTY_ROOT(&(st+1)->active) || + !RB_EMPTY_ROOT(&(st+2)->active))); + + if (!entity) + return NULL; + + /* Log some information */ + bfqq = bfq_entity_to_bfqq(entity); + if (bfqq) + bfq_log_bfqq(bfqq->bfqd, bfqq, "chosen from st %p %d", + st + class_idx, class_idx); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + else { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, + "chosen from st %p %d", + st + class_idx, class_idx); + } +#endif + return entity; } +static bool next_queue_may_preempt(struct bfq_data *bfqd) +{ + struct bfq_sched_data *sd = &bfqd->root_group->sched_data; + + return sd->next_in_service != sd->in_service_entity; +} + /* * Get next queue for service. */ @@ -1083,58 +1679,208 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) if (bfqd->busy_queues == 0) return NULL; + /* + * Traverse the path from the root to the leaf entity to + * serve. Set in service all the entities visited along the + * way. 
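+ *
+ * As a sketch, with a hypothetical two-level hierarchy
+ * root group -> group G -> queue Q: the first iteration makes G's
+ * entity the in_service_entity of the root sched_data, the second
+ * iteration descends into G's own sched_data and picks Q; the loop
+ * then stops, because a queue has no my_sched_data to descend into.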
+ */ sd = &bfqd->root_group->sched_data; for (; sd ; sd = entity->my_sched_data) { - entity = bfq_lookup_next_entity(sd, 1, bfqd); - BUG_ON(!entity); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + if (entity) { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg(bfqd, bfqg, + "get_next_queue: lookup in this group"); + if (!sd->next_in_service) + pr_crit("get_next_queue: lookup in this group"); + } else { + bfq_log_bfqg(bfqd, bfqd->root_group, + "get_next_queue: lookup in root group"); + if (!sd->next_in_service) + pr_crit("get_next_queue: lookup in root group"); + } +#endif + + BUG_ON(!sd->next_in_service); + + /* + * WARNING. We are about to set the in-service entity + * to sd->next_in_service, i.e., to the (cached) value + * returned by bfq_lookup_next_entity(sd) the last + * time it was invoked, i.e., the last time when the + * service order in sd changed as a consequence of the + * activation or deactivation of an entity. In this + * respect, if we execute bfq_lookup_next_entity(sd) + * in this very moment, it may, although with low + * probability, yield a different entity than that + * pointed to by sd->next_in_service. This rare event + * happens in case there was no CLASS_IDLE entity to + * serve for sd when bfq_lookup_next_entity(sd) was + * invoked for the last time, while there is now one + * such entity. + * + * If the above event happens, then the scheduling of + * such entity in CLASS_IDLE is postponed until the + * service of the sd->next_in_service entity + * finishes. In fact, when the latter is expired, + * bfq_lookup_next_entity(sd) gets called again, + * exactly to update sd->next_in_service. + */ + + /* Make next_in_service entity become in_service_entity */ + entity = sd->next_in_service; + sd->in_service_entity = entity; + + /* + * Reset the accumulator of the amount of service that + * the entity is about to receive. + */ entity->service = 0; + + /* + * If entity is no longer a candidate for next + * service, then we extract it from its active tree, + * for the following reason. To further boost the + * throughput in some special case, BFQ needs to know + * which is the next candidate entity to serve, while + * there is already an entity in service. In this + * respect, to make it easy to compute/update the next + * candidate entity to serve after the current + * candidate has been set in service, there is a case + * where it is necessary to extract the current + * candidate from its service tree. Such a case is + * when the entity just set in service cannot be also + * a candidate for next service. Details about when + * this conditions holds are reported in the comments + * on the function bfq_no_longer_next_in_service() + * invoked below. + */ + if (bfq_no_longer_next_in_service(entity)) + bfq_active_extract(bfq_entity_service_tree(entity), + entity); + + /* + * For the same reason why we may have just extracted + * entity from its active tree, we may need to update + * next_in_service for the sched_data of entity too, + * regardless of whether entity has been extracted. + * In fact, even if entity has not been extracted, a + * descendant entity may get extracted. Such an event + * would cause a change in next_in_service for the + * level of the descendant entity, and thus possibly + * back to upper levels. 
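+ * (For instance, a group whose only active child is the queue
+ * being put in service stops, by the rule recalled above, being a
+ * valid candidate for next service in its parent's sched_data,
+ * which in turn may change the next candidate one level up, and
+ * so on.)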
+ * + * We cannot perform the resulting needed update + * before the end of this loop, because, to know which + * is the correct next-to-serve candidate entity for + * each level, we need first to find the leaf entity + * to set in service. In fact, only after we know + * which is the next-to-serve leaf entity, we can + * discover whether the parent entity of the leaf + * entity becomes the next-to-serve, and so on. + */ + + /* Log some information */ + bfqq = bfq_entity_to_bfqq(entity); + if (bfqq) + bfq_log_bfqq(bfqd, bfqq, + "get_next_queue: this queue, finish %llu", + (((entity->finish>>10)*1000)>>10)>>2); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + else { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg(bfqd, bfqg, + "get_next_queue: this entity, finish %llu", + (((entity->finish>>10)*1000)>>10)>>2); + } +#endif + } + BUG_ON(!entity); bfqq = bfq_entity_to_bfqq(entity); BUG_ON(!bfqq); + /* + * We can finally update all next-to-serve entities along the + * path from the leaf entity just set in service to the root. + */ + for_each_entity(entity) { + struct bfq_sched_data *sd = entity->sched_data; + + if(!bfq_update_next_in_service(sd, NULL)) + break; + } + return bfqq; } static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) { + struct bfq_entity *entity = &bfqd->in_service_queue->entity; + if (bfqd->in_service_bic) { put_io_context(bfqd->in_service_bic->icq.ioc); bfqd->in_service_bic = NULL; } + bfq_clear_bfqq_wait_request(bfqd->in_service_queue); + hrtimer_try_to_cancel(&bfqd->idle_slice_timer); bfqd->in_service_queue = NULL; - del_timer(&bfqd->idle_slice_timer); + + /* + * When this function is called, all in-service entities have + * been properly deactivated or requeued, so we can safely + * execute the final step: reset in_service_entity along the + * path from entity to the root. + */ + for_each_entity(entity) + entity->sched_data->in_service_entity = NULL; } static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, - int requeue) + bool ins_into_idle_tree, bool expiration) { struct bfq_entity *entity = &bfqq->entity; - if (bfqq == bfqd->in_service_queue) - __bfq_bfqd_reset_in_service(bfqd); - - bfq_deactivate_entity(entity, requeue); + bfq_deactivate_entity(entity, ins_into_idle_tree, expiration); } static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) { struct bfq_entity *entity = &bfqq->entity; + struct bfq_service_tree *st = bfq_entity_service_tree(entity); + + BUG_ON(bfqq == bfqd->in_service_queue); + BUG_ON(entity->tree != &st->active && entity->tree != &st->idle && + entity->on_st); - bfq_activate_entity(entity); + bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq), + false); + bfq_clear_bfqq_non_blocking_wait_rq(bfqq); +} + +static void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + + bfq_activate_requeue_entity(entity, false, + bfqq == bfqd->in_service_queue); } -#ifdef CONFIG_BFQ_GROUP_IOSCHED static void bfqg_stats_update_dequeue(struct bfq_group *bfqg); -#endif /* * Called when the bfqq no longer has requests pending, remove it from - * the service tree. + * the service tree. As a special case, it can be invoked during an + * expiration. 
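+ *
+ * For example, when the in-service queue empties and is then expired,
+ * this function removes it from the list of busy queues and
+ * deactivates its entity with the expiration flag set, so that its
+ * ancestors are properly requeued (see the expiration checks in
+ * bfq_deactivate_entity()).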
*/ static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, - int requeue) + bool expiration) { BUG_ON(!bfq_bfqq_busy(bfqq)); BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); @@ -1146,27 +1892,20 @@ static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, BUG_ON(bfqd->busy_queues == 0); bfqd->busy_queues--; - if (!bfqq->dispatched) { + if (!bfqq->dispatched) bfq_weights_tree_remove(bfqd, &bfqq->entity, &bfqd->queue_weights_tree); - if (!blk_queue_nonrot(bfqd->queue)) { - BUG_ON(!bfqd->busy_in_flight_queues); - bfqd->busy_in_flight_queues--; - if (bfq_bfqq_constantly_seeky(bfqq)) { - BUG_ON(!bfqd-> - const_seeky_busy_in_flight_queues); - bfqd->const_seeky_busy_in_flight_queues--; - } - } - } + if (bfqq->wr_coeff > 1) bfqd->wr_busy_queues--; -#ifdef CONFIG_BFQ_GROUP_IOSCHED bfqg_stats_update_dequeue(bfqq_group(bfqq)); -#endif - bfq_deactivate_bfqq(bfqd, bfqq, requeue); + BUG_ON(bfqq->entity.budget < 0); + + bfq_deactivate_bfqq(bfqd, bfqq, true, expiration); + + BUG_ON(bfqq->entity.budget < 0); } /* @@ -1184,16 +1923,11 @@ static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_mark_bfqq_busy(bfqq); bfqd->busy_queues++; - if (!bfqq->dispatched) { + if (!bfqq->dispatched) if (bfqq->wr_coeff == 1) bfq_weights_tree_add(bfqd, &bfqq->entity, &bfqd->queue_weights_tree); - if (!blk_queue_nonrot(bfqd->queue)) { - bfqd->busy_in_flight_queues++; - if (bfq_bfqq_constantly_seeky(bfqq)) - bfqd->const_seeky_busy_in_flight_queues++; - } - } + if (bfqq->wr_coeff > 1) bfqd->wr_busy_queues++; } diff --git a/block/bfq.h b/block/bfq.h index fcce855..2a2bc30 100644 --- a/block/bfq.h +++ b/block/bfq.h @@ -1,5 +1,5 @@ /* - * BFQ-v7r11 for 4.5.0: data structures and common functions prototypes. + * BFQ v8r8 for 4.10.0: data structures and common functions prototypes. * * Based on ideas and code from CFQ: * Copyright (C) 2003 Jens Axboe @@ -7,7 +7,9 @@ * Copyright (C) 2008 Fabio Checconi * Paolo Valente * - * Copyright (C) 2010 Paolo Valente + * Copyright (C) 2015 Paolo Valente + * + * Copyright (C) 2017 Paolo Valente */ #ifndef _BFQ_H @@ -28,20 +30,21 @@ #define BFQ_DEFAULT_QUEUE_IOPRIO 4 -#define BFQ_DEFAULT_GRP_WEIGHT 10 +#define BFQ_WEIGHT_LEGACY_DFL 100 #define BFQ_DEFAULT_GRP_IOPRIO 0 #define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE +/* + * Soft real-time applications are extremely more latency sensitive + * than interactive ones. Over-raise the weight of the former to + * privilege them against the latter. + */ +#define BFQ_SOFTRT_WEIGHT_FACTOR 100 + struct bfq_entity; /** * struct bfq_service_tree - per ioprio_class service tree. - * @active: tree for active entities (i.e., those backlogged). - * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i). - * @first_idle: idle entity with minimum F_i. - * @last_idle: idle entity with maximum F_i. - * @vtime: scheduler virtual time. - * @wsum: scheduler weight sum; active and idle entities contribute to it. * * Each service tree represents a B-WF2Q+ scheduler on its own. Each * ioprio_class has its own independent scheduler, and so its own @@ -49,27 +52,28 @@ struct bfq_entity; * of the containing bfqd. 
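+ *
+ * In B-WF2Q+ terms, every active entity i has a virtual start
+ * timestamp S_i and a virtual finish timestamp
+ * F_i = S_i + budget_i / weight_i; an entity is eligible when
+ * S_i <= V, where V is the vtime of its service tree, and, among the
+ * eligible entities, the one with the smallest F_i is served first
+ * (informal summary, see the comments in bfq-sched.c for details).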
*/ struct bfq_service_tree { + /* tree for active entities (i.e., those backlogged) */ struct rb_root active; + /* tree for idle entities (i.e., not backlogged, with V <= F_i)*/ struct rb_root idle; - struct bfq_entity *first_idle; - struct bfq_entity *last_idle; + struct bfq_entity *first_idle; /* idle entity with minimum F_i */ + struct bfq_entity *last_idle; /* idle entity with maximum F_i */ - u64 vtime; + u64 vtime; /* scheduler virtual time */ + /* scheduler weight sum; active and idle entities contribute to it */ unsigned long wsum; }; /** * struct bfq_sched_data - multi-class scheduler. - * @in_service_entity: entity in service. - * @next_in_service: head-of-the-line entity in the scheduler. - * @service_tree: array of service trees, one per ioprio_class. * * bfq_sched_data is the basic scheduler queue. It supports three - * ioprio_classes, and can be used either as a toplevel queue or as - * an intermediate queue on a hierarchical setup. - * @next_in_service points to the active entity of the sched_data - * service trees that will be scheduled next. + * ioprio_classes, and can be used either as a toplevel queue or as an + * intermediate queue on a hierarchical setup. @next_in_service + * points to the active entity of the sched_data service trees that + * will be scheduled next. It is used to reduce the number of steps + * needed for each hierarchical-schedule update. * * The supported ioprio_classes are the same as in CFQ, in descending * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. @@ -79,48 +83,32 @@ struct bfq_service_tree { * All the fields are protected by the queue lock of the containing bfqd. */ struct bfq_sched_data { - struct bfq_entity *in_service_entity; + struct bfq_entity *in_service_entity; /* entity in service */ + /* head-of-the-line entity in the scheduler (see comments above) */ struct bfq_entity *next_in_service; + /* array of service trees, one per ioprio_class */ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; + /* last time CLASS_IDLE was served */ + unsigned long bfq_class_idle_last_service; + }; /** * struct bfq_weight_counter - counter of the number of all active entities * with a given weight. - * @weight: weight of the entities that this counter refers to. - * @num_active: number of active entities with this weight. - * @weights_node: weights tree member (see bfq_data's @queue_weights_tree - * and @group_weights_tree). */ struct bfq_weight_counter { - short int weight; - unsigned int num_active; + unsigned int weight; /* weight of the entities this counter refers to */ + unsigned int num_active; /* nr of active entities with this weight */ + /* + * Weights tree member (see bfq_data's @queue_weights_tree and + * @group_weights_tree) + */ struct rb_node weights_node; }; /** * struct bfq_entity - schedulable entity. - * @rb_node: service_tree member. - * @weight_counter: pointer to the weight counter associated with this entity. - * @on_st: flag, true if the entity is on a tree (either the active or - * the idle one of its service_tree). - * @finish: B-WF2Q+ finish timestamp (aka F_i). - * @start: B-WF2Q+ start timestamp (aka S_i). - * @tree: tree the entity is enqueued into; %NULL if not on a tree. - * @min_start: minimum start time of the (active) subtree rooted at - * this entity; used for O(log N) lookups into active trees. - * @service: service received during the last round of service. - * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight. 
- * @weight: weight of the queue - * @parent: parent entity, for hierarchical scheduling. - * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the - * associated scheduler queue, %NULL on leaf nodes. - * @sched_data: the scheduler queue this entity belongs to. - * @ioprio: the ioprio in use. - * @new_weight: when a weight change is requested, the new weight value. - * @orig_weight: original weight, used to implement weight boosting - * @prio_changed: flag, true when the user requested a weight, ioprio or - * ioprio_class change. * * A bfq_entity is used to represent either a bfq_queue (leaf node in the * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each @@ -147,27 +135,52 @@ struct bfq_weight_counter { * containing bfqd. */ struct bfq_entity { - struct rb_node rb_node; + struct rb_node rb_node; /* service_tree member */ + /* pointer to the weight counter associated with this entity */ struct bfq_weight_counter *weight_counter; - int on_st; + /* + * Flag, true if the entity is on a tree (either the active or + * the idle one of its service_tree) or is in service. + */ + bool on_st; - u64 finish; - u64 start; + u64 finish; /* B-WF2Q+ finish timestamp (aka F_i) */ + u64 start; /* B-WF2Q+ start timestamp (aka S_i) */ + /* tree the entity is enqueued into; %NULL if not on a tree */ struct rb_root *tree; + /* + * minimum start time of the (active) subtree rooted at this + * entity; used for O(log N) lookups into active trees + */ u64 min_start; - int service, budget; - unsigned short weight, new_weight; - unsigned short orig_weight; + /* amount of service received during the last service slot */ + int service; + + /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */ + int budget; + + unsigned int weight; /* weight of the queue */ + unsigned int new_weight; /* next weight if a change is in progress */ + + /* original weight, used to implement weight boosting */ + unsigned int orig_weight; + /* parent entity, for hierarchical scheduling */ struct bfq_entity *parent; + /* + * For non-leaf nodes in the hierarchy, the associated + * scheduler queue, %NULL on leaf nodes. + */ struct bfq_sched_data *my_sched_data; + /* the scheduler queue this entity belongs to */ struct bfq_sched_data *sched_data; + /* flag, set to request a weight, ioprio or ioprio_class change */ int prio_changed; }; @@ -175,56 +188,6 @@ struct bfq_group; /** * struct bfq_queue - leaf schedulable entity. - * @ref: reference counter. - * @bfqd: parent bfq_data. - * @new_ioprio: when an ioprio change is requested, the new ioprio value. - * @ioprio_class: the ioprio_class in use. - * @new_ioprio_class: when an ioprio_class change is requested, the new - * ioprio_class value. - * @new_bfqq: shared bfq_queue if queue is cooperating with - * one or more other queues. - * @pos_node: request-position tree member (see bfq_group's @rq_pos_tree). - * @pos_root: request-position tree root (see bfq_group's @rq_pos_tree). - * @sort_list: sorted list of pending requests. - * @next_rq: if fifo isn't expired, next request to serve. - * @queued: nr of requests queued in @sort_list. - * @allocated: currently allocated requests. - * @meta_pending: pending metadata requests. - * @fifo: fifo list of requests in sort_list. - * @entity: entity representing this queue in the scheduler. - * @max_budget: maximum budget allowed from the feedback mechanism. - * @budget_timeout: budget expiration (in jiffies). - * @dispatched: number of requests on the dispatch list or inside driver. - * @flags: status flags. 
- * @bfqq_list: node for active/idle bfqq list inside our bfqd. - * @burst_list_node: node for the device's burst list. - * @seek_samples: number of seeks sampled - * @seek_total: sum of the distances of the seeks sampled - * @seek_mean: mean seek distance - * @last_request_pos: position of the last request enqueued - * @requests_within_timer: number of consecutive pairs of request completion - * and arrival, such that the queue becomes idle - * after the completion, but the next request arrives - * within an idle time slice; used only if the queue's - * IO_bound has been cleared. - * @pid: pid of the process owning the queue, used for logging purposes. - * @last_wr_start_finish: start time of the current weight-raising period if - * the @bfq-queue is being weight-raised, otherwise - * finish time of the last weight-raising period - * @wr_cur_max_time: current max raising time for this queue - * @soft_rt_next_start: minimum time instant such that, only if a new - * request is enqueued after this time instant in an - * idle @bfq_queue with no outstanding requests, then - * the task associated with the queue it is deemed as - * soft real-time (see the comments to the function - * bfq_bfqq_softrt_next_start()) - * @last_idle_bklogged: time of the last transition of the @bfq_queue from - * idle to backlogged - * @service_from_backlogged: cumulative service received from the @bfq_queue - * since the last transition from idle to - * backlogged - * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the - * queue is shared * * A bfq_queue is a leaf request queue; it can be associated with an * io_context or more, if it is async or shared between cooperating @@ -235,117 +198,175 @@ struct bfq_group; * All the fields are protected by the queue lock of the containing bfqd. */ struct bfq_queue { - atomic_t ref; + /* reference counter */ + int ref; + /* parent bfq_data */ struct bfq_data *bfqd; - unsigned short ioprio, new_ioprio; - unsigned short ioprio_class, new_ioprio_class; + /* current ioprio and ioprio class */ + unsigned short ioprio, ioprio_class; + /* next ioprio and ioprio class if a change is in progress */ + unsigned short new_ioprio, new_ioprio_class; - /* fields for cooperating queues handling */ + /* + * Shared bfq_queue if queue is cooperating with one or more + * other queues. 
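+ * (For example, after the queues of two processes doing interleaved
+ * I/O on close sectors get merged, the merged queue points here to
+ * the shared queue; %NULL otherwise.)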
+ */ struct bfq_queue *new_bfqq; + /* request-position tree member (see bfq_group's @rq_pos_tree) */ struct rb_node pos_node; + /* request-position tree root (see bfq_group's @rq_pos_tree) */ struct rb_root *pos_root; + /* sorted list of pending requests */ struct rb_root sort_list; + /* if fifo isn't expired, next request to serve */ struct request *next_rq; + /* number of sync and async requests queued */ int queued[2]; + /* number of sync and async requests currently allocated */ int allocated[2]; + /* number of pending metadata requests */ int meta_pending; + /* fifo list of requests in sort_list */ struct list_head fifo; + /* entity representing this queue in the scheduler */ struct bfq_entity entity; + /* maximum budget allowed from the feedback mechanism */ int max_budget; + /* budget expiration (in jiffies) */ unsigned long budget_timeout; + /* number of requests on the dispatch list or inside driver */ int dispatched; - unsigned int flags; + unsigned int flags; /* status flags.*/ + /* node for active/idle bfqq list inside parent bfqd */ struct list_head bfqq_list; + /* bit vector: a 1 for each seeky requests in history */ + u32 seek_history; + + /* node for the device's burst list */ struct hlist_node burst_list_node; - unsigned int seek_samples; - u64 seek_total; - sector_t seek_mean; + /* position of the last request enqueued */ sector_t last_request_pos; + /* Number of consecutive pairs of request completion and + * arrival, such that the queue becomes idle after the + * completion, but the next request arrives within an idle + * time slice; used only if the queue's IO_bound flag has been + * cleared. + */ unsigned int requests_within_timer; + /* pid of the process owning the queue, used for logging purposes */ pid_t pid; + + /* + * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL + * if the queue is shared. + */ struct bfq_io_cq *bic; - /* weight-raising fields */ + /* current maximum weight-raising time for this queue */ unsigned long wr_cur_max_time; + /* + * Minimum time instant such that, only if a new request is + * enqueued after this time instant in an idle @bfq_queue with + * no outstanding requests, then the task associated with the + * queue it is deemed as soft real-time (see the comments on + * the function bfq_bfqq_softrt_next_start()) + */ unsigned long soft_rt_next_start; + /* + * Start time of the current weight-raising period if + * the @bfq-queue is being weight-raised, otherwise + * finish time of the last weight-raising period. + */ unsigned long last_wr_start_finish; + /* factor by which the weight of this queue is multiplied */ unsigned int wr_coeff; + /* + * Time of the last transition of the @bfq_queue from idle to + * backlogged. + */ unsigned long last_idle_bklogged; + /* + * Cumulative service received from the @bfq_queue since the + * last transition from idle to backlogged. + */ unsigned long service_from_backlogged; + /* + * Value of wr start time when switching to soft rt + */ + unsigned long wr_start_at_switch_to_srt; + + unsigned long split_time; /* time of last split */ }; /** * struct bfq_ttime - per process thinktime stats. 
- * @ttime_total: total process thinktime - * @ttime_samples: number of thinktime samples - * @ttime_mean: average process thinktime */ struct bfq_ttime { - unsigned long last_end_request; + u64 last_end_request; /* completion time of last request */ + + u64 ttime_total; /* total process thinktime */ + unsigned long ttime_samples; /* number of thinktime samples */ + u64 ttime_mean; /* average process thinktime */ - unsigned long ttime_total; - unsigned long ttime_samples; - unsigned long ttime_mean; }; /** * struct bfq_io_cq - per (request_queue, io_context) structure. - * @icq: associated io_cq structure - * @bfqq: array of two process queues, the sync and the async - * @ttime: associated @bfq_ttime struct - * @ioprio: per (request_queue, blkcg) ioprio. - * @blkcg_id: id of the blkcg the related io_cq belongs to. - * @wr_time_left: snapshot of the time left before weight raising ends - * for the sync queue associated to this process; this - * snapshot is taken to remember this value while the weight - * raising is suspended because the queue is merged with a - * shared queue, and is used to set @raising_cur_max_time - * when the queue is split from the shared queue and its - * weight is raised again - * @saved_idle_window: same purpose as the previous field for the idle - * window - * @saved_IO_bound: same purpose as the previous two fields for the I/O - * bound classification of a queue - * @saved_in_large_burst: same purpose as the previous fields for the - * value of the field keeping the queue's belonging - * to a large burst - * @was_in_burst_list: true if the queue belonged to a burst list - * before its merge with another cooperating queue - * @cooperations: counter of consecutive successful queue merges underwent - * by any of the process' @bfq_queues - * @failed_cooperations: counter of consecutive failed queue merges of any - * of the process' @bfq_queues */ struct bfq_io_cq { + /* associated io_cq structure */ struct io_cq icq; /* must be the first member */ + /* array of two process queues, the sync and the async */ struct bfq_queue *bfqq[2]; + /* associated @bfq_ttime struct */ struct bfq_ttime ttime; + /* per (request_queue, blkcg) ioprio */ int ioprio; - #ifdef CONFIG_BFQ_GROUP_IOSCHED - uint64_t blkcg_id; /* the current blkcg ID */ + uint64_t blkcg_serial_nr; /* the current blkcg serial */ #endif - unsigned int wr_time_left; + /* + * Snapshot of the idle window before merging; taken to + * remember this value while the queue is merged, so as to be + * able to restore it in case of split. + */ bool saved_idle_window; + /* + * Same purpose as the previous two fields for the I/O bound + * classification of a queue. + */ bool saved_IO_bound; + /* + * Same purpose as the previous fields for the value of the + * field keeping the queue's belonging to a large burst + */ bool saved_in_large_burst; + /* + * True if the queue belonged to a burst list before its merge + * with another cooperating queue. + */ bool was_in_burst_list; - unsigned int cooperations; - unsigned int failed_cooperations; + /* + * Similar to previous fields: save wr information. + */ + unsigned long saved_wr_coeff; + unsigned long saved_last_wr_start_finish; + unsigned long saved_wr_start_at_switch_to_srt; + unsigned int saved_wr_cur_max_time; }; enum bfq_device_speed { @@ -354,224 +375,232 @@ enum bfq_device_speed { }; /** - * struct bfq_data - per device data structure. - * @queue: request queue for the managed device. - * @root_group: root bfq_group for the device. 
- * @active_numerous_groups: number of bfq_groups containing more than one - * active @bfq_entity. - * @queue_weights_tree: rbtree of weight counters of @bfq_queues, sorted by - * weight. Used to keep track of whether all @bfq_queues - * have the same weight. The tree contains one counter - * for each distinct weight associated to some active - * and not weight-raised @bfq_queue (see the comments to - * the functions bfq_weights_tree_[add|remove] for - * further details). - * @group_weights_tree: rbtree of non-queue @bfq_entity weight counters, sorted - * by weight. Used to keep track of whether all - * @bfq_groups have the same weight. The tree contains - * one counter for each distinct weight associated to - * some active @bfq_group (see the comments to the - * functions bfq_weights_tree_[add|remove] for further - * details). - * @busy_queues: number of bfq_queues containing requests (including the - * queue in service, even if it is idling). - * @busy_in_flight_queues: number of @bfq_queues containing pending or - * in-flight requests, plus the @bfq_queue in - * service, even if idle but waiting for the - * possible arrival of its next sync request. This - * field is updated only if the device is rotational, - * but used only if the device is also NCQ-capable. - * The reason why the field is updated also for non- - * NCQ-capable rotational devices is related to the - * fact that the value of @hw_tag may be set also - * later than when busy_in_flight_queues may need to - * be incremented for the first time(s). Taking also - * this possibility into account, to avoid unbalanced - * increments/decrements, would imply more overhead - * than just updating busy_in_flight_queues - * regardless of the value of @hw_tag. - * @const_seeky_busy_in_flight_queues: number of constantly-seeky @bfq_queues - * (that is, seeky queues that expired - * for budget timeout at least once) - * containing pending or in-flight - * requests, including the in-service - * @bfq_queue if constantly seeky. This - * field is updated only if the device - * is rotational, but used only if the - * device is also NCQ-capable (see the - * comments to @busy_in_flight_queues). - * @wr_busy_queues: number of weight-raised busy @bfq_queues. - * @queued: number of queued requests. - * @rq_in_driver: number of requests dispatched and waiting for completion. - * @sync_flight: number of sync requests in the driver. - * @max_rq_in_driver: max number of reqs in driver in the last - * @hw_tag_samples completed requests. - * @hw_tag_samples: nr of samples used to calculate hw_tag. - * @hw_tag: flag set to one if the driver is showing a queueing behavior. - * @budgets_assigned: number of budgets assigned. - * @idle_slice_timer: timer set when idling for the next sequential request - * from the queue in service. - * @unplug_work: delayed work to restart dispatching on the request queue. - * @in_service_queue: bfq_queue in service. - * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue. - * @last_position: on-disk position of the last served request. - * @last_budget_start: beginning of the last budget. - * @last_idling_start: beginning of the last idle slice. - * @peak_rate: peak transfer rate observed for a budget. - * @peak_rate_samples: number of samples used to calculate @peak_rate. - * @bfq_max_budget: maximum budget allotted to a bfq_queue before - * rescheduling. - * @active_list: list of all the bfq_queues active on the device. - * @idle_list: list of all the bfq_queues idle on the device. 
- * @bfq_fifo_expire: timeout for async/sync requests; when it expires - * requests are served in fifo order. - * @bfq_back_penalty: weight of backward seeks wrt forward ones. - * @bfq_back_max: maximum allowed backward seek. - * @bfq_slice_idle: maximum idling time. - * @bfq_user_max_budget: user-configured max budget value - * (0 for auto-tuning). - * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to - * async queues. - * @bfq_timeout: timeout for bfq_queues to consume their budget; used to - * to prevent seeky queues to impose long latencies to well - * behaved ones (this also implies that seeky queues cannot - * receive guarantees in the service domain; after a timeout - * they are charged for the whole allocated budget, to try - * to preserve a behavior reasonably fair among them, but - * without service-domain guarantees). - * @bfq_coop_thresh: number of queue merges after which a @bfq_queue is - * no more granted any weight-raising. - * @bfq_failed_cooperations: number of consecutive failed cooperation - * chances after which weight-raising is restored - * to a queue subject to more than bfq_coop_thresh - * queue merges. - * @bfq_requests_within_timer: number of consecutive requests that must be - * issued within the idle time slice to set - * again idling to a queue which was marked as - * non-I/O-bound (see the definition of the - * IO_bound flag for further details). - * @last_ins_in_burst: last time at which a queue entered the current - * burst of queues being activated shortly after - * each other; for more details about this and the - * following parameters related to a burst of - * activations, see the comments to the function - * @bfq_handle_burst. - * @bfq_burst_interval: reference time interval used to decide whether a - * queue has been activated shortly after - * @last_ins_in_burst. - * @burst_size: number of queues in the current burst of queue activations. - * @bfq_large_burst_thresh: maximum burst size above which the current - * queue-activation burst is deemed as 'large'. - * @large_burst: true if a large queue-activation burst is in progress. - * @burst_list: head of the burst list (as for the above fields, more details - * in the comments to the function bfq_handle_burst). - * @low_latency: if set to true, low-latency heuristics are enabled. - * @bfq_wr_coeff: maximum factor by which the weight of a weight-raised - * queue is multiplied. - * @bfq_wr_max_time: maximum duration of a weight-raising period (jiffies). - * @bfq_wr_rt_max_time: maximum duration for soft real-time processes. - * @bfq_wr_min_idle_time: minimum idle period after which weight-raising - * may be reactivated for a queue (in jiffies). - * @bfq_wr_min_inter_arr_async: minimum period between request arrivals - * after which weight-raising may be - * reactivated for an already busy queue - * (in jiffies). - * @bfq_wr_max_softrt_rate: max service-rate for a soft real-time queue, - * sectors per seconds. - * @RT_prod: cached value of the product R*T used for computing the maximum - * duration of the weight raising automatically. - * @device_speed: device-speed class for the low-latency heuristic. - * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions. + * struct bfq_data - per-device data structure. * * All the fields are protected by the @queue lock. 
*/ struct bfq_data { + /* request queue for the device */ struct request_queue *queue; + /* root bfq_group for the device */ struct bfq_group *root_group; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - int active_numerous_groups; -#endif - + /* + * rbtree of weight counters of @bfq_queues, sorted by + * weight. Used to keep track of whether all @bfq_queues have + * the same weight. The tree contains one counter for each + * distinct weight associated to some active and not + * weight-raised @bfq_queue (see the comments to the functions + * bfq_weights_tree_[add|remove] for further details). + */ struct rb_root queue_weights_tree; + /* + * rbtree of non-queue @bfq_entity weight counters, sorted by + * weight. Used to keep track of whether all @bfq_groups have + * the same weight. The tree contains one counter for each + * distinct weight associated to some active @bfq_group (see + * the comments to the functions bfq_weights_tree_[add|remove] + * for further details). + */ struct rb_root group_weights_tree; + /* + * Number of bfq_queues containing requests (including the + * queue in service, even if it is idling). + */ int busy_queues; - int busy_in_flight_queues; - int const_seeky_busy_in_flight_queues; + /* number of weight-raised busy @bfq_queues */ int wr_busy_queues; + /* number of queued requests */ int queued; + /* number of requests dispatched and waiting for completion */ int rq_in_driver; - int sync_flight; + /* + * Maximum number of requests in driver in the last + * @hw_tag_samples completed requests. + */ int max_rq_in_driver; + /* number of samples used to calculate hw_tag */ int hw_tag_samples; + /* flag set to one if the driver is showing a queueing behavior */ int hw_tag; + /* number of budgets assigned */ int budgets_assigned; - struct timer_list idle_slice_timer; + /* + * Timer set when idling (waiting) for the next request from + * the queue in service. + */ + struct hrtimer idle_slice_timer; + /* delayed work to restart dispatching on the request queue */ struct work_struct unplug_work; + /* bfq_queue in service */ struct bfq_queue *in_service_queue; + /* bfq_io_cq (bic) associated with the @in_service_queue */ struct bfq_io_cq *in_service_bic; + /* on-disk position of the last served request */ sector_t last_position; + /* time of last request completion (ns) */ + u64 last_completion; + + /* time of first rq dispatch in current observation interval (ns) */ + u64 first_dispatch; + /* time of last rq dispatch in current observation interval (ns) */ + u64 last_dispatch; + + /* beginning of the last budget */ ktime_t last_budget_start; + /* beginning of the last idle slice */ ktime_t last_idling_start; + + /* number of samples in current observation interval */ int peak_rate_samples; - u64 peak_rate; + /* num of samples of seq dispatches in current observation interval */ + u32 sequential_samples; + /* total num of sectors transferred in current observation interval */ + u64 tot_sectors_dispatched; + /* max rq size seen during current observation interval (sectors) */ + u32 last_rq_max_size; + /* time elapsed from first dispatch in current observ. 
interval (us) */ + u64 delta_from_first; + /* current estimate of device peak rate */ + u32 peak_rate; + + /* maximum budget allotted to a bfq_queue before rescheduling */ int bfq_max_budget; + /* list of all the bfq_queues active on the device */ struct list_head active_list; + /* list of all the bfq_queues idle on the device */ struct list_head idle_list; - unsigned int bfq_fifo_expire[2]; + /* + * Timeout for async/sync requests; when it fires, requests + * are served in fifo order. + */ + u64 bfq_fifo_expire[2]; + /* weight of backward seeks wrt forward ones */ unsigned int bfq_back_penalty; + /* maximum allowed backward seek */ unsigned int bfq_back_max; - unsigned int bfq_slice_idle; - u64 bfq_class_idle_last_service; + /* maximum idling time */ + u32 bfq_slice_idle; + /* user-configured max budget value (0 for auto-tuning) */ int bfq_user_max_budget; - int bfq_max_budget_async_rq; - unsigned int bfq_timeout[2]; - - unsigned int bfq_coop_thresh; - unsigned int bfq_failed_cooperations; + /* + * Timeout for bfq_queues to consume their budget; used to + * prevent seeky queues from imposing long latencies to + * sequential or quasi-sequential ones (this also implies that + * seeky queues cannot receive guarantees in the service + * domain; after a timeout they are charged for the time they + * have been in service, to preserve fairness among them, but + * without service-domain guarantees). + */ + unsigned int bfq_timeout; + + /* + * Number of consecutive requests that must be issued within + * the idle time slice to set again idling to a queue which + * was marked as non-I/O-bound (see the definition of the + * IO_bound flag for further details). + */ unsigned int bfq_requests_within_timer; + /* + * Force device idling whenever needed to provide accurate + * service guarantees, without caring about throughput + * issues. CAVEAT: this may even increase latencies, in case + * of useless idling for processes that did stop doing I/O. + */ + bool strict_guarantees; + + /* + * Last time at which a queue entered the current burst of + * queues being activated shortly after each other; for more + * details about this and the following parameters related to + * a burst of activations, see the comments on the function + * bfq_handle_burst. + */ unsigned long last_ins_in_burst; + /* + * Reference time interval used to decide whether a queue has + * been activated shortly after @last_ins_in_burst. + */ unsigned long bfq_burst_interval; + /* number of queues in the current burst of queue activations */ int burst_size; + + /* common parent entity for the queues in the burst */ + struct bfq_entity *burst_parent_entity; + /* Maximum burst size above which the current queue-activation + * burst is deemed as 'large'. + */ unsigned long bfq_large_burst_thresh; + /* true if a large queue-activation burst is in progress */ bool large_burst; + /* + * Head of the burst list (as for the above fields, more + * details in the comments on the function bfq_handle_burst). + */ struct hlist_head burst_list; + /* if set to true, low-latency heuristics are enabled */ bool low_latency; - - /* parameters of the low_latency heuristics */ + /* + * Maximum factor by which the weight of a weight-raised queue + * is multiplied. 
+ */ unsigned int bfq_wr_coeff; + /* maximum duration of a weight-raising period (jiffies) */ unsigned int bfq_wr_max_time; + + /* Maximum weight-raising duration for soft real-time processes */ unsigned int bfq_wr_rt_max_time; + /* + * Minimum idle period after which weight-raising may be + * reactivated for a queue (in jiffies). + */ unsigned int bfq_wr_min_idle_time; + /* + * Minimum period between request arrivals after which + * weight-raising may be reactivated for an already busy async + * queue (in jiffies). + */ unsigned long bfq_wr_min_inter_arr_async; + + /* Max service-rate for a soft real-time queue, in sectors/sec */ unsigned int bfq_wr_max_softrt_rate; + /* + * Cached value of the product R*T, used for computing the + * maximum duration of weight raising automatically. + */ u64 RT_prod; + /* device-speed class for the low-latency heuristic */ enum bfq_device_speed device_speed; + /* fallback dummy bfqq for extreme OOM conditions */ struct bfq_queue oom_bfqq; }; enum bfqq_state_flags { - BFQ_BFQQ_FLAG_busy = 0, /* has requests or is in service */ + BFQ_BFQQ_FLAG_just_created = 0, /* queue just allocated */ + BFQ_BFQQ_FLAG_busy, /* has requests or is in service */ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ + BFQ_BFQQ_FLAG_non_blocking_wait_rq, /* + * waiting for a request + * without idling the device + */ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */ BFQ_BFQQ_FLAG_sync, /* synchronous queue */ - BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */ BFQ_BFQQ_FLAG_IO_bound, /* * bfqq has timed-out at least once * having consumed at most 2/10 of @@ -581,17 +610,12 @@ enum bfqq_state_flags { * bfqq activated in a large burst, * see comments to bfq_handle_burst. */ - BFQ_BFQQ_FLAG_constantly_seeky, /* - * bfqq has proved to be slow and - * seeky until budget timeout - */ BFQ_BFQQ_FLAG_softrt_update, /* * may need softrt-next-start * update */ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ - BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */ - BFQ_BFQQ_FLAG_just_split, /* queue has just been split */ + BFQ_BFQQ_FLAG_split_coop /* shared bfqq will be split */ }; #define BFQ_BFQQ_FNS(name) \ @@ -608,28 +632,94 @@ static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ } +BFQ_BFQQ_FNS(just_created); BFQ_BFQQ_FNS(busy); BFQ_BFQQ_FNS(wait_request); +BFQ_BFQQ_FNS(non_blocking_wait_rq); BFQ_BFQQ_FNS(must_alloc); BFQ_BFQQ_FNS(fifo_expire); BFQ_BFQQ_FNS(idle_window); BFQ_BFQQ_FNS(sync); -BFQ_BFQQ_FNS(budget_new); BFQ_BFQQ_FNS(IO_bound); BFQ_BFQQ_FNS(in_large_burst); -BFQ_BFQQ_FNS(constantly_seeky); BFQ_BFQQ_FNS(coop); BFQ_BFQQ_FNS(split_coop); -BFQ_BFQQ_FNS(just_split); BFQ_BFQQ_FNS(softrt_update); #undef BFQ_BFQQ_FNS /* Logging facilities. */ -#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ - blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args) +#ifdef CONFIG_BFQ_REDIRECT_TO_CONSOLE +#ifdef CONFIG_BFQ_GROUP_IOSCHED +static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); +static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); + +#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ + char __pbuf[128]; \ + \ + assert_spin_locked((bfqd)->queue->queue_lock); \ + blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ + pr_crit("bfq%d%c %s " fmt "\n", \ + (bfqq)->pid, \ + bfq_bfqq_sync((bfqq)) ? 
'S' : 'A', \ + __pbuf, ##args); \ +} while (0) + +#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ + char __pbuf[128]; \ + \ + blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ + pr_crit("%s " fmt "\n", __pbuf, ##args); \ +} while (0) + +#else /* CONFIG_BFQ_GROUP_IOSCHED */ + +#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ + pr_crit("bfq%d%c " fmt "\n", (bfqq)->pid, \ + bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ + ##args) +#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) + +#endif /* CONFIG_BFQ_GROUP_IOSCHED */ + +#define bfq_log(bfqd, fmt, args...) \ + pr_crit("bfq " fmt "\n", ##args) + +#else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ +#ifdef CONFIG_BFQ_GROUP_IOSCHED +static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); +static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); + +#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ + char __pbuf[128]; \ + \ + assert_spin_locked((bfqd)->queue->queue_lock); \ + blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ + blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, \ + (bfqq)->pid, \ + bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ + __pbuf, ##args); \ +} while (0) + +#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ + char __pbuf[128]; \ + \ + blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ + blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \ +} while (0) + +#else /* CONFIG_BFQ_GROUP_IOSCHED */ + +#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ + blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \ + bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ + ##args) +#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) + +#endif /* CONFIG_BFQ_GROUP_IOSCHED */ #define bfq_log(bfqd, fmt, args...) \ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) +#endif /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ /* Expiration reasons. */ enum bfqq_expiration { @@ -640,15 +730,12 @@ enum bfqq_expiration { BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ + BFQ_BFQQ_PREEMPTED /* preemption in progress */ }; -#ifdef CONFIG_BFQ_GROUP_IOSCHED struct bfqg_stats { - /* total bytes transferred */ - struct blkg_rwstat service_bytes; - /* total IOs serviced, post merge */ - struct blkg_rwstat serviced; +#ifdef CONFIG_BFQ_GROUP_IOSCHED /* number of ios merged */ struct blkg_rwstat merged; /* total time spent on device in ns, may not be accurate w/ queueing */ @@ -657,12 +744,8 @@ struct bfqg_stats { struct blkg_rwstat wait_time; /* number of IOs queued up */ struct blkg_rwstat queued; - /* total sectors transferred */ - struct blkg_stat sectors; /* total disk time and nr sectors dispatched by this group */ struct blkg_stat time; - /* time not charged to this cgroup */ - struct blkg_stat unaccounted_time; /* sum of number of ios queued across all samples */ struct blkg_stat avg_queue_size_sum; /* count of samples taken for average */ @@ -680,8 +763,10 @@ struct bfqg_stats { uint64_t start_idle_time; uint64_t start_empty_time; uint16_t flags; +#endif }; +#ifdef CONFIG_BFQ_GROUP_IOSCHED /* * struct bfq_group_data - per-blkcg storage for the blkio subsystem. * @@ -692,7 +777,7 @@ struct bfq_group_data { /* must be the first member */ struct blkcg_policy_data pd; - unsigned short weight; + unsigned int weight; }; /** @@ -712,7 +797,7 @@ struct bfq_group_data { * unused for the root group. 
Used to know whether there * are groups with more than one active @bfq_entity * (see the comments to the function - * bfq_bfqq_must_not_expire()). + * bfq_bfqq_may_idle()). * @rq_pos_tree: rbtree sorted by next_request position, used when * determining if two or more queues have interleaving * requests (see bfq_find_close_cooperator()). @@ -745,7 +830,6 @@ struct bfq_group { struct rb_root rq_pos_tree; struct bfqg_stats stats; - struct bfqg_stats dead_stats; /* stats pushed from dead children */ }; #else @@ -761,17 +845,38 @@ struct bfq_group { static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); +static unsigned int bfq_class_idx(struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + return bfqq ? bfqq->ioprio_class - 1 : + BFQ_DEFAULT_GRP_CLASS - 1; +} + static struct bfq_service_tree * bfq_entity_service_tree(struct bfq_entity *entity) { struct bfq_sched_data *sched_data = entity->sched_data; struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - unsigned int idx = bfqq ? bfqq->ioprio_class - 1 : - BFQ_DEFAULT_GRP_CLASS; + unsigned int idx = bfq_class_idx(entity); BUG_ON(idx >= BFQ_IOPRIO_CLASSES); BUG_ON(sched_data == NULL); + if (bfqq) + bfq_log_bfqq(bfqq->bfqd, bfqq, + "entity_service_tree %p %d", + sched_data->service_tree + idx, idx); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + else { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, + "entity_service_tree %p %d", + sched_data->service_tree + idx, idx); + } +#endif return sched_data->service_tree + idx; } @@ -791,47 +896,6 @@ static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) return bic->icq.q->elevator->elevator_data; } -/** - * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer. - * @ptr: a pointer to a bfqd. - * @flags: storage for the flags to be saved. - * - * This function allows bfqg->bfqd to be protected by the - * queue lock of the bfqd they reference; the pointer is dereferenced - * under RCU, so the storage for bfqd is assured to be safe as long - * as the RCU read side critical section does not end. After the - * bfqd->queue->queue_lock is taken the pointer is rechecked, to be - * sure that no other writer accessed it. If we raced with a writer, - * the function returns NULL, with the queue unlocked, otherwise it - * returns the dereferenced pointer, with the queue locked. 
- */ -static struct bfq_data *bfq_get_bfqd_locked(void **ptr, unsigned long *flags) -{ - struct bfq_data *bfqd; - - rcu_read_lock(); - bfqd = rcu_dereference(*(struct bfq_data **)ptr); - - if (bfqd != NULL) { - spin_lock_irqsave(bfqd->queue->queue_lock, *flags); - if (ptr == NULL) - printk(KERN_CRIT "get_bfqd_locked pointer NULL\n"); - else if (*ptr == bfqd) - goto out; - spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); - } - - bfqd = NULL; -out: - rcu_read_unlock(); - return bfqd; -} - -static void bfq_put_bfqd_unlock(struct bfq_data *bfqd, unsigned long *flags) -{ - spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); -} - #ifdef CONFIG_BFQ_GROUP_IOSCHED static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) @@ -857,11 +921,13 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); static void bfq_put_queue(struct bfq_queue *bfqq); static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, - struct bio *bio, int is_sync, - struct bfq_io_cq *bic, gfp_t gfp_mask); + struct bio *bio, bool is_sync, + struct bfq_io_cq *bic); static void bfq_end_wr_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); +#ifdef CONFIG_BFQ_GROUP_IOSCHED static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); +#endif static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); #endif /* _BFQ_H */ -- 2.10.0
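Note on the seek_history field that replaces seek_samples/seek_total/seek_mean in struct bfq_queue: seekiness is now tracked as a 32-bit shift register with one bit per recent request. The following is a minimal user-space sketch of that idea, not the in-kernel code: the 8-sector distance threshold and the "more than 1/8 of the samples are seeky" rule are illustrative assumptions; the actual update and classification logic lives in block/bfq-iosched.c.

/*
 * Sketch only: a per-queue seek_history shift register.  The
 * threshold and classification rule below are assumptions made
 * for illustration, not BFQ's tuned values.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;

struct queue_state {
	uint32_t seek_history;		/* a 1 for each seeky request seen */
	sector_t last_request_pos;	/* position of the last request */
};

#define SEEK_THRESHOLD ((sector_t)8)	/* illustrative, in sectors */

static void record_request(struct queue_state *q, sector_t pos)
{
	sector_t dist = pos > q->last_request_pos ?
		pos - q->last_request_pos : q->last_request_pos - pos;

	/* shift in one bit: 1 if this request was far from the last one */
	q->seek_history <<= 1;
	q->seek_history |= dist > SEEK_THRESHOLD;
	q->last_request_pos = pos;
}

static bool queue_is_seeky(const struct queue_state *q)
{
	/* seeky if more than 1/8 of the last 32 requests were seeky */
	return __builtin_popcount(q->seek_history) > 32 / 8;
}

int main(void)
{
	struct queue_state q = { 0, 0 };
	sector_t pos = 0;
	int i;

	for (i = 0; i < 32; i++) {
		pos += (i % 4 == 0) ? 1024 : 8;	/* mixed I/O pattern */
		record_request(&q, pos);
	}
	printf("seeky: %s\n", queue_is_seeky(&q) ? "yes" : "no");
	return 0;
}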
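Note on idle_slice_timer changing from a struct timer_list to a struct hrtimer in struct bfq_data: idling for the next request is now armed with nanosecond resolution instead of jiffies. Below is a minimal, self-contained module sketch of the hrtimer pattern involved; it is not BFQ code, and the 8 ms value is only an illustrative stand-in for a slice_idle-like interval.

#include <linux/module.h>
#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer idle_timer;

/* fires once when the (illustrative) idle slice expires */
static enum hrtimer_restart idle_timer_fn(struct hrtimer *timer)
{
	pr_info("idle slice expired\n");
	return HRTIMER_NORESTART;
}

static int __init idle_timer_demo_init(void)
{
	hrtimer_init(&idle_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	idle_timer.function = idle_timer_fn;
	/* arm for 8 ms; stand-in for a slice_idle-like value */
	hrtimer_start(&idle_timer, ms_to_ktime(8), HRTIMER_MODE_REL);
	return 0;
}

static void __exit idle_timer_demo_exit(void)
{
	hrtimer_cancel(&idle_timer);
}

module_init(idle_timer_demo_init);
module_exit(idle_timer_demo_exit);
MODULE_LICENSE("GPL");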
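Note on the new observation-interval fields in struct bfq_data (first_dispatch, last_dispatch, tot_sectors_dispatched, delta_from_first, peak_rate): the peak rate is estimated from the sectors dispatched over the time elapsed within an observation interval. The sketch below only shows that basic arithmetic in user space, assuming 512-byte sectors; the weighting and filtering that the in-kernel update applies are omitted.

#include <stdint.h>
#include <stdio.h>

struct observation {
	uint64_t first_dispatch;	/* ns, time of first dispatch */
	uint64_t last_dispatch;		/* ns, time of last dispatch */
	uint64_t tot_sectors;		/* sectors dispatched so far */
};

static void account_dispatch(struct observation *o, uint64_t now_ns,
			     uint32_t nr_sectors)
{
	if (o->tot_sectors == 0)
		o->first_dispatch = now_ns;
	o->last_dispatch = now_ns;
	o->tot_sectors += nr_sectors;
}

/* plain rate estimate in MB/s (512-byte sectors), no filtering */
static double observed_rate_mbps(const struct observation *o)
{
	double delta_s = (o->last_dispatch - o->first_dispatch) / 1e9;

	if (delta_s <= 0)
		return 0.0;
	return o->tot_sectors * 512.0 / (1024 * 1024) / delta_s;
}

int main(void)
{
	struct observation o = { 0, 0, 0 };
	uint64_t t = 0;
	int i;

	/* 1000 dispatches of 256 KiB (512 sectors), one every 2 ms */
	for (i = 0; i < 1000; i++) {
		account_dispatch(&o, t, 512);
		t += 2 * 1000 * 1000;	/* 2 ms in ns */
	}
	printf("observed rate: %.1f MB/s\n", observed_rate_mbps(&o));
	return 0;
}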
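Note on the BFQ_BFQQ_FNS() helpers and the reworked enum bfqq_state_flags: each invocation of the macro generates the accessors for one flag bit of bfq_queue->flags. A self-contained sketch of the pattern follows; only the test helper's body appears in the hunk above, so the mark/clear bodies here are an assumption (plain set/clear of the corresponding bit), and the flag list is trimmed to a few entries for brevity.

#include <stdio.h>

enum bfqq_state_flags {
	BFQ_BFQQ_FLAG_busy = 0,		/* has requests or is in service */
	BFQ_BFQQ_FLAG_wait_request,	/* waiting for a request */
	BFQ_BFQQ_FLAG_sync,		/* synchronous queue */
};

struct bfq_queue {
	unsigned int flags;		/* status flags */
};

/* one invocation generates mark/clear/test helpers for one flag */
#define BFQ_BFQQ_FNS(name)						\
static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq)		\
{									\
	bfqq->flags |= 1U << BFQ_BFQQ_FLAG_##name;			\
}									\
static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq)		\
{									\
	bfqq->flags &= ~(1U << BFQ_BFQQ_FLAG_##name);			\
}									\
static int bfq_bfqq_##name(const struct bfq_queue *bfqq)		\
{									\
	return (bfqq->flags & (1U << BFQ_BFQQ_FLAG_##name)) != 0;	\
}

BFQ_BFQQ_FNS(busy);
BFQ_BFQQ_FNS(sync);

int main(void)
{
	struct bfq_queue q = { .flags = 0 };

	bfq_mark_bfqq_busy(&q);
	bfq_mark_bfqq_sync(&q);
	printf("busy=%d sync=%d\n", bfq_bfqq_busy(&q), bfq_bfqq_sync(&q));
	bfq_clear_bfqq_busy(&q);
	bfq_clear_bfqq_sync(&q);
	printf("busy=%d sync=%d\n", bfq_bfqq_busy(&q), bfq_bfqq_sync(&q));
	return 0;
}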