Discussion:
[dm-devel] [PATCH v3 4/7] block: delete part_round_stats and switch to less precise counting
Mike Snitzer
2018-12-05 20:24:30 UTC
Permalink
From: Mikulas Patocka <***@redhat.com>

We want to convert to per-cpu in_flight counters.

The function part_round_stats needs the in_flight counter every jiffy. It
would be too costly to sum all the percpu variables every jiffy, so
part_round_stats must be deleted. It is used to calculate two counters -
time_in_queue and io_ticks.

time_in_queue can be calculated without part_round_stats, by adding the
duration of the I/O when the I/O ends (the value is almost as exact as the
previously calculated value, except that time for in-progress I/Os is not
counted).

io_ticks can be approximated by increasing the value when I/O is started
or ended and the jiffies value has changed. If the I/Os take less than a
jiffy, the value is as exact as the previously calculated value. If the
I/Os take more than a jiffy, io_ticks can drift behind the previously
calculated value.

Signed-off-by: Mikulas Patocka <***@redhat.com>
Signed-off-by: Mike Snitzer <***@redhat.com>
---
block/bio.c | 24 +++++++++++++--
block/blk-core.c | 62 +++------------------------------------
block/blk-merge.c | 1 -
block/genhd.c | 3 --
block/partition-generic.c | 3 --
include/linux/genhd.h | 3 +-
6 files changed, 26 insertions(+), 70 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 91e398ba57f1..0c2208a5446d 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1663,6 +1663,22 @@ void bio_check_pages_dirty(struct bio *bio)
}
EXPORT_SYMBOL_GPL(bio_check_pages_dirty);

+void update_io_ticks(struct hd_struct *part, unsigned long now)
+{
+ unsigned long stamp;
+again:
+ stamp = READ_ONCE(part->stamp);
+ if (unlikely(stamp != now)) {
+ if (likely(cmpxchg(&part->stamp, stamp, now) == stamp)) {
+ __part_stat_add(part, io_ticks, 1);
+ }
+ }
+ if (part->partno) {
+ part = &part_to_disk(part)->part0;
+ goto again;
+ }
+}
+
void generic_start_io_acct(struct request_queue *q, int op,
unsigned long sectors, struct hd_struct *part)
{
@@ -1670,7 +1686,7 @@ void generic_start_io_acct(struct request_queue *q, int op,

part_stat_lock();

- part_round_stats(q, part);
+ update_io_ticks(part, jiffies);
part_stat_inc(part, ios[sgrp]);
part_stat_add(part, sectors[sgrp], sectors);
part_inc_in_flight(q, part, op_is_write(op));
@@ -1682,13 +1698,15 @@ EXPORT_SYMBOL(generic_start_io_acct);
void generic_end_io_acct(struct request_queue *q, int req_op,
struct hd_struct *part, unsigned long start_time)
{
- unsigned long duration = jiffies - start_time;
+ unsigned long now = jiffies;
+ unsigned long duration = now - start_time;
const int sgrp = op_stat_group(req_op);

part_stat_lock();

+ update_io_ticks(part, now);
part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration));
- part_round_stats(q, part);
+ part_stat_add(part, time_in_queue, duration);
part_dec_in_flight(q, part, op_is_write(req_op));

part_stat_unlock();
diff --git a/block/blk-core.c b/block/blk-core.c
index 734b768c9d9d..268d2b8e9843 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -584,62 +584,6 @@ struct request *blk_get_request(struct request_queue *q, unsigned int op,
}
EXPORT_SYMBOL(blk_get_request);

-static void part_round_stats_single(struct request_queue *q,
- struct hd_struct *part, unsigned long now,
- unsigned int inflight)
-{
- if (inflight) {
- __part_stat_add(part, time_in_queue,
- inflight * (now - part->stamp));
- __part_stat_add(part, io_ticks, (now - part->stamp));
- }
- part->stamp = now;
-}
-
-/**
- * part_round_stats() - Round off the performance stats on a struct disk_stats.
- * @q: target block queue
- * @part: target partition
- *
- * The average IO queue length and utilisation statistics are maintained
- * by observing the current state of the queue length and the amount of
- * time it has been in this state for.
- *
- * Normally, that accounting is done on IO completion, but that can result
- * in more than a second's worth of IO being accounted for within any one
- * second, leading to >100% utilisation. To deal with that, we call this
- * function to do a round-off before returning the results when reading
- * /proc/diskstats. This accounts immediately for all queue usage up to
- * the current jiffies and restarts the counters again.
- */
-void part_round_stats(struct request_queue *q, struct hd_struct *part)
-{
- struct hd_struct *part2 = NULL;
- unsigned long now = jiffies;
- unsigned int inflight[2];
- int stats = 0;
-
- if (part->stamp != now)
- stats |= 1;
-
- if (part->partno) {
- part2 = &part_to_disk(part)->part0;
- if (part2->stamp != now)
- stats |= 2;
- }
-
- if (!stats)
- return;
-
- part_in_flight(q, part, inflight);
-
- if (stats & 2)
- part_round_stats_single(q, part2, now, inflight[1]);
- if (stats & 1)
- part_round_stats_single(q, part, now, inflight[0]);
-}
-EXPORT_SYMBOL_GPL(part_round_stats);
-
void blk_put_request(struct request *req)
{
blk_mq_free_request(req);
@@ -1383,9 +1327,10 @@ void blk_account_io_done(struct request *req, u64 now)
part_stat_lock();
part = req->part;

+ update_io_ticks(part, jiffies);
part_stat_inc(part, ios[sgrp]);
part_stat_add(part, nsecs[sgrp], now - req->start_time_ns);
- part_round_stats(req->q, part);
+ part_stat_add(part, time_in_queue, nsecs_to_jiffies64(now - req->start_time_ns));
part_dec_in_flight(req->q, part, rq_data_dir(req));

hd_struct_put(part);
@@ -1420,11 +1365,12 @@ void blk_account_io_start(struct request *rq, bool new_io)
part = &rq->rq_disk->part0;
hd_struct_get(part);
}
- part_round_stats(rq->q, part);
part_inc_in_flight(rq->q, part, rw);
rq->part = part;
}

+ update_io_ticks(part, jiffies);
+
part_stat_unlock();
}

diff --git a/block/blk-merge.c b/block/blk-merge.c
index a120d59b9705..9da5629d0887 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -689,7 +689,6 @@ static void blk_account_io_merge(struct request *req)
part_stat_lock();
part = req->part;

- part_round_stats(req->q, part);
part_dec_in_flight(req->q, part, rq_data_dir(req));

hd_struct_put(part);
diff --git a/block/genhd.c b/block/genhd.c
index 2fe00cf32b93..cdf174d7d329 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1337,9 +1337,6 @@ static int diskstats_show(struct seq_file *seqf, void *v)

disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0);
while ((hd = disk_part_iter_next(&piter))) {
- part_stat_lock();
- part_round_stats(gp->queue, hd);
- part_stat_unlock();
part_in_flight(gp->queue, hd, inflight);
seq_printf(seqf, "%4d %7d %s "
"%lu %lu %lu %u "
diff --git a/block/partition-generic.c b/block/partition-generic.c
index 7e663cfb1487..42d6138ac876 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -122,9 +122,6 @@ ssize_t part_stat_show(struct device *dev,
struct request_queue *q = part_to_disk(p)->queue;
unsigned int inflight[2];

- part_stat_lock();
- part_round_stats(q, p);
- part_stat_unlock();
part_in_flight(q, p, inflight);
return sprintf(buf,
"%8lu %8lu %8llu %8u "
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 1677cd2a4c4e..838c2a7a40c5 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -398,8 +398,7 @@ static inline void free_part_info(struct hd_struct *part)
kfree(part->info);
}

-/* block/blk-core.c */
-extern void part_round_stats(struct request_queue *q, struct hd_struct *part);
+void update_io_ticks(struct hd_struct *part, unsigned long now);

/* block/genhd.c */
extern void device_add_disk(struct device *parent, struct gendisk *disk,
--
2.18.0
Mike Snitzer
2018-12-05 20:24:28 UTC
Permalink
Now that request-based dm-multipath only supports blk-mq, make use of
the newly introduced blk_mq_queue_busy() to check for outstanding IO --
rather than (ab)using the block core's in_flight counters.

Signed-off-by: Mike Snitzer <***@redhat.com>
---
drivers/md/dm-rq.c | 9 ++++-----
1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 1f1fe9a618ea..d2397d8fcbd1 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -130,11 +130,11 @@ static void rq_end_stats(struct mapped_device *md, struct request *orig)
*/
static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
{
- atomic_dec(&md->pending[rw]);
-
/* nudge anyone waiting on suspend queue */
- if (!md_in_flight(md))
- wake_up(&md->wait);
+ if (unlikely(waitqueue_active(&md->wait))) {
+ if (!blk_mq_queue_busy(md->queue))
+ wake_up(&md->wait);
+ }

/*
* dm_put() must be at the end of this function. See the comment above
@@ -436,7 +436,6 @@ ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md,
static void dm_start_request(struct mapped_device *md, struct request *orig)
{
blk_mq_start_request(orig);
- atomic_inc(&md->pending[rq_data_dir(orig)]);

if (unlikely(dm_stats_used(&md->stats))) {
struct dm_rq_target_io *tio = tio_from_request(orig);
--
2.18.0
Mike Snitzer
2018-12-05 20:24:29 UTC
Permalink
All of part_stat_* and related methods are used with preempt disabled,
so there is no need to pass cpu around to all of them. Just call
smp_processor_id() as needed.

Suggested-by: Jens Axboe <***@kernel.dk>
Signed-off-by: Mike Snitzer <***@redhat.com>
---
block/bio.c | 16 +++++++++-------
block/blk-core.c | 34 +++++++++++++++-------------------
block/blk-merge.c | 5 ++---
block/genhd.c | 5 ++---
block/partition-generic.c | 5 ++---
drivers/md/md.c | 7 +++----
include/linux/genhd.h | 26 +++++++++++++-------------
7 files changed, 46 insertions(+), 52 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 03895cc0d74a..91e398ba57f1 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1667,11 +1667,12 @@ void generic_start_io_acct(struct request_queue *q, int op,
unsigned long sectors, struct hd_struct *part)
{
const int sgrp = op_stat_group(op);
- int cpu = part_stat_lock();

- part_round_stats(q, cpu, part);
- part_stat_inc(cpu, part, ios[sgrp]);
- part_stat_add(cpu, part, sectors[sgrp], sectors);
+ part_stat_lock();
+
+ part_round_stats(q, part);
+ part_stat_inc(part, ios[sgrp]);
+ part_stat_add(part, sectors[sgrp], sectors);
part_inc_in_flight(q, part, op_is_write(op));

part_stat_unlock();
@@ -1683,10 +1684,11 @@ void generic_end_io_acct(struct request_queue *q, int req_op,
{
unsigned long duration = jiffies - start_time;
const int sgrp = op_stat_group(req_op);
- int cpu = part_stat_lock();

- part_stat_add(cpu, part, nsecs[sgrp], jiffies_to_nsecs(duration));
- part_round_stats(q, cpu, part);
+ part_stat_lock();
+
+ part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration));
+ part_round_stats(q, part);
part_dec_in_flight(q, part, op_is_write(req_op));

part_stat_unlock();
diff --git a/block/blk-core.c b/block/blk-core.c
index ad59102ee30a..734b768c9d9d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -584,14 +584,14 @@ struct request *blk_get_request(struct request_queue *q, unsigned int op,
}
EXPORT_SYMBOL(blk_get_request);

-static void part_round_stats_single(struct request_queue *q, int cpu,
+static void part_round_stats_single(struct request_queue *q,
struct hd_struct *part, unsigned long now,
unsigned int inflight)
{
if (inflight) {
- __part_stat_add(cpu, part, time_in_queue,
+ __part_stat_add(part, time_in_queue,
inflight * (now - part->stamp));
- __part_stat_add(cpu, part, io_ticks, (now - part->stamp));
+ __part_stat_add(part, io_ticks, (now - part->stamp));
}
part->stamp = now;
}
@@ -599,7 +599,6 @@ static void part_round_stats_single(struct request_queue *q, int cpu,
/**
* part_round_stats() - Round off the performance stats on a struct disk_stats.
* @q: target block queue
- * @cpu: cpu number for stats access
* @part: target partition
*
* The average IO queue length and utilisation statistics are maintained
@@ -613,7 +612,7 @@ static void part_round_stats_single(struct request_queue *q, int cpu,
* /proc/diskstats. This accounts immediately for all queue usage up to
* the current jiffies and restarts the counters again.
*/
-void part_round_stats(struct request_queue *q, int cpu, struct hd_struct *part)
+void part_round_stats(struct request_queue *q, struct hd_struct *part)
{
struct hd_struct *part2 = NULL;
unsigned long now = jiffies;
@@ -635,9 +634,9 @@ void part_round_stats(struct request_queue *q, int cpu, struct hd_struct *part)
part_in_flight(q, part, inflight);

if (stats & 2)
- part_round_stats_single(q, cpu, part2, now, inflight[1]);
+ part_round_stats_single(q, part2, now, inflight[1]);
if (stats & 1)
- part_round_stats_single(q, cpu, part, now, inflight[0]);
+ part_round_stats_single(q, part, now, inflight[0]);
}
EXPORT_SYMBOL_GPL(part_round_stats);

@@ -1362,11 +1361,10 @@ void blk_account_io_completion(struct request *req, unsigned int bytes)
if (blk_do_io_stat(req)) {
const int sgrp = op_stat_group(req_op(req));
struct hd_struct *part;
- int cpu;

- cpu = part_stat_lock();
+ part_stat_lock();
part = req->part;
- part_stat_add(cpu, part, sectors[sgrp], bytes >> 9);
+ part_stat_add(part, sectors[sgrp], bytes >> 9);
part_stat_unlock();
}
}
@@ -1381,14 +1379,13 @@ void blk_account_io_done(struct request *req, u64 now)
if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) {
const int sgrp = op_stat_group(req_op(req));
struct hd_struct *part;
- int cpu;

- cpu = part_stat_lock();
+ part_stat_lock();
part = req->part;

- part_stat_inc(cpu, part, ios[sgrp]);
- part_stat_add(cpu, part, nsecs[sgrp], now - req->start_time_ns);
- part_round_stats(req->q, cpu, part);
+ part_stat_inc(part, ios[sgrp]);
+ part_stat_add(part, nsecs[sgrp], now - req->start_time_ns);
+ part_round_stats(req->q, part);
part_dec_in_flight(req->q, part, rq_data_dir(req));

hd_struct_put(part);
@@ -1400,16 +1397,15 @@ void blk_account_io_start(struct request *rq, bool new_io)
{
struct hd_struct *part;
int rw = rq_data_dir(rq);
- int cpu;

if (!blk_do_io_stat(rq))
return;

- cpu = part_stat_lock();
+ part_stat_lock();

if (!new_io) {
part = rq->part;
- part_stat_inc(cpu, part, merges[rw]);
+ part_stat_inc(part, merges[rw]);
} else {
part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
if (!hd_struct_try_get(part)) {
@@ -1424,7 +1420,7 @@ void blk_account_io_start(struct request *rq, bool new_io)
part = &rq->rq_disk->part0;
hd_struct_get(part);
}
- part_round_stats(rq->q, cpu, part);
+ part_round_stats(rq->q, part);
part_inc_in_flight(rq->q, part, rw);
rq->part = part;
}
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 4431da69a5cf..a120d59b9705 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -685,12 +685,11 @@ static void blk_account_io_merge(struct request *req)
{
if (blk_do_io_stat(req)) {
struct hd_struct *part;
- int cpu;

- cpu = part_stat_lock();
+ part_stat_lock();
part = req->part;

- part_round_stats(req->q, cpu, part);
+ part_round_stats(req->q, part);
part_dec_in_flight(req->q, part, rq_data_dir(req));

hd_struct_put(part);
diff --git a/block/genhd.c b/block/genhd.c
index 0145bcb0cc76..2fe00cf32b93 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1326,7 +1326,6 @@ static int diskstats_show(struct seq_file *seqf, void *v)
struct hd_struct *hd;
char buf[BDEVNAME_SIZE];
unsigned int inflight[2];
- int cpu;

/*
if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next)
@@ -1338,8 +1337,8 @@ static int diskstats_show(struct seq_file *seqf, void *v)

disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0);
while ((hd = disk_part_iter_next(&piter))) {
- cpu = part_stat_lock();
- part_round_stats(gp->queue, cpu, hd);
+ part_stat_lock();
+ part_round_stats(gp->queue, hd);
part_stat_unlock();
part_in_flight(gp->queue, hd, inflight);
seq_printf(seqf, "%4d %7d %s "
diff --git a/block/partition-generic.c b/block/partition-generic.c
index 5f8db5c5140f..7e663cfb1487 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -121,10 +121,9 @@ ssize_t part_stat_show(struct device *dev,
struct hd_struct *p = dev_to_part(dev);
struct request_queue *q = part_to_disk(p)->queue;
unsigned int inflight[2];
- int cpu;

- cpu = part_stat_lock();
- part_round_stats(q, cpu, p);
+ part_stat_lock();
+ part_round_stats(q, p);
part_stat_unlock();
part_in_flight(q, p, inflight);
return sprintf(buf,
diff --git a/drivers/md/md.c b/drivers/md/md.c
index fc488cb30a94..9a0a1e0934d5 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -334,7 +334,6 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
const int sgrp = op_stat_group(bio_op(bio));
struct mddev *mddev = q->queuedata;
unsigned int sectors;
- int cpu;

blk_queue_split(q, &bio);

@@ -359,9 +358,9 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)

md_handle_request(mddev, bio);

- cpu = part_stat_lock();
- part_stat_inc(cpu, &mddev->gendisk->part0, ios[sgrp]);
- part_stat_add(cpu, &mddev->gendisk->part0, sectors[sgrp], sectors);
+ part_stat_lock();
+ part_stat_inc(&mddev->gendisk->part0, ios[sgrp]);
+ part_stat_add(&mddev->gendisk->part0, sectors[sgrp], sectors);
part_stat_unlock();

return BLK_QC_T_NONE;
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 0c5ee17b4d88..1677cd2a4c4e 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -295,8 +295,8 @@ extern struct hd_struct *disk_map_sector_rcu(struct gendisk *disk,
#define part_stat_lock() ({ rcu_read_lock(); get_cpu(); })
#define part_stat_unlock() do { put_cpu(); rcu_read_unlock(); } while (0)

-#define __part_stat_add(cpu, part, field, addnd) \
- (per_cpu_ptr((part)->dkstats, (cpu))->field += (addnd))
+#define __part_stat_add(part, field, addnd) \
+ (per_cpu_ptr((part)->dkstats, smp_processor_id())->field += (addnd))

#define part_stat_read(part, field) \
({ \
@@ -333,7 +333,7 @@ static inline void free_part_stats(struct hd_struct *part)
#define part_stat_lock() ({ rcu_read_lock(); 0; })
#define part_stat_unlock() rcu_read_unlock()

-#define __part_stat_add(cpu, part, field, addnd) \
+#define __part_stat_add(part, field, addnd) \
((part)->dkstats.field += addnd)

#define part_stat_read(part, field) ((part)->dkstats.field)
@@ -362,19 +362,19 @@ static inline void free_part_stats(struct hd_struct *part)
part_stat_read(part, field[STAT_WRITE]) + \
part_stat_read(part, field[STAT_DISCARD]))

-#define part_stat_add(cpu, part, field, addnd) do { \
- __part_stat_add((cpu), (part), field, addnd); \
+#define part_stat_add(part, field, addnd) do { \
+ __part_stat_add((part), field, addnd); \
if ((part)->partno) \
- __part_stat_add((cpu), &part_to_disk((part))->part0, \
+ __part_stat_add(&part_to_disk((part))->part0, \
field, addnd); \
} while (0)

-#define part_stat_dec(cpu, gendiskp, field) \
- part_stat_add(cpu, gendiskp, field, -1)
-#define part_stat_inc(cpu, gendiskp, field) \
- part_stat_add(cpu, gendiskp, field, 1)
-#define part_stat_sub(cpu, gendiskp, field, subnd) \
- part_stat_add(cpu, gendiskp, field, -subnd)
+#define part_stat_dec(gendiskp, field) \
+ part_stat_add(gendiskp, field, -1)
+#define part_stat_inc(gendiskp, field) \
+ part_stat_add(gendiskp, field, 1)
+#define part_stat_sub(gendiskp, field, subnd) \
+ part_stat_add(gendiskp, field, -subnd)

void part_in_flight(struct request_queue *q, struct hd_struct *part,
unsigned int inflight[2]);
@@ -399,7 +399,7 @@ static inline void free_part_info(struct hd_struct *part)
}

/* block/blk-core.c */
-extern void part_round_stats(struct request_queue *q, int cpu, struct hd_struct *part);
+extern void part_round_stats(struct request_queue *q, struct hd_struct *part);

/* block/genhd.c */
extern void device_add_disk(struct device *parent, struct gendisk *disk,
--
2.18.0
Mike Snitzer
2018-12-05 20:24:27 UTC
Permalink
From: Mikulas Patocka <***@redhat.com>

generic_start_io_acct and generic_end_io_acct already update the variable
in_flight using atomic operations, so we don't have to overwrite it
again.

Signed-off-by: Mikulas Patocka <***@redhat.com>
Signed-off-by: Mike Snitzer <***@redhat.com>
---
drivers/md/dm.c | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index a733e4c920af..a8ae7931bce7 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -663,8 +663,7 @@ static void start_io_acct(struct dm_io *io)
generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio),
&dm_disk(md)->part0);

- atomic_set(&dm_disk(md)->part0.in_flight[rw],
- atomic_inc_return(&md->pending[rw]));
+ atomic_inc(&md->pending[rw]);

if (unlikely(dm_stats_used(&md->stats)))
dm_stats_account_io(&md->stats, bio_data_dir(bio),
@@ -693,7 +692,6 @@ static void end_io_acct(struct dm_io *io)
* a flush.
*/
pending = atomic_dec_return(&md->pending[rw]);
- atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
pending += atomic_read(&md->pending[rw^0x1]);

/* nudge anyone waiting on suspend queue */
--
2.18.0
Mike Snitzer
2018-12-05 20:24:31 UTC
Permalink
From: Mikulas Patocka <***@redhat.com>

Now that part_round_stats is gone, we can switch to per-cpu in-flight
counters.

We use the local-atomic type local_t, so that if part_inc_in_flight or
part_dec_in_flight is reentrantly called from an interrupt, the value will
be correct.

The other counters could be corrupted due to a reentrant interrupt, but the
corruption only results in slight counter skew. The in_flight counter
must be exact, so it needs local_t.

Signed-off-by: Mikulas Patocka <***@redhat.com>
Signed-off-by: Mike Snitzer <***@redhat.com>
---
block/genhd.c | 49 ++++++++++++++++++++++++++++++++++---------
include/linux/genhd.h | 3 ++-
2 files changed, 41 insertions(+), 11 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index cdf174d7d329..ffb9d416db64 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -47,51 +47,80 @@ static void disk_release_events(struct gendisk *disk);

void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, int rw)
{
+ int cpu;
+
if (queue_is_mq(q))
return;

- atomic_inc(&part->in_flight[rw]);
+ cpu = smp_processor_id();
+ local_inc(&per_cpu_ptr(part->dkstats, cpu)->in_flight[rw]);
if (part->partno)
- atomic_inc(&part_to_disk(part)->part0.in_flight[rw]);
+ local_inc(&per_cpu_ptr(part_to_disk(part)->part0.dkstats, cpu)->in_flight[rw]);
}

void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, int rw)
{
+ int cpu;
+
if (queue_is_mq(q))
return;

- atomic_dec(&part->in_flight[rw]);
+ cpu = smp_processor_id();
+ local_dec(&per_cpu_ptr(part->dkstats, cpu)->in_flight[rw]);
if (part->partno)
- atomic_dec(&part_to_disk(part)->part0.in_flight[rw]);
+ local_dec(&per_cpu_ptr(part_to_disk(part)->part0.dkstats, cpu)->in_flight[rw]);
}

void part_in_flight(struct request_queue *q, struct hd_struct *part,
unsigned int inflight[2])
{
+ int cpu;
+
if (queue_is_mq(q)) {
blk_mq_in_flight(q, part, inflight);
return;
}

- inflight[0] = atomic_read(&part->in_flight[0]) +
- atomic_read(&part->in_flight[1]);
+ inflight[0] = 0;
+ for_each_possible_cpu(cpu) {
+ inflight[0] += local_read(&per_cpu_ptr(part->dkstats, cpu)->in_flight[0]) +
+ local_read(&per_cpu_ptr(part->dkstats, cpu)->in_flight[1]);
+ }
+ if ((int)inflight[0] < 0)
+ inflight[0] = 0;
+
if (part->partno) {
part = &part_to_disk(part)->part0;
- inflight[1] = atomic_read(&part->in_flight[0]) +
- atomic_read(&part->in_flight[1]);
+ inflight[1] = 0;
+ for_each_possible_cpu(cpu) {
+ inflight[1] += local_read(&per_cpu_ptr(part->dkstats, cpu)->in_flight[0]) +
+ local_read(&per_cpu_ptr(part->dkstats, cpu)->in_flight[1]);
+ }
+ if ((int)inflight[1] < 0)
+ inflight[1] = 0;
}
}

void part_in_flight_rw(struct request_queue *q, struct hd_struct *part,
unsigned int inflight[2])
{
+ int cpu;
+
if (queue_is_mq(q)) {
blk_mq_in_flight_rw(q, part, inflight);
return;
}

- inflight[0] = atomic_read(&part->in_flight[0]);
- inflight[1] = atomic_read(&part->in_flight[1]);
+ inflight[0] = 0;
+ inflight[1] = 0;
+ for_each_possible_cpu(cpu) {
+ inflight[0] += local_read(&per_cpu_ptr(part->dkstats, cpu)->in_flight[0]);
+ inflight[1] += local_read(&per_cpu_ptr(part->dkstats, cpu)->in_flight[1]);
+ }
+ if ((int)inflight[0] < 0)
+ inflight[0] = 0;
+ if ((int)inflight[1] < 0)
+ inflight[1] = 0;
}

struct hd_struct *__disk_get_part(struct gendisk *disk, int partno)
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 838c2a7a40c5..9489dcb7cc27 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -17,6 +17,7 @@
#include <linux/percpu-refcount.h>
#include <linux/uuid.h>
#include <linux/blk_types.h>
+#include <asm/local.h>

#ifdef CONFIG_BLOCK

@@ -89,6 +90,7 @@ struct disk_stats {
unsigned long merges[NR_STAT_GROUPS];
unsigned long io_ticks;
unsigned long time_in_queue;
+ local_t in_flight[2];
};

#define PARTITION_META_INFO_VOLNAMELTH 64
@@ -122,7 +124,6 @@ struct hd_struct {
int make_it_fail;
#endif
unsigned long stamp;
- atomic_t in_flight[2];
#ifdef CONFIG_SMP
struct disk_stats __percpu *dkstats;
#else
--
2.18.0
Mike Snitzer
2018-12-05 20:24:32 UTC
Permalink
From: Mikulas Patocka <***@redhat.com>

The previous patches deleted all the code that needed the second value
returned from part_in_flight - now the kernel only uses the first value.

Consequently, part_in_flight (and blk_mq_in_flight) may be changed so that
it only returns one value.

This patch just refactors the code, there's no functional change.

Signed-off-by: Mikulas Patocka <***@redhat.com>
Signed-off-by: Mike Snitzer <***@redhat.com>
---
block/blk-mq.c | 12 +++++-------
block/blk-mq.h | 3 +--
block/genhd.c | 32 +++++++++++---------------------
block/partition-generic.c | 6 +++---
include/linux/genhd.h | 3 +--
5 files changed, 21 insertions(+), 35 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 900550594651..c6d3101352f4 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -100,25 +100,23 @@ static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
struct mq_inflight *mi = priv;

/*
- * index[0] counts the specific partition that was asked for. index[1]
- * counts the ones that are active on the whole device, so increment
- * that if mi->part is indeed a partition, and not a whole device.
+ * index[0] counts the specific partition that was asked for.
*/
if (rq->part == mi->part)
mi->inflight[0]++;
- if (mi->part->partno)
- mi->inflight[1]++;

return true;
}

-void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part,
- unsigned int inflight[2])
+unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part)
{
+ unsigned inflight[2];
struct mq_inflight mi = { .part = part, .inflight = inflight, };

inflight[0] = inflight[1] = 0;
blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
+
+ return inflight[0];
}

static bool blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx,
diff --git a/block/blk-mq.h b/block/blk-mq.h
index a664ea44ffd4..0c9c9ea2fefe 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -187,8 +187,7 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx)
return hctx->nr_ctx && hctx->tags;
}

-void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part,
- unsigned int inflight[2]);
+unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part);
void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
unsigned int inflight[2]);

diff --git a/block/genhd.c b/block/genhd.c
index ffb9d416db64..dd84922192b6 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -71,34 +71,24 @@ void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, int rw)
local_dec(&per_cpu_ptr(part_to_disk(part)->part0.dkstats, cpu)->in_flight[rw]);
}

-void part_in_flight(struct request_queue *q, struct hd_struct *part,
- unsigned int inflight[2])
+unsigned int part_in_flight(struct request_queue *q, struct hd_struct *part)
{
int cpu;
+ int inflight;

if (queue_is_mq(q)) {
- blk_mq_in_flight(q, part, inflight);
- return;
+ return blk_mq_in_flight(q, part);
}

- inflight[0] = 0;
+ inflight = 0;
for_each_possible_cpu(cpu) {
- inflight[0] += local_read(&per_cpu_ptr(part->dkstats, cpu)->in_flight[0]) +
+ inflight += local_read(&per_cpu_ptr(part->dkstats, cpu)->in_flight[0]) +
local_read(&per_cpu_ptr(part->dkstats, cpu)->in_flight[1]);
}
- if ((int)inflight[0] < 0)
- inflight[0] = 0;
+ if (inflight < 0)
+ inflight = 0;

- if (part->partno) {
- part = &part_to_disk(part)->part0;
- inflight[1] = 0;
- for_each_possible_cpu(cpu) {
- inflight[1] += local_read(&per_cpu_ptr(part->dkstats, cpu)->in_flight[0]) +
- local_read(&per_cpu_ptr(part->dkstats, cpu)->in_flight[1]);
- }
- if ((int)inflight[1] < 0)
- inflight[1] = 0;
- }
+ return (unsigned int)inflight;
}

void part_in_flight_rw(struct request_queue *q, struct hd_struct *part,
@@ -1354,7 +1344,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
struct disk_part_iter piter;
struct hd_struct *hd;
char buf[BDEVNAME_SIZE];
- unsigned int inflight[2];
+ unsigned int inflight;

/*
if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next)
@@ -1366,7 +1356,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)

disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0);
while ((hd = disk_part_iter_next(&piter))) {
- part_in_flight(gp->queue, hd, inflight);
+ inflight = part_in_flight(gp->queue, hd);
seq_printf(seqf, "%4d %7d %s "
"%lu %lu %lu %u "
"%lu %lu %lu %u "
@@ -1382,7 +1372,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
part_stat_read(hd, merges[STAT_WRITE]),
part_stat_read(hd, sectors[STAT_WRITE]),
(unsigned int)part_stat_read_msecs(hd, STAT_WRITE),
- inflight[0],
+ inflight,
jiffies_to_msecs(part_stat_read(hd, io_ticks)),
jiffies_to_msecs(part_stat_read(hd, time_in_queue)),
part_stat_read(hd, ios[STAT_DISCARD]),
diff --git a/block/partition-generic.c b/block/partition-generic.c
index 42d6138ac876..8e596a8dff32 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -120,9 +120,9 @@ ssize_t part_stat_show(struct device *dev,
{
struct hd_struct *p = dev_to_part(dev);
struct request_queue *q = part_to_disk(p)->queue;
- unsigned int inflight[2];
+ unsigned int inflight;

- part_in_flight(q, p, inflight);
+ inflight = part_in_flight(q, p);
return sprintf(buf,
"%8lu %8lu %8llu %8u "
"%8lu %8lu %8llu %8u "
@@ -137,7 +137,7 @@ ssize_t part_stat_show(struct device *dev,
part_stat_read(p, merges[STAT_WRITE]),
(unsigned long long)part_stat_read(p, sectors[STAT_WRITE]),
(unsigned int)part_stat_read_msecs(p, STAT_WRITE),
- inflight[0],
+ inflight,
jiffies_to_msecs(part_stat_read(p, io_ticks)),
jiffies_to_msecs(part_stat_read(p, time_in_queue)),
part_stat_read(p, ios[STAT_DISCARD]),
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 9489dcb7cc27..669b7fe400d7 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -377,8 +377,7 @@ static inline void free_part_stats(struct hd_struct *part)
#define part_stat_sub(gendiskp, field, subnd) \
part_stat_add(gendiskp, field, -subnd)

-void part_in_flight(struct request_queue *q, struct hd_struct *part,
- unsigned int inflight[2]);
+unsigned int part_in_flight(struct request_queue *q, struct hd_struct *part);
void part_in_flight_rw(struct request_queue *q, struct hd_struct *part,
unsigned int inflight[2]);
void part_dec_in_flight(struct request_queue *q, struct hd_struct *part,
--
2.18.0
Mike Snitzer
2018-12-05 20:24:33 UTC
Permalink
From: Mikulas Patocka <***@redhat.com>

Remove the "pending" atomic counters, which duplicate block-core's
in_flight counters, and update md_in_flight() to look at percpu
in_flight counters.

Signed-off-by: Mikulas Patocka <***@redhat.com>
Signed-off-by: Mike Snitzer <***@redhat.com>
---
drivers/md/dm-core.h | 2 --
drivers/md/dm.c | 34 +++++++++++++++-------------------
2 files changed, 15 insertions(+), 21 deletions(-)

diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index 224d44503a06..6fe883fac471 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -65,7 +65,6 @@ struct mapped_device {
*/
struct work_struct work;
wait_queue_head_t wait;
- atomic_t pending[2];
spinlock_t deferred_lock;
struct bio_list deferred;

@@ -119,7 +118,6 @@ struct mapped_device {
struct srcu_struct io_barrier;
};

-int md_in_flight(struct mapped_device *md);
void disable_write_same(struct mapped_device *md);
void disable_write_zeroes(struct mapped_device *md);

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index a8ae7931bce7..ff6e5a5902f2 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -646,25 +646,30 @@ static void free_tio(struct dm_target_io *tio)
bio_put(&tio->clone);
}

-int md_in_flight(struct mapped_device *md)
+static bool md_in_flight(struct mapped_device *md)
{
- return atomic_read(&md->pending[READ]) +
- atomic_read(&md->pending[WRITE]);
+ int cpu;
+ struct hd_struct *part = &dm_disk(md)->part0;
+
+ for_each_possible_cpu(cpu) {
+ if (local_read(&per_cpu_ptr(part->dkstats, cpu)->in_flight[0]) ||
+ local_read(&per_cpu_ptr(part->dkstats, cpu)->in_flight[1]))
+ return true;
+ }
+
+ return false;
}

static void start_io_acct(struct dm_io *io)
{
struct mapped_device *md = io->md;
struct bio *bio = io->orig_bio;
- int rw = bio_data_dir(bio);

io->start_time = jiffies;

generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio),
&dm_disk(md)->part0);

- atomic_inc(&md->pending[rw]);
-
if (unlikely(dm_stats_used(&md->stats)))
dm_stats_account_io(&md->stats, bio_data_dir(bio),
bio->bi_iter.bi_sector, bio_sectors(bio),
@@ -676,8 +681,6 @@ static void end_io_acct(struct dm_io *io)
struct mapped_device *md = io->md;
struct bio *bio = io->orig_bio;
unsigned long duration = jiffies - io->start_time;
- int pending;
- int rw = bio_data_dir(bio);

generic_end_io_acct(md->queue, bio_op(bio), &dm_disk(md)->part0,
io->start_time);
@@ -687,16 +690,11 @@ static void end_io_acct(struct dm_io *io)
bio->bi_iter.bi_sector, bio_sectors(bio),
true, duration, &io->stats_aux);

- /*
- * After this is decremented the bio must not be touched if it is
- * a flush.
- */
- pending = atomic_dec_return(&md->pending[rw]);
- pending += atomic_read(&md->pending[rw^0x1]);
-
/* nudge anyone waiting on suspend queue */
- if (!pending)
- wake_up(&md->wait);
+ if (unlikely(waitqueue_active(&md->wait))) {
+ if (!md_in_flight(md))
+ wake_up(&md->wait);
+ }
}

/*
@@ -1904,8 +1902,6 @@ static struct mapped_device *alloc_dev(int minor)
if (!md->disk)
goto bad;

- atomic_set(&md->pending[0], 0);
- atomic_set(&md->pending[1], 0);
init_waitqueue_head(&md->wait);
INIT_WORK(&md->work, dm_wq_work);
init_waitqueue_head(&md->eventq);
--
2.18.0
Jens Axboe
2018-12-06 02:43:54 UTC
Permalink
Hi,
This v3 adds "block: stop passing 'cpu' to all percpu stats methods"
before switching over to percpu counters.
I gave this a shot, with the hope of being able to use it for blk-mq
as well. The tldr is that it's still slower than the tag iteration,
but that isn't a show stopper for a generic implementation. But
unfortunately it fails for queue size results, iostat just shows
0 for the average queue size.
--
Jens Axboe
Loading...