Discussion:
[dm-devel] [PATCH 3/3] block: use a driver-specific handler for the "inflight" value
Mikulas Patocka
2018-11-16 00:04:19 UTC
Permalink
Device mapper was converted to percpu inflight counters. In order to
display the correct values in the "inflight" sysfs file and in
/proc/diskstats, we need a custom callback that sums the percpu counters.

The function part_round_stats calculates the number of in-flight I/Os
every jiffy and uses this to calculate the counters time_in_queue and
io_ticks. In order to avoid excessive memory traffic on systems with high
number of CPUs, this functionality is disabled when percpu inflight values
are used and the values time_in_queue and io_ticks are calculated
differently - the result is less precise.

We add the duration of an I/O to time_in_queue when the I/O finishes (the
value is almost the same as previously, except for the time of in-flight
I/Os).

If an I/O starts or finishes and the "jiffies" value has changed, we add
one to io_ticks. If the I/Os take less than a jiffy, the value is as exact
as the previous value. If the I/Os take more than a jiffy, the value may
lag behind the previous value.

Signed-off-by: Mikulas Patocka <***@redhat.com>

---
block/blk-core.c | 7 ++++++-
block/blk-settings.c | 6 ++++++
block/genhd.c | 12 ++++++++++++
drivers/md/dm.c | 37 +++++++++++++++++++++++++++++++++++--
include/linux/blkdev.h | 3 +++
5 files changed, 62 insertions(+), 3 deletions(-)

Index: linux-dm/block/genhd.c
===================================================================
--- linux-dm.orig/block/genhd.c 2018-11-15 22:11:51.000000000 +0100
+++ linux-dm/block/genhd.c 2018-11-15 22:11:51.000000000 +0100
@@ -68,6 +68,13 @@ void part_dec_in_flight(struct request_q
void part_in_flight(struct request_queue *q, struct hd_struct *part,
unsigned int inflight[2])
{
+ if (q->get_inflight_fn) {
+ q->get_inflight_fn(q, inflight);
+ inflight[0] += inflight[1];
+ inflight[1] = 0;
+ return;
+ }
+
if (q->mq_ops) {
blk_mq_in_flight(q, part, inflight);
return;
@@ -85,6 +92,11 @@ void part_in_flight(struct request_queue
void part_in_flight_rw(struct request_queue *q, struct hd_struct *part,
unsigned int inflight[2])
{
+ if (q->get_inflight_fn) {
+ q->get_inflight_fn(q, inflight);
+ return;
+ }
+
if (q->mq_ops) {
blk_mq_in_flight_rw(q, part, inflight);
return;
Index: linux-dm/include/linux/blkdev.h
===================================================================
--- linux-dm.orig/include/linux/blkdev.h 2018-11-15 22:11:51.000000000 +0100
+++ linux-dm/include/linux/blkdev.h 2018-11-15 22:11:51.000000000 +0100
@@ -286,6 +286,7 @@ struct blk_queue_ctx;

typedef blk_qc_t (make_request_fn) (struct request_queue *q, struct bio *bio);
typedef bool (poll_q_fn) (struct request_queue *q, blk_qc_t);
+typedef void (get_inflight_fn)(struct request_queue *, unsigned int [2]);

struct bio_vec;
typedef int (dma_drain_needed_fn)(struct request *);
@@ -405,6 +406,7 @@ struct request_queue {
make_request_fn *make_request_fn;
poll_q_fn *poll_fn;
dma_drain_needed_fn *dma_drain_needed;
+ get_inflight_fn *get_inflight_fn;

const struct blk_mq_ops *mq_ops;

@@ -1099,6 +1101,7 @@ extern void blk_queue_update_dma_alignme
extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable);
extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua);
+extern void blk_queue_get_inflight(struct request_queue *, get_inflight_fn *);

/*
* Number of physical segments as sent to the device.
Index: linux-dm/block/blk-settings.c
===================================================================
--- linux-dm.orig/block/blk-settings.c 2018-11-15 22:11:51.000000000 +0100
+++ linux-dm/block/blk-settings.c 2018-11-15 22:11:51.000000000 +0100
@@ -849,6 +849,12 @@ void blk_queue_write_cache(struct reques
}
EXPORT_SYMBOL_GPL(blk_queue_write_cache);

+void blk_queue_get_inflight(struct request_queue *q, get_inflight_fn *fn)
+{
+ q->get_inflight_fn = fn;
+}
+EXPORT_SYMBOL_GPL(blk_queue_get_inflight);
+
static int __init blk_settings_init(void)
{
blk_max_low_pfn = max_low_pfn - 1;
Index: linux-dm/drivers/md/dm.c
===================================================================
--- linux-dm.orig/drivers/md/dm.c 2018-11-15 22:11:51.000000000 +0100
+++ linux-dm/drivers/md/dm.c 2018-11-15 22:18:44.000000000 +0100
@@ -657,18 +657,30 @@ int md_in_flight(struct mapped_device *m
return (int)sum;
}

+static void test_io_ticks(int cpu, struct hd_struct *part, unsigned long now)
+{
+ unsigned long stamp = READ_ONCE(part->stamp);
+ if (unlikely(stamp != now)) {
+ if (likely(cmpxchg(&part->stamp, stamp, now) == stamp)) {
+ __part_stat_add(cpu, part, io_ticks, 1);
+ }
+ }
+}
+
static void start_io_acct(struct dm_io *io)
{
struct mapped_device *md = io->md;
struct bio *bio = io->orig_bio;
+ unsigned long now = jiffies;
struct hd_struct *part;
int sgrp, cpu;

- io->start_time = jiffies;
+ io->start_time = now;

part = &dm_disk(md)->part0;
sgrp = op_stat_group(bio_op(bio));
cpu = part_stat_lock();
+ test_io_ticks(cpu, part, now);
__part_stat_add(cpu, part, ios[sgrp], 1);
__part_stat_add(cpu, part, sectors[sgrp], bio_sectors(bio));
part_stat_unlock();
@@ -685,7 +697,8 @@ static void end_io_acct(struct dm_io *io
{
struct mapped_device *md = io->md;
struct bio *bio = io->orig_bio;
- unsigned long duration = jiffies - io->start_time;
+ unsigned long now = jiffies;
+ unsigned long duration = now - io->start_time;
struct hd_struct *part;
int sgrp, cpu;

@@ -697,7 +710,9 @@ static void end_io_acct(struct dm_io *io
part = &dm_disk(md)->part0;
sgrp = op_stat_group(bio_op(bio));
cpu = part_stat_lock();
+ test_io_ticks(cpu, part, now);
__part_stat_add(cpu, part, nsecs[sgrp], jiffies_to_nsecs(duration));
+ __part_stat_add(cpu, part, time_in_queue, duration);
part_stat_unlock();

smp_wmb();
@@ -711,6 +726,23 @@ static void end_io_acct(struct dm_io *io
}
}

+static void dm_get_inflight(struct request_queue *q, unsigned int inflight[2])
+{
+ struct mapped_device *md = q->queuedata;
+ int cpu;
+
+ inflight[READ] = inflight[WRITE] = 0;
+ for_each_possible_cpu(cpu) {
+ struct dm_percpu *p = per_cpu_ptr(md->counters, cpu);
+ inflight[READ] += p->inflight[READ];
+ inflight[WRITE] += p->inflight[WRITE];
+ }
+ if ((int)inflight[READ] < 0)
+ inflight[READ] = 0;
+ if ((int)inflight[WRITE] < 0)
+ inflight[WRITE] = 0;
+}
+
/*
* Add the bio to the list of deferred io.
*/
@@ -2224,6 +2256,7 @@ int dm_setup_md_queue(struct mapped_devi
case DM_TYPE_NVME_BIO_BASED:
dm_init_normal_md_queue(md);
blk_queue_make_request(md->queue, dm_make_request);
+ blk_queue_get_inflight(md->queue, dm_get_inflight);
break;
case DM_TYPE_NONE:
WARN_ON_ONCE(true);
Index: linux-dm/block/blk-core.c
===================================================================
--- linux-dm.orig/block/blk-core.c 2018-11-15 22:11:51.000000000 +0100
+++ linux-dm/block/blk-core.c 2018-11-15 22:11:51.000000000 +0100
@@ -695,10 +695,15 @@ static void part_round_stats_single(stru
void part_round_stats(struct request_queue *q, int cpu, struct hd_struct *part)
{
struct hd_struct *part2 = NULL;
- unsigned long now = jiffies;
+ unsigned long now;
unsigned int inflight[2];
int stats = 0;

+ if (q->get_inflight_fn)
+ return;
+
+ now = jiffies;
+
if (part->stamp != now)
stats |= 1;
Christoph Hellwig
2018-11-16 09:11:02 UTC
Permalink
Post by Mikulas Patocka
Device mapper was converted to percpu inflight counters. In order to
display the correct values in the "inflight" sysfs file and in
/proc/diskstats, we need a custom callback that sums the percpu counters.
The function part_round_stats calculates the number of in-flight I/Os
every jiffy and uses this to calculate the counters time_in_queue and
io_ticks. In order to avoid excessive memory traffic on systems with high
number of CPUs, this functionality is disabled when percpu inflight values
are used and the values time_in_queue and io_ticks are calculated
differently - the result is less precise.
And none of that is device mapper specific. Please submit this code
to the block layer for use by all make_request based drivers. Depending
on what Jens as the maintainers thinkgs of the tradeoffs we can discuss
if the summing should be on or off by default, or if we maybe even
need the current code as a fallback.
Mike Snitzer
2018-11-16 13:55:56 UTC
Permalink
On Fri, Nov 16 2018 at 4:11am -0500,
Post by Christoph Hellwig
Post by Mikulas Patocka
Device mapper was converted to percpu inflight counters. In order to
display the correct values in the "inflight" sysfs file and in
/proc/diskstats, we need a custom callback that sums the percpu counters.
The function part_round_stats calculates the number of in-flight I/Os
every jiffy and uses this to calculate the counters time_in_queue and
io_ticks. In order to avoid excessive memory traffic on systems with high
number of CPUs, this functionality is disabled when percpu inflight values
are used and the values time_in_queue and io_ticks are calculated
differently - the result is less precise.
And none of that is device mapper specific. Please submit this code
to the block layer for use by all make_request based drivers. Depending
on what Jens as the maintainers thinkgs of the tradeoffs we can discuss
if the summing should be on or off by default, or if we maybe even
need the current code as a fallback.
I agree.

Mikulas, we discussed that the changes would be made to block core. I
know that makes you less comfortable (I assume because you need to
consider more than DM) but it is the right way forward.

Now that the legacy IO path is gone we have less to consider; these
counters are only impacting bio-based.

Mike
Jens Axboe
2018-11-16 15:25:42 UTC
Permalink
Post by Mike Snitzer
On Fri, Nov 16 2018 at 4:11am -0500,
Post by Christoph Hellwig
Post by Mikulas Patocka
Device mapper was converted to percpu inflight counters. In order to
display the correct values in the "inflight" sysfs file and in
/proc/diskstats, we need a custom callback that sums the percpu counters.
The function part_round_stats calculates the number of in-flight I/Os
every jiffy and uses this to calculate the counters time_in_queue and
io_ticks. In order to avoid excessive memory traffic on systems with high
number of CPUs, this functionality is disabled when percpu inflight values
are used and the values time_in_queue and io_ticks are calculated
differently - the result is less precise.
And none of that is device mapper specific. Please submit this code
to the block layer for use by all make_request based drivers. Depending
on what Jens as the maintainers thinkgs of the tradeoffs we can discuss
if the summing should be on or off by default, or if we maybe even
need the current code as a fallback.
I agree.
Mikulas, we discussed that the changes would be made to block core. I
know that makes you less comfortable (I assume because you need to
consider more than DM) but it is the right way forward.
Now that the legacy IO path is gone we have less to consider; these
counters are only impacting bio-based.
Agree, either the new code is good enough to be used in general, and
then it should be generally used, or it should not exist in the first
place. We've always worked very hard to provide the most efficient
helpers and infrastructure we can in the block layer itself, so that
drivers don't have to reinvent the wheel.
--
Jens Axboe
Mikulas Patocka
2018-11-28 00:41:55 UTC
Permalink
Post by Jens Axboe
Post by Mike Snitzer
On Fri, Nov 16 2018 at 4:11am -0500,
Post by Christoph Hellwig
Post by Mikulas Patocka
Device mapper was converted to percpu inflight counters. In order to
display the correct values in the "inflight" sysfs file and in
/proc/diskstats, we need a custom callback that sums the percpu counters.
The function part_round_stats calculates the number of in-flight I/Os
every jiffy and uses this to calculate the counters time_in_queue and
io_ticks. In order to avoid excessive memory traffic on systems with high
number of CPUs, this functionality is disabled when percpu inflight values
are used and the values time_in_queue and io_ticks are calculated
differently - the result is less precise.
And none of that is device mapper specific. Please submit this code
to the block layer for use by all make_request based drivers. Depending
on what Jens as the maintainers thinkgs of the tradeoffs we can discuss
if the summing should be on or off by default, or if we maybe even
need the current code as a fallback.
I agree.
Mikulas, we discussed that the changes would be made to block core. I
know that makes you less comfortable (I assume because you need to
consider more than DM) but it is the right way forward.
Now that the legacy IO path is gone we have less to consider; these
counters are only impacting bio-based.
Agree, either the new code is good enough to be used in general, and
then it should be generally used, or it should not exist in the first
place. We've always worked very hard to provide the most efficient
helpers and infrastructure we can in the block layer itself, so that
drivers don't have to reinvent the wheel.
--
Jens Axboe
I have generalized the per-cpu changes (so that all bio-based block
devices use per-cpu in_flight counters) and I've made patches for the
for-4.21/block branch in your git. I'm sending them.

Mikulas

Loading...