1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
/*
 * Per-device state of the blk-iocost controller (quoted from
 * block/blk-iocost.c). One instance exists for each block device
 * with io.cost enabled.
 */
struct ioc {
struct rq_qos rqos;

bool enabled; /* is the controller enabled for this device? */

struct ioc_params params; /* QoS / cost-model parameters */
struct ioc_margins margins;
u32 period_us; /* accounting period length */
u32 timer_slack_ns;
u64 vrate_min; /* lower bound of the vtime rate */
u64 vrate_max; /* upper bound of the vtime rate */

spinlock_t lock;
struct timer_list timer;
struct list_head active_iocgs; /* active cgroups */
struct ioc_pcpu_stat __percpu *pcpu_stat;

enum ioc_running running;
atomic64_t vtime_rate; /* current device vtime rate */
u64 vtime_base_rate;
s64 vtime_err;

seqcount_spinlock_t period_seqcount;
u64 period_at; /* wallclock starttime */
u64 period_at_vtime; /* vtime starttime */

atomic64_t cur_period; /* inc'd each period */
int busy_level; /* saturation history */

bool weights_updated;
atomic_t hweight_gen; /* for lazy hweights */

/* debt forgiveness */
u64 dfgv_period_at;
u64 dfgv_period_rem;
u64 dfgv_usage_us_sum;

/* cost-model auto-tuning (autop) bookkeeping */
u64 autop_too_fast_at;
u64 autop_too_slow_at;
int autop_idx;
bool user_qos_params:1; /* QoS params explicitly set by user? */
bool user_cost_model:1; /* cost model explicitly set by user? */
};
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
/*
 * Per device-cgroup pair state of the blk-iocost controller (quoted
 * from block/blk-iocost.c).
 */
struct ioc_gq {
struct blkg_policy_data pd;
struct ioc *ioc; /* the per-device ioc this group belongs to */

/*
 * An iocg can get its weight from two sources - an explicit
 * per-device-cgroup configuration or the default weight of the
 * cgroup. `cfg_weight` is the explicit per-device-cgroup
 * configuration. `weight` is the effective considering both
 * sources.
 *
 * When an idle cgroup becomes active its `active` goes from 0 to
 * `weight`. `inuse` is the surplus adjusted active weight.
 * `active` and `inuse` are used to calculate `hweight_active` and
 * `hweight_inuse`.
 *
 * `last_inuse` remembers `inuse` while an iocg is idle to persist
 * surplus adjustments.
 *
 * `inuse` may be adjusted dynamically during period. `saved_*` are used
 * to determine and track adjustments.
 */

u32 cfg_weight;
u32 weight;
u32 active;
u32 inuse;

u32 last_inuse;
s64 saved_margin;

sector_t cursor; /* to detect randio */

/*
 * `vtime` is this iocg's vtime cursor which progresses as IOs are
 * issued. If lagging behind device vtime, the delta represents
 * the currently available IO budget. If running ahead, the
 * overage.
 *
 * `done_vtime` is the same but progressed on completion rather
 * than issue. The delta behind `vtime` represents the cost of
 * currently in-flight IOs.
 */
atomic64_t vtime;
atomic64_t done_vtime;
u64 abs_vdebt;

/* current delay in effect and when it started */
u64 delay;
u64 delay_at;

/*
 * The period this iocg was last active in. Used for deactivation
 * and invalidating `vtime`.
 */
atomic64_t active_period;
struct list_head active_list;

/* see __propagate_weights() and current_hweight() for details */
u64 child_active_sum;
u64 child_inuse_sum;
u64 child_adjusted_sum;
int hweight_gen;
u32 hweight_active;
u32 hweight_inuse;
u32 hweight_donating;
u32 hweight_after_donation;

struct list_head walk_list;
struct list_head surplus_list;

struct wait_queue_head waitq;
struct hrtimer waitq_timer;

/* timestamp at the latest activation */
u64 activated_at;

/* statistics */
struct iocg_pcpu_stat __percpu *pcpu_stat;
struct iocg_stat stat;
struct iocg_stat last_stat;
u64 last_stat_abs_vusage;
u64 usage_delta_us;
u64 wait_since;
u64 indebt_since;
u64 indelay_since;

/* this iocg's depth in the hierarchy and ancestors including self */
int level;
struct ioc_gq *ancestors[];
};

Y_4k / Y_b = a + b * X_4k / X_b

Y_4k = a * X_4k * Y_b / X_b, so a = Y_4k * X_b / (X_4k * Y_b)

SSD iops: a_8k = 70k * 8k / (4k * 51k) = 2.745

a_16k = 70k * 16k / (29k * 4k) ≈ 9.66

y = a + b*x + c*x^2

SSD:

<4k:

IOPS = 80000

BPS = 80000 * x

4k-16k:

BPS:

IOPS = y/x = 70M/x + 57k - 2 * x

BPS = 70M + 57k*x - 2x^2

x > 16k:

BPS = BPS_16k

IOPS = BPS / x

NVMe:

<=16k

BPS = 150000 * x

iops = 150000

HDD:

<=16k

BPS = 100 * x

IOPS = 100

270M = a + b * 4k + c * 16M

400M = a + b * 8k + c * 64M

460M = a + b * 16k + c * 256M

130M = 4k * b+ c * 48M

60M = 8k * b + c * 192M

200M = - c * 96M

c= -2.08

b = 57.42k

a = 460M - 918.72M + 532.48M = 74M

(y_4k, 4k).

Question: wa usage

When you run top, you can see the usage of wa. What does it mean and how is it calculated?

1
%Cpu(s):  0.1 us,  2.4 sy,  0.0 ni, 93.6 id,  3.9 wa,  0.0 hi,  0.0 si,  0.0 st

Values related to processor utilization are displayed on the third line. They provide insight into exactly what the CPUs are doing.

  • us is the percent of time spent running user processes.
  • sy is the percent of time spent running the kernel.
  • ni is the percent of time spent running processes with manually configured nice values.
  • id is the percent of time idle (if high, CPU may be overworked).
  • wa is the percent of wait time (if high, CPU is waiting for I/O access).
  • hi is the percent of time managing hardware interrupts.
  • si is the percent of time managing software interrupts.
  • st is the percent of virtual CPU time waiting for access to physical CPU.

In the context of the top command in Unix-like operating systems, the “wa” field represents the percentage of time the CPU spends waiting for I/O operations to complete. Specifically, it stands for “waiting for I/O.”

The calculation includes the time the CPU is idle while waiting for data to be read from or written to storage devices such as hard drives or SSDs. High values in the “wa” field may indicate that the system is spending a significant amount of time waiting for I/O operations to complete, which could be a bottleneck if not addressed.

The formula for calculating the “wa” value is as follows:

wa %=(Time spent waiting for I/O / Total CPU time)×100

We can also use vmstat 1 to watch the wa state of the system; its CPU columns are percentages of total CPU time:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
# vmstat 1
procs -----------memory---------- ---swap-- -----io---- -system-- ------cpu-----
r b swpd free buff cache si so bi bo in cs us sy id wa st
5 9 0 25994684 24 64404172 0 0 1 16 0 0 4 3 93 0 0
0 10 0 25994436 24 64404172 0 0 342776 341104 63906 52162 2 10 83 4 3
0 10 0 25994188 24 64404172 0 0 350392 350968 65593 52742 2 12 80 3 3
1 9 0 25994188 24 64404172 0 0 342656 341128 63793 51367 2 11 80 3 4
0 10 0 25994188 24 64404172 0 0 350760 350216 65439 52811 2 10 81 3 4
5 8 0 25993940 24 64404172 0 0 355224 358952 66762 52421 2 10 80 3 5
0 10 0 25993444 24 64404172 0 0 350536 350576 64998 52346 1 10 81 3 4
1 10 0 25992948 24 64404172 0 0 348344 347912 64709 51990 1 11 81 3 4
2 10 0 25992700 24 64404172 0 0 351944 349616 65443 51371 2 11 80 3 5
0 10 0 26000764 24 64404172 0 0 343368 345000 64589 51969 2 12 80 3 4
1 9 0 26000516 24 64404172 0 0 348976 352352 65800 53079 1 9 84 3 3
1 9 0 26000516 24 64404172 0 0 345520 341800 63737 51454 1 8 84 3 3
1 9 0 26000516 24 64404172 0 0 349192 352336 65719 52241 2 9 82 3 4
1 9 0 26000516 24 64404172 0 0 352384 355696 66567 52232 2 11 80 3 4
8 5 0 26000268 24 64404172 0 0 346368 345496 64541 52114 2 12 80 3 3
0 10 0 26000020 24 64404172 0 0 340064 341296 63603 51670 2 11 81 3 2
0 0 0 26007496 24 64399156 0 0 306736 306040 57770 46495 2 10 82 3 3
0 0 0 26007588 24 64398828 0 0 0 0 853 1480 0 0 100 0 0
0 0 0 26007588 24 64398828 0 0 0 0 871 1553 0 0 100 0 0
0 0 0 26007588 24 64398828 0 0 0 0 748 1430 0 0 100 0 0
0 0 0 26007588 24 64398828 0 0 0 2 731 1399 0 0 100 0 0
0 0 0 26007588 24 64398828 0 0 0 0 695 1319 0 0 100 0 0
0 0 0 26007588 24 64398828 0 0 0 0 743 1418 0 0 100 0 0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
Procs
r: The number of processes waiting for run time.
b: The number of processes in uninterruptible sleep.
Memory
swpd: the amount of virtual memory used.
free: the amount of idle memory.
buff: the amount of memory used as buffers.
cache: the amount of memory used as cache.
inact: the amount of inactive memory. (-a option)
active: the amount of active memory. (-a option)
Swap
si: Amount of memory swapped in from disk (/s).
so: Amount of memory swapped to disk (/s).
IO
bi: Blocks received from a block device (blocks/s).
bo: Blocks sent to a block device (blocks/s).
System
in: The number of interrupts per second, including the clock.
cs: The number of context switches per second.
CPU
These are percentages of total CPU time.
us: Time spent running non-kernel code. (user time, including nice time)
sy: Time spent running kernel code. (system time)
id: Time spent idle. Prior to Linux 2.5.41, this includes IO-wait time.
wa: Time spent waiting for IO. Prior to Linux 2.5.41, included in idle.
st: Time stolen from a virtual machine. Prior to Linux 2.6.11, unknown.

In summary, a high “wa” value in the output of top suggests that your system is spending a considerable amount of time waiting for I/O operations to be completed, which could impact overall system performance. Investigating and optimizing disk I/O can be beneficial in such cases, possibly by improving disk performance, optimizing file systems, or identifying and addressing specific I/O-intensive processes. We can use iotop to find the high io or slow io processes in system:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
Current DISK READ:     341.78 M/s | Current DISK WRITE:     340.53 M/s
TID PRIO USER DISK READ DISK WRITE SWAPIN IO> COMMAND
3487445 be/4 root 34.44 M/s 33.36 M/s ?unavailable? fio -filename=/mnt/hostroot/var/mnt/testfile -direct=1 -iodepth 64 -thread -rw=randrw -ioengine=libaio -bs=8k -size=20G -numjobs=10 -group_reporting --runtime=100 -name=mytest
3487446 be/4 root 33.40 M/s 34.27 M/s ?unavailable? fio -filename=/mnt/hostroot/var/mnt/testfile -direct=1 -iodepth 64 -thread -rw=randrw -ioengine=libaio -bs=8k -size=20G -numjobs=10 -group_reporting --runtime=100 -name=mytest
3487447 be/4 root 34.10 M/s 33.78 M/s ?unavailable? fio -filename=/mnt/hostroot/var/mnt/testfile -direct=1 -iodepth 64 -thread -rw=randrw -ioengine=libaio -bs=8k -size=20G -numjobs=10 -group_reporting --runtime=100 -name=mytest
3487448 be/4 root 32.21 M/s 31.64 M/s ?unavailable? fio -filename=/mnt/hostroot/var/mnt/testfile -direct=1 -iodepth 64 -thread -rw=randrw -ioengine=libaio -bs=8k -size=20G -numjobs=10 -group_reporting --runtime=100 -name=mytest
3487449 be/4 root 34.22 M/s 34.44 M/s ?unavailable? fio -filename=/mnt/hostroot/var/mnt/testfile -direct=1 -iodepth 64 -thread -rw=randrw -ioengine=libaio -bs=8k -size=20G -numjobs=10 -group_reporting --runtime=100 -name=mytest
3487450 be/4 root 34.46 M/s 34.33 M/s ?unavailable? fio -filename=/mnt/hostroot/var/mnt/testfile -direct=1 -iodepth 64 -thread -rw=randrw -ioengine=libaio -bs=8k -size=20G -numjobs=10 -group_reporting --runtime=100 -name=mytest
3487451 be/4 root 34.17 M/s 34.21 M/s ?unavailable? fio -filename=/mnt/hostroot/var/mnt/testfile -direct=1 -iodepth 64 -thread -rw=randrw -ioengine=libaio -bs=8k -size=20G -numjobs=10 -group_reporting --runtime=100 -name=mytest
3487452 be/4 root 33.05 M/s 32.95 M/s ?unavailable? fio -filename=/mnt/hostroot/var/mnt/testfile -direct=1 -iodepth 64 -thread -rw=randrw -ioengine=libaio -bs=8k -size=20G -numjobs=10 -group_reporting --runtime=100 -name=mytest
3487453 be/4 root 34.23 M/s 34.74 M/s ?unavailable? fio -filename=/mnt/hostroot/var/mnt/testfile -direct=1 -iodepth 64 -thread -rw=randrw -ioengine=libaio -bs=8k -size=20G -numjobs=10 -group_reporting --runtime=100 -name=mytest
3487454 be/4 root 37.51 M/s 36.79 M/s ?unavailable? fio -filename=/mnt/hostroot/var/mnt/testfile -direct=1 -iodepth 64 -thread -rw=randrw -ioengine=libaio -bs=8k -size=20G -numjobs=10 -group_reporting --runtime=100 -name=mytest
1 be/4 root 0.00 B/s 0.00 B/s ?unavailable? systemd --switched-root --system --deserialize 29
2 be/4 root 0.00 B/s 0.00 B/s ?unavailable? [kthreadd]
3 be/4 root 0.00 B/s 0.00 B/s ?unavailable? [rcu_gp]
4 be/4 root 0.00 B/s 0.00 B/s ?unavailable? [rcu_par_gp]
6 be/4 root 0.00 B/s 0.00 B/s ?unavailable? [kworker/0:0H-events_highpri]

Code tracing

From stracing the top command, we can see that the wa value is read from /proc/stat. /proc/stat is produced by the proc ops and obtains the iowait time from kernel_cpustat.cpustat[CPUTIME_IOWAIT], according to the code in fs/proc/stat.c.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
static const struct proc_ops stat_proc_ops = {
.proc_flags = PROC_ENTRY_PERMANENT,
.proc_open = stat_open,
.proc_read_iter = seq_read_iter,
.proc_lseek = seq_lseek,
.proc_release = single_release,
};

static int __init proc_stat_init(void)
{
proc_create("stat", 0, NULL, &stat_proc_ops);
return 0;
}

static int stat_open(struct inode *inode, struct file *file)
{
unsigned int size = 1024 + 128 * num_online_cpus();

/* minimum size to display an interrupt count : 2 bytes */
size += 2 * nr_irqs;
return single_open_size(file, show_stat, NULL, size);
}

static int show_stat(struct seq_file *p, void *v)
{
...
iowait += get_iowait_time(&kcpustat, i);
...
}

static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
{
u64 iowait, iowait_usecs = -1ULL;

if (cpu_online(cpu))
iowait_usecs = get_cpu_iowait_time_us(cpu, NULL);

if (iowait_usecs == -1ULL)
/* !NO_HZ or cpu offline so we can rely on cpustat.iowait */
iowait = kcs->cpustat[CPUTIME_IOWAIT];
else
iowait = iowait_usecs * NSEC_PER_USEC;

return iowait;
}

In the Linux kernel, the calculation of the “wa” (waiting for I/O) value that is reported by tools like top is handled within the kernel’s scheduler. The specific code responsible for updating the CPU usage statistics can be found in the kernel/sched/cputime.c file.

One of the key functions related to this is account_idle_time().

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
/*
* Account for idle time.
* @cputime: the CPU time spent in idle wait
*/
void account_idle_time(u64 cputime)
{
u64 *cpustat = kcpustat_this_cpu->cpustat;
struct rq *rq = this_rq();

if (atomic_read(&rq->nr_iowait) > 0)
cpustat[CPUTIME_IOWAIT] += cputime;
else
cpustat[CPUTIME_IDLE] += cputime;
}

u32 nr_iowait; /* number of blocked threads (waiting for I/O) */


This function is part of the kernel's scheduler and is responsible for accounting idle CPU time. If any thread on the runqueue is blocked waiting for I/O (rq->nr_iowait > 0), the idle time is accumulated into CPUTIME_IOWAIT; otherwise it is accumulated into CPUTIME_IDLE.

Here is a basic outline of how the “wa” time is accounted for in the Linux kernel:

  1. Idle time accounting: The kernel keeps track of the time the CPU spends in an idle state, waiting for tasks to execute.
  2. I/O wait time accounting: When a process is waiting for I/O operations (such as reading or writing to a disk), the kernel accounts for this time in the appropriate CPU state, including the “wa” time.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# block/fops.c
static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
unsigned int nr_pages)
{
...
for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE);
if (!READ_ONCE(dio->waiter))
break;
blk_io_schedule();
}
__set_current_state(TASK_RUNNING);
...
}


1
2
3
4
5
6
7
8
9
10
11
# block/blk-core.c
void blk_io_schedule(void)
{
/* Prevent hang_check timer from firing at us during very long I/O */
unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / 2;
if (timeout)
io_schedule_timeout(timeout);
else
io_schedule();
}
EXPORT_SYMBOL_GPL(blk_io_schedule);
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# kernel/sched/core.c
/*
* This task is about to go to sleep on IO. Increment rq->nr_iowait so
* that process accounting knows that this is a task in IO wait state.
*/
long __sched io_schedule_timeout(long timeout)
{
int token;
long ret;

token = io_schedule_prepare();
ret = schedule_timeout(timeout);
io_schedule_finish(token);

return ret;
}
EXPORT_SYMBOL(io_schedule_timeout);

int io_schedule_prepare(void)
{
int old_iowait = current->in_iowait;

current->in_iowait = 1;
blk_flush_plug(current->plug, true);
return old_iowait;
}

void io_schedule_finish(int token)
{
current->in_iowait = token;
}

asmlinkage __visible void __sched schedule(void)
{
struct task_struct *tsk = current;

sched_submit_work(tsk);
do {
preempt_disable();
__schedule(SM_NONE);
sched_preempt_enable_no_resched();
} while (need_resched());
sched_update_worker(tsk);
}
EXPORT_SYMBOL(schedule);

static void __sched notrace __schedule(unsigned int sched_mode)
{
...
if (prev->in_iowait) {
atomic_inc(&rq->nr_iowait);
delayacct_blkio_start();
}
...
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
# kernel/time/timer.c
signed long __sched schedule_timeout(signed long timeout)
{
struct process_timer timer;
unsigned long expire;

switch (timeout)
{
case MAX_SCHEDULE_TIMEOUT:
/*
* These two special cases are useful to be comfortable
* in the caller. Nothing more. We could take
* MAX_SCHEDULE_TIMEOUT from one of the negative value
* but I' d like to return a valid offset (>=0) to allow
* the caller to do everything it want with the retval.
*/
schedule();
goto out;
...
}
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
static int
try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
...
if (task_cpu(p) != cpu) {
if (p->in_iowait) {
delayacct_blkio_end(p);
atomic_dec(&task_rq(p)->nr_iowait);
}

wake_flags |= WF_MIGRATED;
psi_ttwu_dequeue(p);
set_task_cpu(p, cpu);
}
...
}

Question: trace the life of IO by blktrace/blkparse

blktrace

blkparse

Investigate the blktrace implementation to understand the trace hook of the blk IO stack?

blktrace to parse the block io

blktrace explores the Linux kernel tracepoint infrastructure to track requests in-flight through the block I/O stack. It traces everything that goes through to block devices, while observing timing information.

First we using dd command to make a write IO (the read IO is the same).

1
2
3
4
5
// write to a file with direct io by dd command
# dd if=/dev/zero of=testfile bs=16k count=1024000 oflag=direct
1024000+0 records in
1024000+0 records out
16777216000 bytes (17 GB, 16 GiB) copied, 111.433 s, 151 MB/s

Using blktrace and blkparse to analyze the results:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# blktrace -d /dev/vda -o res
# blkparse -i res
device
252,0 10 3 0.000238760 0 C WS 49724200 + 32 [0]
252,0 0 17 0.000261770 3418552 A WS 48720712 + 32 <- (253,0) 48718664
252,0 0 18 0.000262000 3418552 A WS 49724232 + 32 <- (252,3) 48720712
252,0 0 19 0.000262192 3418552 Q WS 49724232 + 32 [dd]
252,0 0 20 0.000263132 3418552 G WS 49724232 + 32 [dd]
252,0 0 21 0.000263335 3418552 P N [dd]
252,0 0 22 0.000263494 3418552 U N [dd] 1
252,0 0 23 0.000263719 3418552 I WS 49724232 + 32 [dd]
252,0 0 24 0.000264296 3418552 D WS 49724232 + 32 [dd]
252,0 10 4 0.000341566 0 C WS 49724232 + 32 [0]
252,0 0 25 0.000366914 3418552 A WS 48720744 + 32 <- (253,0) 48718696
252,0 0 26 0.000367235 3418552 A WS 49724264 + 32 <- (252,3) 48720744
252,0 0 27 0.000367410 3418552 Q WS 49724264 + 32 [dd]
252,0 0 28 0.000368423 3418552 G WS 49724264 + 32 [dd]
252,0 0 29 0.000368612 3418552 P N [dd]
252,0 0 30 0.000368797 3418552 U N [dd] 1
252,0 0 31 0.000369036 3418552 I WS 49724264 + 32 [dd]
252,0 0 32 0.000369730 3418552 D WS 49724264 + 32 [dd]
252,0 10 5 0.000453876 0 C WS 49724264 + 32 [0]
...
252,0 10 91033 2.279081418 3405085 C WS 50399976 + 32 [0]
252,0 10 232460 5.618988454 3405085 C WS 51349952 + 32 [0]
252,0 10 318363 8.424729857 3405085 C WM 39888904 + 16 [0]
252,0 10 318364 8.424732534 3405085 C WM 267886593 + 1 [0]
252,0 10 318365 8.424734981 3405085 C WM 267886600 + 16 [0]
252,0 10 318366 8.424737948 3405085 C WM 267925376 + 32 [0]
252,0 10 318367 8.424750755 3405085 C WM 267924512 + 32 [0]

...
Total (res):
Reads Queued: 0, 0KiB Writes Queued: 70567, 1128MiB
Read Dispatches: 3, 0KiB Write Dispatches: 70559, 1128MiB
Reads Requeued: 0 Writes Requeued: 0
Reads Completed: 3, 0KiB Writes Completed: 70563, 1128MiB
Read Merges: 0, 0KiB Write Merges: 6, 24KiB
IO unplugs: 70545 Timer unplugs: 0


# tracing 267886608
252,0 6 71 8.424115456 585 A WM 267886608 + 8 <- (252,3) 266883088
252,0 6 72 8.424115630 585 Q WM 267886608 + 8 [xfsaild/dm-0]
252,0 6 73 8.424115962 585 M WM 267886608 + 8 [xfsaild/dm-0]

# tracing 4555007
252,0 9 2 8.423915195 3374227 A WFSM 4555007 + 31 <- (252,3) 3551487
252,0 9 3 8.423915869 3374227 Q WFSM 4555007 + 31 [kworker/9:4]
252,0 9 4 8.423918485 3374227 G WFSM 4555007 + 31 [kworker/9:4]
252,0 9 5 8.423927401 240 D WSM 4555007 + 31 [kworker/9:1H]
252,0 10 318350 8.424051685 0 C WSM 4555007 + 31 [0]
252,0 10 318353 8.424169747 0 C WSM 4555007 [0]

![*Screenshot 2023-12-25 at 13.37.09*](/Users/tashen/Library/Application Support/typora-user-images/Screenshot 2023-12-25 at 13.37.09.png)

ACTION IDENTIFIERS

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
The following table shows the various actions which may be
output:

A IO was remapped to a different device

B IO bounced

C IO completion

D IO issued to driver

F IO front merged with request on queue

G Get request

I IO inserted onto request queue

M IO back merged with request on queue

P Plug request

Q IO handled by request queue code

S Sleep request

T Unplug due to timeout

U Unplug request

X Split

The full output is long, so only a few lines are pasted here. I traced several different sectors and the corresponding behaviors of the block layer.

The blkparse output alone makes it hard to see overall metrics and statistics, so we can use btt to get statistics and a picture of the whole trace; this is part of the results:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# blkparse -q -i res   -d sda.blk.bin
# btt -i sda.blk.bin

==================== All Devices ====================

ALL MIN AVG MAX N
--------------- ------------- ------------- ------------- -----------

Q2Q 0.000001441 0.000119382 0.802839654 70564
Q2G 0.000000497 0.000000797 0.000025123 70559
G2I 0.000000406 0.000000654 0.000128249 70558
Q2M 0.000000133 0.000000271 0.000000631 6
I2D 0.000000411 0.000000663 0.000017267 70558
M2D 0.000031518 0.000071988 0.000116699 6
D2C 0.000064651 0.000079818 0.002640604 70565
Q2C 0.000066315 0.000081939 0.002643331 70565

==================== Device Overhead ====================

DEV | Q2G G2I Q2M I2D D2C
---------- | --------- --------- --------- --------- ---------
(252, 0) | 0.9723% 0.7983% 0.0000% 0.8096% 97.4121%
---------- | --------- --------- --------- --------- ---------
Overall | 0.9723% 0.7983% 0.0000% 0.8096% 97.4121%

==================== Device Merge Information ====================

DEV | #Q #D Ratio | BLKmin BLKavg BLKmax Total
---------- | -------- -------- ------- | -------- -------- -------- --------
(252, 0) | 70565 70559 1.0 | 1 31 32 2257605
1
2
3
4
5
6
Q------->G------------>I--------->M------------------->D----------------------------->C
|-Q time-|-Insert time-|
|--------- merge time ------------|-merge with other IO|
|----------------scheduler time time-------------------|---driver,adapter,storagetime--|

|----------------------- await time in iostat output ----------------------------------|

Q2Q — time between requests sent to the block layer

Q2G — time from a block I/O is queued to the time it gets a request allocated for it

G2I — time from a request is allocated to the time it is inserted into the device’s queue

Q2M — time from a block I/O is queued to the time it gets merged with an existing request

I2D — time from a request is inserted into the device’s queue to the time it is actually issued to the device (time the I/O is “idle” on the request queue)

M2D — time from a block I/O is merged with an exiting request until the request is issued to the device

D2C — service time of the request by the device (time the I/O is “active” in the driver and on the device)

Q2C — total time spent in the block layer for a request

Actually, blktrace is implemented using various Linux kernel tracepoints to trace the different phases of an IO. Here are some of the key tracepoints used by blktrace:

  1. block_rq_insert: This tracepoint is hit when a request is inserted into the request queue. (I)
  2. block_rq_issue: This tracepoint is hit when a request is issued to the device. (D)
  3. block_rq_complete: This tracepoint is hit when a request is completed. (C)
  4. block_bio_queue: This tracepoint is hit when a bio is queued. (Q)
  5. block_bio_backmerge: This tracepoint is hit when a bio is merged with the last bio in a request. (M)
  6. block_bio_frontmerge: This tracepoint is hit when a bio is merged with the first bio in a request. (F)
  7. block_bio_bounce: This tracepoint is hit when a bio is bounced. (B)
  8. block_getrq: This tracepoint is hit when get_request() is called to allocate a request. (G)
  9. block_sleeprq: This tracepoint is hit when a process goes to sleep waiting for a request to become available. (S)
  10. block_plug: This tracepoint is hit when the request queue is plugged. (P)
  11. block_unplug: This tracepoint is hit when the request queue is unplugged. (U)

Main data structure in block layer

To deep-dive into the block layer, I read the code; my findings are as follows:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
/*
* main unit of I/O for the block layer and lower layers (ie drivers and
* stacking drivers)
*/
struct bio {
struct bio *bi_next; /* request queue link */
struct block_device *bi_bdev;
unsigned long bi_rw; /* read or write */
...
struct bio_vec *bi_io_vec; /* the actual vec list */
struct bvec_iter bi_iter;

};

/**
* struct bio_vec - a contiguous range of physical memory addresses
* @bv_page: First page associated with the address range.
* @bv_len: Number of bytes in the address range.
* @bv_offset: Start of the address range relative to the start of @bv_page.
*
* The following holds for a bvec if n * PAGE_SIZE < bv_offset + bv_len:
*
* nth_page(@bv_page, n) == @bv_page + n
*
* This holds because page_is_mergeable() checks the above property.
*/
struct bio_vec {
struct page *bv_page;
unsigned int bv_len;
unsigned int bv_offset;
};

struct bvec_iter {
sector_t bi_sector; /* device address in 512 byte
sectors */
unsigned int bi_size; /* residual I/O count */

unsigned int bi_idx; /* current index into bvl_vec */

unsigned int bi_bvec_done; /* number of bytes completed in
current bvec */
} __packed;

struct block_device {
sector_t bd_start_sect;
sector_t bd_nr_sectors;
struct gendisk * bd_disk;
struct request_queue * bd_queue;
bool bd_has_submit_bio;
...
} __randomize_layout;

/*
* Try to put the fields that are referenced together in the same cacheline.
*
* If you modify this structure, make sure to update blk_rq_init() and
* especially blk_mq_rq_ctx_init() to take care of the added fields.
*/
struct request {
struct request_queue *q;
struct bio *bio;
struct bio *biotail;
union {
struct list_head queuelist;
struct request *rq_next;
};
enum mq_rq_state state;
/*
* The hash is used inside the scheduler, and killed once the
* request reaches the dispatch list. The ipi_list is only used
* to queue the request for softirq completion, which is long
* after the request has been unhashed (and even removed from
* the dispatch list).
*/
union {
struct hlist_node hash; /* merge hash */
struct llist_node ipi_list;
};

/*
* The rb_node is only used inside the io scheduler, requests
* are pruned when moved to the dispatch queue. So let the
* completion_data share space with the rb_node.
*/
union {
struct rb_node rb_node; /* sort/lookup */
struct bio_vec special_vec;
void *completion_data;
};
/*
* completion callback.
*/
rq_end_io_fn *end_io;
}

struct request_queue {
struct request *last_merge;
struct elevator_queue *elevator;
struct rq_qos *rq_qos;
const struct blk_mq_ops *mq_ops;
struct queue_limits limits;
/*
* for flush operations
*/
struct blk_flush_queue *fq;
}

/**
* struct blk_mq_hw_ctx - State for a hardware queue facing the hardware
* block device
*/
struct blk_mq_hw_ctx {
/**
* @queue: Pointer to the request queue that owns this hardware context.
*/
struct request_queue *queue;
/**
* @dispatch_busy: Number used by blk_mq_update_dispatch_busy() to
* decide if the hw_queue is busy using Exponential Weighted Moving
* Average algorithm.
*/
unsigned int dispatch_busy;
}

bio逻辑架构

code flow in block layer

img

Bio -> Request -> plug request list -> staging request queue in sheduler -> hardware request queue

![Screenshot 2023-12-25 at 13.54.30](/Users/tashen/Library/Application Support/typora-user-images/Screenshot 2023-12-25 at 13.54.30.png)

![Screenshot 2023-12-25 at 13.57.23](/Users/tashen/Library/Application Support/typora-user-images/Screenshot 2023-12-25 at 13.57.23.png)

These pictures record the main code and function flow from the filesystem through the block layer down to the driver layer. They also show how each tracepoint maps to the action identifiers in the blktrace output. With these tracepoints, we can understand the block IO processing flow more clearly.

0%