Kernel modification base on v4.19:
diff --git a/fs/direct-io.c b/fs/direct-io.c
+#include <linux/delay.h>
/**
@@ -305,6 +307,8 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags)
* end_io() when necessary, otherwise a racing buffer read would cache
* zeros from unwritten extents.
*/
+ printk("%s: before invalidate page \n", __func__);
+ mdelay(10 * 1000);
if (flags & DIO_COMPLETE_INVALIDATE &&
ret > 0 && dio->op == REQ_OP_WRITE &&
dio->inode->i_mapping->nrpages) {
@@ -314,6 +318,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags)
if (err)
dio_warn_stale_pagecache(dio->iocb->ki_filp);
}
+ printk("%s: end invalidate page \n", __func__);
inode_dio_end(dio->inode);
@@ -371,10 +378,17 @@ static void dio_bio_end_aio(struct bio *bio)
* went in between AIO submission and completion into the
* same region.
*/
+ printk("%s:start to sleep 5s.... \n", __func__);
+ mdelay(10 * 1000);
+ printk("%s: endto sleep, defer_completion=%d op=%d nrpages=%d\n",
+ __func__,dio->defer_completion, dio->op, dio->inode->i_mapping->nrpages);
+
if (dio->result)
defer_completion = dio->defer_completion ||
(dio->op == REQ_OP_WRITE &&
dio->inode->i_mapping->nrpages);
+ printk("%s: dio=%px defer_completion=%d\n",
+ __func__, dio, defer_completion);
if (defer_completion) {
INIT_WORK(&dio->complete_work, dio_aio_complete_work);
queue_work(dio->inode->i_sb->s_dio_done_wq,
Reproduce result :
s=1 -name=test_r_w=write -rwmixread=70 -ioengine=libaio -bs=4k -size=16k -numjob
test_r_w: (g=0): rw=write, bs=(R) 4096B-4096B, (W) 4096B-4096B, (T) 4096B-4096B, ioengine=libaio, iodepth=1
fio-3.7
Starting 1 thread
test_r_w: (groupid=0, jobs=1): err= 0: pid=2083: Sun Sep 26 03:18:06 2021
write: IOPS=444, BW=1778KiB/s (1820kB/s)(16.0KiB/9msec)
slat (usec): min=1294, max=3789, avg=2007.88, stdev=1191.07
clat (nsec): min=2304, max=28645, avg=9918.75, stdev=12574.80
lat (usec): min=1298, max=3824, avg=2020.36, stdev=1206.00
clat percentiles (nsec):
| 1.00th=[ 2320], 5.00th=[ 2320], 10.00th=[ 2320], 20.00th=[ 2320],
| 30.00th=[ 2960], 40.00th=[ 2960], 50.00th=[ 2960], 60.00th=[ 5792],
| 70.00th=[ 5792], 80.00th=[28544], 90.00th=[28544], 95.00th=[28544],
| 99.00th=[28544], 99.50th=[28544], 99.90th=[28544], 99.95th=[28544],
| 99.99th=[28544]
lat (usec) : 4=50.00%, 10=25.00%, 50=25.00%
cpu : usr=0.00%, sys=87.50%, ctx=0, majf=0, minf=1
IO depths : 1=100.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=0.0%
submit : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
complete : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
issued rwts: total=0,4,0,0 short=0,0,0,0 dropped=0,0,0,0
latency : target=0, window=0, percentile=100.00%, depth=1
Run status group 0 (all jobs):
WRITE: bw=1778KiB/s (1820kB/s), 1778KiB/s-1778KiB/s (1820kB/s-1820kB/s), io=16.0KiB (16.4kB), run=9-9msec
Disk stats (read/write):
sda: ios=0/0, merge=0/0, ticks=0/0, in_queue=0, util=0.00%
=1 -name=test_r_w &write -rwmixread=70 -ioengine=libaio -bs=4k -size=4k -numjobs
[1] 2087
[root@localhost home]# test_r_w: (g=0): rw=write, bs=(R) 4096B-4096B, (W) 4096B-4096B, (T) 4096B-4096B, ioengine=libaio, iodepth=1
fio-3.7
Starting 1 thread
[root@localhost home]# [ 1475.286928] ext4_file_write_iter: start lock inode
[ 1475.288807] ext4_file_write_iter: unlock inode
[ 1475.290402] dio_bio_end_aio:start to sleep 5s....
s=1 -name=test_r_w=write -rwmixread=70 -ioengine=libaio -bs=4k -size=16k -numjob
test_r_w: (g=0): rw=write, bs=(R) 4096B-4096B, (W) 4096B-4096B, (T) 4096B-4096B, ioengine=libaio, iodepth=1
fio-3.7
Starting 1 thread
Jobs: 1 (f=0): [f(1)][-.-%][r=0KiB/s,w=0KiB/s][r=0,w=0 IOPS][eta 00m:00s]
Jobs: 1 (f=0): [f(1)][-.-%][r=0KiB/s,w=0KiB/s][r=0,w=0 IOPS][eta 00m:00s]
[ 1485.292939] dio_bio_end_aio: endto sleep, defer_completion=0 op=1 nrpages=4
[ 1485.293888] dio_bio_end_aio: dio=ffff88839aff5200 defer_completion=1
[ 1485.294821] dio_aio_complete_work: dio=ffff88839aff5200...
Jobs: 1 (f=1): [W(1)][-.-%][r=0KiB/s,w=0KiB/s][r[ 1485.296539] dio_complete: before invalidate page
[ 1495.298472] Page cache invalidation failure on direct I/O. Possible data corruption due to collision with buffered I/O! -->detect buffer io and direct io conflict.
[ 1495.302746] File: /home/test/test PID: 493 Comm: kworker/3:2
[ 1495.303508] CPU: 3 PID: 493 Comm: kworker/3:2 Not tainted 4.19.90-dirty #96
[ 1495.304386] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.fedoraproject.org-3.fc31 04/01/2014
Jobs: 1 (f=1): [[ 1495.306050] Workqueue: dio/sda dio_aio_complete_work
W(1)][-.-%][r=0K
[ 1495.316879] dio_complete: end invalidate page
test_r_w: (groupid=0, jobs=1): err= 0: pid=2092: Sun Sep 26 03:18:37 2021
write: IOPS=266, BW=1067KiB/s (1092kB/s)(16.0KiB/15msec)
slat (usec): min=2001, max=3101, avg=2691.97, stdev=480.07
clat (nsec): min=2058, max=12337, avg=5999.50, stdev=4456.54
lat (usec): min=2004, max=3116, avg=2700.11, stdev=484.22
clat percentiles (nsec):
| 1.00th=[ 2064], 5.00th=[ 2064], 10.00th=[ 2064], 20.00th=[ 2064],
| 30.00th=[ 4080], 40.00th=[ 4080], 50.00th=[ 4080], 60.00th=[ 5536],
| 70.00th=[ 5536], 80.00th=[12352], 90.00th=[12352], 95.00th=[12352],
| 99.00th=[12352], 99.50th=[12352], 99.90th=[12352], 99.95th=[12352],
| 99.99th=[12352]
lat (usec) : 4=25.00%, 10=50.00%, 20=25.00%
cpu : usr=0.00%, sys=100.00%, ctx=1, majf=0, minf=1
IO depths : 1=100.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=0.0%
submit : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
complete : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
issued rwts: total=0,4,0,0 short=0,0,0,0 dropped=0,0,0,0
latency : target=0, window=0, percentile=100.00%, depth=1
Run status group 0 (all jobs):
WRITE: bw=1067KiB/s (1092kB/s), 1067KiB/s-1067KiB/s (1092kB/s-1092kB/s), io=16.0KiB (16.4kB), run=15-15msec
Disk stats (read/write):
sda: ios=0/0, merge=0/0, ticks=0/0, in_queue=0, util=0.00%
test_r_w: (groupid=0, jobs=1): err= 0: pid=2089: Sun Sep 26 03:18:37 2021
write: IOPS=0, BW=204B/s (204B/s)(4096B/20033msec)
slat (nsec): min=3358.7k, max=3358.7k, avg=3358748.00, stdev= 0.00
clat (nsec): min=20027M, max=20027M, avg=20027265355.00, stdev= 0.00
lat (nsec): min=20031M, max=20031M, avg=20030627283.00, stdev= 0.00
clat percentiles (msec):
| 1.00th=[17113], 5.00th=[17113], 10.00th=[17113], 20.00th=[17113],
| 30.00th=[17113], 40.00th=[17113], 50.00th=[17113], 60.00th=[17113],
| 70.00th=[17113], 80.00th=[17113], 90.00th=[17113], 95.00th=[17113],
| 99.00th=[17113], 99.50th=[17113], 99.90th=[17113], 99.95th=[17113],
| 99.99th=[17113]
cpu : usr=0.00%, sys=0.02%, ctx=2, majf=0, minf=1
IO depths : 1=100.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=0.0%
submit : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
complete : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
issued rwts: total=0,1,0,0 short=0,0,0,0 dropped=0,0,0,0
latency : target=0, window=0, percentile=100.00%, depth=1
Run status group 0 (all jobs):
WRITE: bw=204B/s (204B/s), 204B/s-204B/s (204B/s-204B/s), io=4096B (4096B), run=20033-20033msec
Disk stats (read/write):
sda: ios=0/5, merge=0/1, ticks=0/10018, in_queue=10015, util=36.98%