I recently ran into a crash on a 3.10 kernel:
[ 4109.682163] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 4109.683440] fas_readwriter D ffff881fffd147c0 0 46745 43298 0x00000080 [ 4109.683444] ffff881f320bfc20 0000000000000082 ffff881f320bffd8 ffff881f320bffd8 [ 4109.683469] ffff881f320bffd8 00000000000147c0 ffff881fd321ae00 ffff881571308f20 [ 4109.683487] ffff881571308f24 ffff881f32186780 00000000ffffffff ffff881571308f28 [ 4109.683491] Call Trace: [ 4109.683503] [<ffffffff8163f959>] schedule_preempt_disabled+0x29/0x70 [ 4109.683507] [<ffffffff8163d415>] __mutex_lock_slowpath+0xc5/0x1c0 [ 4109.683512] [<ffffffff811eaff7>] ? unlazy_walk+0x87/0x140 [ 4109.683515] [<ffffffff8163c87f>] mutex_lock+0x1f/0x2f [ 4109.683522] [<ffffffff81636235>] lookup_slow+0x33/0xa7 [ 4109.683525] [<ffffffff811f0483>] path_lookupat+0x773/0x7a0 [ 4109.683531] [<ffffffff81274454>] ? SYSC_semtimedop+0x264/0xd10 [ 4109.683537] [<ffffffff810c0cbb>] ? should_numa_migrate_memory+0x5b/0x150 [ 4109.683542] [<ffffffff811c3345>] ? kmem_cache_alloc+0x35/0x1d0 [ 4109.683545] [<ffffffff811f117f>] ? getname_flags+0x4f/0x1a0 [ 4109.683548] [<ffffffff811f04db>] filename_lookup+0x2b/0xc0 [ 4109.683550] [<ffffffff811f22a7>] user_path_at_empty+0x67/0xc0 [ 4109.683556] [<ffffffff811993c7>] ? handle_mm_fault+0x607/0xf40 [ 4109.683559] [<ffffffff811f2311>] user_path_at+0x11/0x20 [ 4109.683564] [<ffffffff811e5b53>] vfs_fstatat+0x63/0xc0 [ 4109.683567] [<ffffffff811e60be>] SYSC_newstat+0x2e/0x60 [ 4109.683570] [<ffffffff81273ced>] ? do_smart_update+0x11d/0x130 [ 4109.683575] [<ffffffff8101c899>] ? read_tsc+0x9/0x10 [ 4109.683580] [<ffffffff8110d336>] ? __audit_syscall_exit+0x1e6/0x280 [ 4109.683583] [<ffffffff811e639e>] SyS_newstat+0xe/0x10 [ 4109.683586] [<ffffffff81649909>] system_call_fastpath+0x16/0x1b
So a task has been stuck in uninterruptible sleep on a mutex for longer than the hung-task threshold. The stack shows a stat() call accessing a file's metadata. The nameidata on the stack:
crash> struct nameidata ffff881f320bfdc0
struct nameidata {
  path = {
    mnt = 0xffff883fcb7e2820,
    dentry = 0xffff881f502552c0
  },
  last = {
    {
      { hash = 3495247374, len = 27 },
      hash_len = 119459364366
    },
    name = 0xffff881040c56031 "00fe_0002_011712b3_00000001"   <-- the file name being looked up
  },
  root = {
    mnt = 0xffff883fccff1020,
    dentry = 0xffff881fcc7ff980
  },
  inode = 0xffff881571308e78,
  flags = 1,
  seq = 2,
  last_type = 0,
  depth = 0,
  saved_names = {0x7f30b11923e8 <Address 0x7f30b11923e8 out of bounds>, 0xffff881f8a6a2c40 "g", 0xffff881f46cc0438 "", 0xffff881fce793e80 "\260\334\341W\037\210\377\377\310\354\242o\037\210\377\377\020E-D\037\210\377\377\200\231\001\201\377\377\377\377\020\363\031\201\377\377\377\377", 0x80 <Address 0x80 out of bounds>, 0xffff881f320bfec0 "\260\004|\a(\177", 0xffffffff811993c7 <handle_mm_fault+1543> "A\211\302\351\327\375\377\377\220I\211\370L\211\341H\211\332L\211\356L\211\367D\211M\300\350\310]\003", 0x5cd06a0 <Address 0x5cd06a0 out of bounds>, 0x0}
}
The crash files command shows no trace of this file, so the process never open()ed it; the reference comes purely from the lookup path. To find the mutex involved:
crash> struct nameidata.path ffff881f320bfdc0
  path = {
    mnt = 0xffff883fcb7e2820,
    dentry = 0xffff881f502552c0
  }
crash> dentry.d_inode 0xffff881f502552c0
  d_inode = 0xffff881571308e78
crash> struct -x dentry.d_inode
struct dentry {
  [0x30] struct inode *d_inode;
}
crash> inode.i_mutex 0xffff881571308e78
  i_mutex = {
    count = { counter = -1 },
    wait_lock = { { rlock = { raw_lock = { { head_tail = 100009462, tickets = { head = 1526, tail = 1526 } } } } } },
    wait_list = {
      next = 0xffff8819fc5b3c90,
      prev = 0xffff881f5097bc40
    },
    owner = 0xffff881f52764500,   <-- a spinlock records no owner unless debugging is enabled, but for a mutex the owner field normally tells you who holds it
    {
      osq = 0x0,
      __UNIQUE_ID_rh_kabi_hide1 = { spin_mlock = 0x0 },
      {<No data fields>}
    }
  }
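The owner field is what makes this possible: with CONFIG_MUTEX_SPIN_ON_OWNER enabled, the acquirer records itself. A minimal sketch of the 3.10-era logic in kernel/mutex.c (simplified, not verbatim):

        void __sched mutex_lock(struct mutex *lock)
        {
                might_sleep();
                /* fast path; falls back to __mutex_lock_slowpath on contention */
                __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath);
                mutex_set_owner(lock);          /* i.e. lock->owner = current */
        }

So the holder's task_struct can be read straight out of the dump.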
Next, let's see why the owner has taken the mutex and not let go:
crash> task 0xffff881f52764500
PID: 47270  TASK: ffff881f52764500  CPU: 25  COMMAND: "nginx"
struct task_struct {
  state = 2,                      (TASK_UNINTERRUPTIBLE)
  stack = 0xffff881a5c19c000,
  ...
Its stack:
crash> bt -f 47270
PID: 47270  TASK: ffff881f52764500  CPU: 25  COMMAND: "nginx"
 #0 [ffff881a5c19f7f0] __schedule at ffffffff8163df9b
    ffff881a5c19f7f8: 0000000000000082 ffff881a5c19ffd8 ffff881a5c19f808: ffff881a5c19ffd8 ffff881a5c19ffd8 ffff881a5c19f818: 00000000000147c0 ffff881fd3267300 ffff881a5c19f828: ffff8813ee889a58 ffff8813ee889a60 ffff881a5c19f838: 7fffffffffffffff ffff881f52764500 ffff881a5c19f848: ffff881fd042b500 ffff881a5c19f860 ffff881a5c19f858: ffffffff8163e879
 #1 [ffff881a5c19f858] schedule at ffffffff8163e879
    ffff881a5c19f860: ffff881a5c19f908 ffffffff8163c329
 #2 [ffff881a5c19f868] schedule_timeout at ffffffff8163c329
    ffff881a5c19f870: 7fffffffffffffff ffff881a5c19f900 ffff881a5c19f880: 0000000000000000 0000000000000000 ffff881a5c19f890: 0000000000000000 ffff8813ee889980 ffff881a5c19f8a0: ffff881a5c19f8b8 ffffffff812cea44 ffff881a5c19f8b0: 0000000000000001 ffff881a5c19f968 ffff881a5c19f8c0: ffffffffa07a51c4 000010005c19f8d8 ffff881a5c19f8d0: 0000000000000000 00000000edb89915 ffff881a5c19f8e0: ffff8813ee889a58 ffff8813ee889a60 ffff881a5c19f8f0: 7fffffffffffffff ffff881f52764500 ffff881a5c19f900: ffff881fd042b500 ffff881a5c19f968 ffff881a5c19f910: ffffffff8163ec46
 #3 [ffff881a5c19f910] wait_for_completion at ffffffff8163ec46
    ffff881a5c19f918: 0000000000000001 ffff881f52764500 ffff881a5c19f928: ffffffff810b9930 ffff8813ee889a68 ffff881a5c19f938: ffff8813ee889a68 00000000edb89915 ffff881a5c19f948: ffff8813ee889980 0000000000000001 ffff881a5c19f958: ffffffffa07a6ce3 0000000000000000 ffff881a5c19f968: ffff881a5c19f990 ffffffffa07a6b74
 #4 [ffff881a5c19f970] xfs_buf_submit_wait at ffffffffa07a6b74 [xfs]   <-- the synchronous buffer-I/O submit function
    ffff881a5c19f978: ffff8813ee889980 0000000000000001   <-- ffff8813ee889980 is the xfs_buf
    ffff881a5c19f988: ffffffffa07d3829 ffff881a5c19f9a0 ffff881a5c19f998: ffffffffa07a6ce3
 #5 [ffff881a5c19f998] _xfs_buf_read at ffffffffa07a6ce3 [xfs]
    ffff881a5c19f9a0: ffff881a5c19f9e0 ffffffffa07a6dda
 #6 [ffff881a5c19f9a8] xfs_buf_read_map at ffffffffa07a6dda [xfs]   <-- taking this branch means tp was NULL: look up the xfs_buf by map, allocating one if not found; sector 1960485944, length 8
    ffff881a5c19f9b0: ffffffffa07e6cc0 0000000000000000 ffff881a5c19f9c0: 0000000000000000 ffff883f001f7800 ffff881a5c19f9d0: ffff881a5c19fa60 ffff881fd042b500 ffff881a5c19f9e0: ffff881a5c19fa20 ffffffffa07d3829
 #7 [ffff881a5c19f9e8] xfs_trans_read_buf_map at ffffffffa07d3829 [xfs]   <-- given the trans and the xfs_buf_map, calls xfs_buf_read_map to obtain the xfs_buf
    ffff881a5c19f9f0: 00000000edb89915 ffff881a5c19fb10 ffff881a5c19fa00: 0000000000000000 ffff881571308cc0 ffff881a5c19fa10: ffffffffa07e6cc0 ffff881a5c19fa60 ffff881a5c19fa20: ffff881a5c19faa0 ffffffffa0786204
 #8 [ffff881a5c19fa28] xfs_da_read_buf at ffffffffa0786204 [xfs]
    ffff881a5c19fa30: ffff881a5c19fa50 ffffffffa07e6cc0 ffff881a5c19fa40: 0000000000000000 00000001a07e6650 ffff881a5c19fa50: 0000000000000000 ffff881a5c19fa60 ffff881a5c19fa60: 0000000074daa438 ffff881a00000008 ffff881a5c19fa70: 00000000edb89915 0000000000000000 ffff881a5c19fa80: ffff881a5c19fb10 ffff8810cfbd1b00 ffff881a5c19fa90: 0000000000000016 000000000000006c ffff881a5c19faa0: ffff881a5c19fad0 ffffffffa078baa6
 #9 [ffff881a5c19faa8] xfs_dir3_data_read at ffffffffa078baa6 [xfs]
    ffff881a5c19fab0: ffffffffa07e6cc0 ffff881f5a13ad00 ffff881a5c19fac0: ffff881f5a13ad00 ffff8814905453a0 ffff881a5c19fad0: ffff881a5c19fb58 ffffffffa078f99e
 #10 [ffff881a5c19fad8] xfs_dir2_leafn_lookup_for_entry at ffffffffa078f99e [xfs]
    ffff881a5c19fae0: 0000000172bc1800 ffff881f5a13ad58 ffff881a5c19faf0: ffff883f001f7800 ffff881571308cc0 ffff881a5c19fb00: 0000000000000000 ffff881400000016 ffff881a5c19fb10: 0000000000000000 0080001d00800010 ffff881a5c19fb20: ffff000501873dff 00000000edb89915 ffff881a5c19fb30: ffff881f5a13ad40 ffff881f5a13ad00 ffff881a5c19fb40: ffff881f5a13ad10 ffff8810cfbd1b00 ffff881a5c19fb50: ffff881f5a13ad58 ffff881a5c19fb68 ffff881a5c19fb60: ffffffffa0791177
 #11 [ffff881a5c19fb60] xfs_dir2_leafn_lookup_int at ffffffffa0791177 [xfs]
    ffff881a5c19fb68: ffff881a5c19fbc8 ffffffffa0787726
 #12 [ffff881a5c19fb70] xfs_da3_node_lookup_int at ffffffffa0787726 [xfs]
    ffff881a5c19fb78: ffff881a5c19fbdc ffff881a5c19fcb8 ffff881a5c19fb88: 0000000000000000 ffff000100643ebe ffff881a5c19fb98: 00000000edb89915 ffff8810cfbd1b00 ffff881a5c19fba8: 0000000000000000 ffff8810cfbd1b00 ffff881a5c19fbb8: ffff881f5a13ad00 ffff881a5c19fcb8 ffff881a5c19fbc8: ffff881a5c19fc08 ffffffffa0791ded
 #13 [ffff881a5c19fbd0] xfs_dir2_node_lookup at ffffffffa0791ded [xfs]
    ffff881a5c19fbd8: ffff881a5c19fc08 00000000edb89915 ffff881a5c19fbe8: ffff8810cfbd1b00 0000000000000000 ffff881a5c19fbf8: 0000000000000000 0000000000000000 ffff881a5c19fc08: ffff881a5c19fc58 ffffffffa07897b5
 #14 [ffff881a5c19fc10] xfs_dir_lookup at ffffffffa07897b5 [xfs]
    ffff881a5c19fc18: ffff881a5c19fc68 0000000000000008 ffff881a5c19fc28: 00000000edb89915 0000000000000008 ffff881a5c19fc38: ffff881571308cc0 ffff881a5c19fcb8 ffff881a5c19fc48: 0000000000000000 ffff881a5c19fcb0 ffff881a5c19fc58: ffff881a5c19fca0 ffffffffa07b7bb6
 #15 [ffff881a5c19fc60] xfs_lookup at ffffffffa07b7bb6 [xfs]
    ffff881a5c19fc68: ffff881a5c19fc90 00000000edb89915 ffff881a5c19fc78: ffff881fcd1038c0 0000000000008000 ffff881a5c19fc88: ffff881a5c19fdf0 ffff881f502552c0 ffff881a5c19fc98: ffff881a5c19fe50 ffff881a5c19fcd8 ffff881a5c19fca8: ffffffffa07b4a0b
 #16 [ffff881a5c19fca8] xfs_vn_lookup at ffffffffa07b4a0b [xfs]
    ffff881a5c19fcb0: ffff881a5c19fe60 ffff881fcd1038f8 ffff881a5c19fcc0: 000000000000001b 00000000edb89915 ffff881a5c19fcd0: ffff881fcd1038c0 ffff881a5c19fcf8 ffff881a5c19fce0: ffffffff811eac3d
 #17 [ffff881a5c19fce0] lookup_real at ffffffff811eac3d
    ffff881a5c19fce8: ffff881f502552c0 ffff881a5c19ff28 ffff881a5c19fcf8: ffff881a5c19fda8 ffffffff811ee813
 #18 [ffff881a5c19fd00] do_last at ffffffff811ee813   <-- do_last --> lookup_open --> lookup_real, line 2905
    ffff881a5c19fd08: ffff881a5c19ff58 ffffffff81274abd ffff881a5c19fd18: 0000000000000000 ffff881f52764500 ffff881a5c19fd28: 0000001f52764500 0000000000000000
    ffff881a5c19fd38: ffff881571308e78 ffff88041076d000   <-- the inode ffff881571308e78 again
    ffff881a5c19fd48: ffff881a5c19fde4 ffff881f502552c0   <-- the parent directory's dentry
    ffff881a5c19fd58: ffff880aed4d5700 ffff880000000024 ffff881a5c19fd68: 01ff880aed4d5700 ffff881a5c19fd80 ffff881a5c19fd78: 00000000edb89915 ffff881a5c19fe50 ffff881a5c19fd88: ffff880aed4d5700 ffff88041076d000 ffff881a5c19fd98: ffff881a5c19ff28 ffff881f52764500 ffff881a5c19fda8: ffff881a5c19fe40 ffffffff811f0be2
 #19 [ffff881a5c19fdb0] path_openat at ffffffff811f0be2
    ffff881a5c19fdb8: ffffea00515463b0 0000000000000000 ffff881a5c19fdc8: 0000000000000200 ffff883fc04c7080 ffff881a5c19fdd8: 0000004100000000 00000000000000a9 ffff881a5c19fde8: 0000000000000000 00007f2c5676d000 ffff881a5c19fdf8: ffffea00d1d8c2c0 00000000edb89915 ffff881a5c19fe08: 00007f2c5676dff0 00000000edb89915 ffff881a5c19fe18: 00000000ffffff9c ffff88041076d000 ffff881a5c19fe28: ffff881a5c19ff28 0000000000000001 ffff881a5c19fe38: 00000000000036cc ffff881a5c19ff10 ffff881a5c19fe48: ffffffff811f23ab
 #20 [ffff881a5c19fe48] do_filp_open at ffffffff811f23ab
    ffff881a5c19fe50: ffff883fcb7e2820 ffff881f502552c0 ffff881a5c19fe60: 0000001bc5e44c0c ffff88041076d031 ffff881a5c19fe70: ffff883fccff1020 ffff881fcc7ff980 ffff881a5c19fe80: ffff881571308e78 0000000200000101 ffff881a5c19fe90: 0000000000000000 0000000000000001 ffff881a5c19fea0: 00007f2c64217510 0000000000000000 ffff881a5c19feb0: 0000000000000000 ffff881a5c19ff00 ffff881a5c19fec0: ffffffff811fef47 ffff883fc175c3c0 ffff881a5c19fed0: 0010000000008000 0000000000000001 ffff881a5c19fee0: 0000000000008000 00000000edb89915 ffff881a5c19fef0: 0000000000000001 00000000000000e3 ffff881a5c19ff00: 00000000ffffff9c ffff88041076d000 ffff881a5c19ff10: ffff881a5c19ff68 ffffffff811dfd53
 #21 [ffff881a5c19ff18] do_sys_open at ffffffff811dfd53
    ffff881a5c19ff20: ffff881f49292a48 ffff000000008000 ffff881a5c19ff30: 0000010000000024 00000000edb89915 ffff881a5c19ff40: 0000000000000000 00007f2c64217664 ffff881a5c19ff50: 0000000000000002 00007f2c64217653 ffff881a5c19ff60: 00000000000036cc ffff881a5c19ff78 ffff881a5c19ff70: ffffffff811dfe6e
 #22 [ffff881a5c19ff70] sys_open at ffffffff811dfe6e
    ffff881a5c19ff78: 00007f2c64217510 ffffffff81649909
 #23 [ffff881a5c19ff80] system_call_fastpath at ffffffff81649909
    RIP: 00007f2e31a3dc3d  RSP: 00007f2c64217358  RFLAGS: 00010202
    RAX: 0000000000000002  RBX: ffffffff81649909  RCX: 0000000000010000
    RDX: 0000000000000005  RSI: 0000000000000000  RDI: 00007f2c64217510
    RBP: 00007f2c64217510  R8: 000000000000ffff  R9: 000000000000001f
    R10: 00007f2e30df7000  R11: 0000000000000293  R12: ffffffff811dfe6e
    R13: ffff881a5c19ff78  R14: 00000000000036cc  R15: 00007f2c64217653
    ORIG_RAX: 0000000000000002  CS: 0033  SS: 002b
This process is waiting on a completion:
crash> completion ffff8813ee889a58
struct completion {
  done = 0,   <-- still not completed; complete() would make it nonzero and wake the waiting task
  wait = {
    lock = { { rlock = { raw_lock = { { head_tail = 131074, tickets = { head = 2, tail = 2 } } } } } },
    task_list = {
      next = 0xffff881a5c19f930,
      prev = 0xffff881a5c19f930
    }
  }
}
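For reference, this is roughly what complete() does in this era of kernel (sketch of kernel/sched/core.c, 3.10):

        void complete(struct completion *x)
        {
                unsigned long flags;

                spin_lock_irqsave(&x->wait.lock, flags);
                x->done++;                                  /* done goes from 0 to 1 */
                __wake_up_locked(&x->wait, TASK_NORMAL, 1); /* wake one waiter */
                spin_unlock_irqrestore(&x->wait.lock, flags);
        }

done = 0 therefore means complete() has never been called on this completion.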
This completion is simply waiting for an I/O to finish. The call chain:

xfs_buf_read_map --> _xfs_buf_read --> xfs_buf_submit_wait --> _xfs_buf_ioapply --> xfs_buf_ioapply_map --> submit_bio

Once _xfs_buf_ioapply returns, xfs_buf_submit_wait calls wait_for_completion and has been waiting there ever since.
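A simplified sketch of that synchronous submit path (condensed from 3.10-era fs/xfs/xfs_buf.c; error handling and tracing omitted, so treat the details as approximate):

        int
        xfs_buf_submit_wait(
                struct xfs_buf  *bp)
        {
                bp->b_io_error = 0;
                bp->b_flags &= ~XBF_DONE;

                /* one reference for the submission itself; each bio built
                 * inside _xfs_buf_ioapply takes another, dropped when that
                 * bio completes */
                atomic_set(&bp->b_io_remaining, 1);
                _xfs_buf_ioapply(bp);   /* builds the bio(s), calls submit_bio() */

                /* drop the submission reference; run ioend ourselves if the
                 * bio(s) already finished */
                if (atomic_dec_and_test(&bp->b_io_remaining))
                        xfs_buf_ioend(bp);

                wait_for_completion(&bp->b_iowait);   /* <-- PID 47270 is parked here */
                return bp->b_error;
        }

This also explains the b_io_remaining = 1 we are about to see in the xfs_buf dump: it is the reference of a bio that never came back.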
Looking at the xfs_buf itself:
crash> xfs_buf 0xffff8813ee889980
struct xfs_buf {
  b_rbnode = { __rb_parent_color = 18446612139147634816, rb_right = 0x0, rb_left = 0x0 },
  b_bn = 1960485944,   <-- sector number
  b_length = 8,
  b_hold = { counter = 2 },
  b_lru_ref = { counter = 1 },
  b_flags = 1048577,   <-- i.e. 0x100001: the I/O is not done and the async flag is not set (#define XBF_ASYNC (1 << 4), #define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */), so this is a synchronous I/O that must be waited for
  b_sema = { lock = { raw_lock = { { head_tail = 0, tickets = { head = 0, tail = 0 } } } }, count = 0, wait_list = { next = 0xffff8813ee8899b8, prev = 0xffff8813ee8899b8 } },
  b_lru = { next = 0xffff8813ee8899c8, prev = 0xffff8813ee8899c8 },
  b_lru_flags = 0,
  b_lock = { { rlock = { raw_lock = { { head_tail = 0, tickets = { head = 0, tail = 0 } } } } } },
  b_io_error = 0,
  b_waiters = { lock = { { rlock = { raw_lock = { { head_tail = 0, tickets = { head = 0, tail = 0 } } } } } }, task_list = { next = 0xffff8813ee8899f0, prev = 0xffff8813ee8899f0 } },
  b_list = { next = 0xffff8813ee889a00, prev = 0xffff8813ee889a00 },
  b_pag = 0xffff881f4b98db00,   <-- the AG this xfs_buf belongs to
  b_target = 0xffff881fd042b500,
  b_addr = 0xffff880750046000,
  b_ioend_work = {   <-- the work_struct used once the I/O ends
    data = { counter = 0 },
    entry = { next = 0x0, prev = 0x0 },
    func = 0x0
  },
  b_ioend_wq = 0xffff881f5ab7c400,   <-- at ioend, b_ioend_work is initialized and queued on this workqueue
  b_iodone = 0x0,   <-- NULL
  b_iowait = {   <-- this is the completion we saw above
    done = 0,
    wait = {
      lock = { { rlock = { raw_lock = { { head_tail = 131074, tickets = { head = 2,   <-- this lock once burned me badly; I remember it well
        tail = 2 } } } } } },
      task_list = { next = 0xffff881a5c19f930, prev = 0xffff881a5c19f930 }
    }
  },
  b_fspriv = 0x0,
  b_transp = 0x0,   <-- the associated xfs_trans; the trans passed in was NULL
  b_pages = 0xffff8813ee889a90,
  b_page_array = {0xffffea001d401180, 0x0},
  b_maps = 0xffff8813ee889aa8,
  __b_map = {
    bm_bn = 1960485944,   <-- sector; given the fixed number of sectors one AG manages, this sector falls in AG 1
    bm_len = 8   <-- length
  },
  b_map_count = 1,
  b_io_length = 8,
  b_pin_count = { counter = 0 },
  b_io_remaining = { counter = 1   <-- still 1: the I/O never came back
  },
  b_page_count = 1,
  b_offset = 0,
  b_error = 0,
  b_ops = 0xffffffffa07e6cc0 <xfs_dir3_data_buf_ops>   <-- the different kinds of object an xfs_buf can hold have different ops pointers
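To decode b_flags = 1048577 = 0x100001, compare it against the 3.10-era bits in fs/xfs/xfs_buf.h (the _XBF_PAGES value here is from memory, so treat it as an assumption):

        #define XBF_READ        (1 << 0)   /* set: a read */
        #define XBF_ASYNC       (1 << 4)   /* clear: submitter waits for completion */
        #define XBF_DONE        (1 << 5)   /* clear: the I/O never finished */
        #define _XBF_PAGES      (1 << 20)  /* backed by refcounted pages (assumed) */
        /* 0x100001 == XBF_READ | _XBF_PAGES: a synchronous read still in flight */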
On a normal I/O completion, the xfs_buf_ioend callback handles the xfs_buf's result:

        if (bp->b_iodone)                  /* our buf doesn't take this branch: b_iodone is NULL */
                (*(bp->b_iodone))(bp);
        else if (bp->b_flags & XBF_ASYNC)  /* nor this one: XBF_ASYNC is not set */
                xfs_buf_relse(bp);
        else
                complete(&bp->b_iowait);   /* the path our buf should take */
xfs_bufs are cached and managed by their AG, and XFS keeps the AGs in a radix tree. Our sector number divided by 8 gives the filesystem block number, and since every AG manages a fixed number of blocks, the AG index is easy to derive; a sketch follows.
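A hedged sketch of that computation and of the perag lookup, modeled on xfs_daddr_to_agno() and xfs_perag_get() in 3.10-era XFS (the >> 3 assumes the 4 KiB block size implied by "sector / 8"):

        /* 512-byte sector (daddr) -> fs block -> AG index */
        xfs_agnumber_t   agno = (bp->b_bn >> 3) / mp->m_sb.sb_agblocks;
        struct xfs_perag *pag;

        /* the perags themselves live in a radix tree on the xfs_mount */
        rcu_read_lock();
        pag = radix_tree_lookup(&mp->m_perag_tree, agno);
        if (pag)
                atomic_inc(&pag->pag_ref);
        rcu_read_unlock();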
crash> xfs_perag 0xffff881f4b98db00
struct xfs_perag {
  pag_mount = 0xffff883f001f7800,
  pag_agno = 1,   <-- matches the AG we computed above
  pag_ref = { counter = 1002 },
  pagf_init = 1 '\001',
  pagi_init = 1 '\001',
  pagf_metadata = 0 '\000',   <-- whether this AG is preferred for metadata
  pagi_inodeok = 1 '\001',
  pagf_levels = "\001\001",
  pagf_flcount = 4,
  pagf_freeblks = 112266782,   <-- free blocks
  pagf_longest = 112265313,    <-- longest free extent
  pagf_btreeblks = 0,
  pagi_freecount = 39,    <-- free inode count
  pagi_count = 175488,    <-- allocated inode count
  pagl_pagino = 81,
  pagl_leftrec = 1855879528,
  pagl_rightrec = 76512,
  pagb_lock = { { rlock = { raw_lock = { { head_tail = 4587590, tickets = { head = 70, tail = 70 } } } } } },
  pagb_tree = { rb_node = 0x0 },
  pagf_fstrms = { counter = 0 },
  pag_ici_lock = { { rlock = { raw_lock = { { head_tail = 65799148, tickets = { head = 1004, tail = 1004 } } } } } },
  pag_ici_root = { height = 5, gfp_mask = 32, rnode = 0xffff88144b2c7909 },
  pag_ici_reclaimable = 0,
  pag_ici_reclaim_lock = { count = { counter = 1 }, wait_lock = { { rlock = { raw_lock = { { head_tail = 0, tickets = { head = 0, tail = 0 } } } } } }, wait_list = { next = 0xffff881f4b98db70, prev = 0xffff881f4b98db70 }, owner = 0x0, { osq = 0x0, __UNIQUE_ID_rh_kabi_hide1 = { spin_mlock = 0x0 }, {<No data fields>} } },
  pag_ici_reclaim_cursor = 0,
  pag_buf_lock = { { rlock = { raw_lock = { { head_tail = 593896294, tickets = { head = 9062, tail = 9062 } } } } } },
  pag_buf_tree = {
    rb_node = 0xffff881576c18180   <-- the rb-tree managing the xfs_bufs
  },
  callback_head = { next = 0x0, func = 0x0 },
  pagb_count = 0
}
The AG manages its xfs_bufs in an rb-tree; let me further confirm that the xfs_buf I found is actually in that tree:
crash> tree -t rbtree -o xfs_buf.b_rbnode ffff881f4b98dba0 -s xfs_buf.b_bn,b_length
ffff881576c18180
  b_bn = 2021058616
  b_length = 8
ffff881571630d80
  b_bn = 1975870104
  b_length = 8
ffff88144a547180
  b_bn = 1959990760
  b_length = 8
ffff881452e61800
  b_bn = 1957446840
  b_length = 8
ffff883ed4be4a80
  b_bn = 1953586592
  b_length = 8
ffff883e829a3000
  b_bn = 1953509312
  b_length = 8
ffff881577099080
  b_bn = 1953509298
  b_length = 1
ffff8817d2cd1200
  b_bn = 1953509297
  b_length = 1
ffff883e829a3300
  b_bn = 1953509299
  b_length = 1
ffff883e85d85680
  b_bn = 1953509304
  b_length = 8
ffff88144e985680
  b_bn = 1953509352
  b_length = 8
ffff883e829a2880
  b_bn = 1953509320
  b_length = 8
ffff883f3752f780
  b_bn = 1953509344
  b_length = 8
ffff883ed266a700
  b_bn = 1953585224
  b_length = 8
....
ffff883ececa1b00
  b_bn = 1960485840
  b_length = 32
ffff8813ee889980   <-- our problem xfs_buf
  b_bn = 1960485944   <-- the starting sector in question
  b_length = 8        <-- and its length
ffff883e71c6c000
  b_bn = 1960486072
  b_length = 8
....
So the xfs_buf itself is fine. Then where did the I/O it issued go? Here I went down a blind alley: I had picked the wrong block_device, whose request_queue showed zero I/Os, and was thoroughly confused until a colleague noticed I had the wrong disk; I was looking at sdae while the failing device was actually sde. From the file path, you can find the mount point and, from it, the device's request_queue:
crash> block_device.bd_queue 0xffff883fc1048d00
  bd_queue = 0xffff883fc1e28828
crash> request_queue 0xffff883fc1e28828
struct request_queue {
  queue_head = {
    next = 0xffff883fc1e28828,   <-- prev and next point to itself, so the list is empty: because of how the scheduler is dispatching, no request is currently staged here waiting to be handed to the driver
    prev = 0xffff883fc1e28828
  },
  last_merge = 0xffff883e826fcd80,
  elevator = 0xffff883e86875000,   <-- this holds the cfqd
  nr_rqs = {0, 17},   <-- 17 synchronous I/Os are backlogged
  nr_rqs_elvpriv = 17,   <-- 17 I/Os carry elevator-private data; at most this equals the nr_rqs total
  request_fn = 0xffffffff81422f70 <scsi_request_fn>,   <-- this function pulls requests off the request queue and executes them
  make_request_fn = 0xffffffff812ce690 <blk_queue_bio>,
  prep_rq_fn = 0xffffffff814213c0 <scsi_prep_fn>,
  merge_bvec_fn = 0x0,
  softirq_done_fn = 0xffffffff81422e20 <scsi_softirq_done>,
  rq_timed_out_fn = 0xffffffff8141ec00 <scsi_times_out>,   <-- called when a request in the queue times out
  dma_drain_needed = 0x0,
  lld_busy_fn = 0xffffffff81421ad0 <scsi_lld_busy>,
  ...
  nr_sorted = 11,   <-- number of requests inserted via ELEVATOR_INSERT_SORT
  in_flight = {0, 5},   <-- 5 synchronous in-flight I/Os have not returned; there are actually 6 requests down there, but one is not REQ_TYPE_FS and so is not counted in in_flight
  rq_timeout = 30000,   <-- a 30 s timeout per request (30000 jiffies at HZ=1000), assigned to req->timeout in blk_add_timer
  timeout_list = {   <-- tracks the timeout of every request handed to the driver
    next = 0xffff883e9d3f7ed0,
    prev = 0xffff883e9d24ecd0
  },
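For reference, this is roughly how a request gets its timeout and lands on that list when it is handed to the driver (condensed from 3.10-era block/blk-timeout.c):

        void blk_add_timer(struct request *req)
        {
                struct request_queue *q = req->q;
                unsigned long expiry;

                if (!req->timeout)
                        req->timeout = q->rq_timeout;   /* 30000 jiffies = 30 s here */

                req->deadline = jiffies + req->timeout;
                list_add_tail(&req->timeout_list, &q->timeout_list);

                /* arm the queue's timer, rounded up, near the earliest deadline */
                expiry = round_jiffies_up(req->deadline);
                if (!timer_pending(&q->timeout) ||
                    time_before(expiry, q->timeout.expires))
                        mod_timer(&q->timeout, expiry);
        }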
We know which xfs_buf issued the I/O, but the bio is allocated dynamically. How do we tell how long ago this particular I/O was issued? In xfs_buf_ioapply_map, the bio is set up as follows:
static void
xfs_buf_ioapply_map(
        struct xfs_buf  *bp,
        int             map,
        int             *buf_offset,
        int             *count,
        int             rw)
{
        ...
        bio = bio_alloc(GFP_NOIO, nr_pages);
        bio->bi_bdev = bp->b_target->bt_bdev;
        bio->bi_sector = sector;
        bio->bi_end_io = xfs_buf_bio_end_io;
        bio->bi_private = bp;
        ...
}
As you can see, the bio's bi_private member is our xfs_buf, and its bi_sector member is our sector number. Since we know the sector number, we can search memory for it and find the corresponding bio at 0xffff881490a76000, and the request at ffff8813ee88b900:
crash> struct bio.bi_sector,bi_private,bi_bdev 0xffff881490a76000
  bi_sector = 1960485944
  bi_private = 0xffff8813ee889980
  bi_bdev = 0xffff883fc1048d00
crash> request.start_time,__sector,bio ffff8813ee88b900
  start_time = 4298780289   <-- jiffies when the I/O was created
  __sector = 1960485944
  bio = 0xffff881490a76000
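(As an aside, the search step looks roughly like this; option syntax varies between crash versions and the candidate addresses are whatever the hits turn up, so treat this as a hedged illustration. 1960485944 is 0x74daa438, a value that indeed also appears among the stack words of frame #8 above.)

        crash> search 74daa438
          ... for each hit, check whether it is the bi_sector field of a bio:
        crash> struct bio.bi_sector,bi_private <candidate address>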
crash> p jiffies
jiffies = $2 = 4298781359   <-- current jiffies
The I/O was created only about 1000 jiffies, roughly one second, before the current jiffies. So although this process holds the lock, it has not been sitting on it for 1200 s. Far more likely, it was woken up, won the mutex, submitted its I/O, and is now waiting for that I/O to complete, exactly as it should. It is the lucky one: the unlucky process 46745 had already been waiting 1200 s for the lock, got picked by the khungtaskd kernel thread, and triggered hung_task_panic.
Either way, the unlucky process 46745 waited on the mutex past the threshold. Perhaps some process held the lock and never released it, or 46745 simply sat far back in the wait queue and, even though the lock was released repeatedly, its turn never came. A mutex should not be this slow; what made it so slow?
crash> cfq_data.active_queue 0xffff883e86877c00
  active_queue = 0xffff883e9efc03a0
crash> cfq_queue.dispatch_start 0xffff883e9efc03a0
  dispatch_start = 4298776243
crash> p jiffies
jiffies = $2 = 4298781359
crash> p 4298781359-4298776243
$3 = 5116
So the currently active cfq_queue began dispatching 5116 jiffies, roughly 5 seconds, ago, while a request's default timeout is 30 s.
Now look at the requests tracked by the hash member of the request_queue's elevator. When a request is rq_mergeable, its hash member is linked into that elevator hash.
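The hashing is done by the elevator core, keyed on the request's end sector so that a new request can find a back-merge candidate quickly; roughly (condensed from 3.10-era block/elevator.c):

        #define rq_hash_key(rq) (blk_rq_pos(rq) + blk_rq_sectors(rq))

        static void elv_rqhash_add(struct request_queue *q, struct request *rq)
        {
                struct elevator_queue *e = q->elevator;

                hash_add(e->hash, &rq->hash, rq_hash_key(rq));
        }

Conversely, a request whose hash = { next = 0x0, pprev = 0x0 } was never hashed, i.e. it is not mergeable; we will meet two of those below.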
crash> elevator_queue.hash 0xffff883e86875000
  hash = {
    .....
    { first = 0xffff883e826fdb80 },
    { first = 0xffff883e40708800 },
    { first = 0x0 },
    { first = 0x0 },
    { first = 0xffff8812ae0c0980 },
    .....
    { first = 0xffff8812c96efb00 },
    { first = 0x0 },
    { first = 0x0 },
    { first = 0xffff8813ee88b980 },
    .....
    { first = 0xffff88014a437800 },
    { first = 0x0 },
    { first = 0xffff880149d39280 },
    { first = 0xffff883e4341e300 },
    .....
    { first = 0xffff883e826fce00 },
    .....
Checking the buckets one by one turns up only 9 I/Os, while q->nr_sorted is 11, so 2 of the sorted I/Os are not mergeable. Adding the 5 in flight gives 16. Why, then, does the request_queue say 17 requests are allocated? Because one more I/O is not counted in in_flight (the non-REQ_TYPE_FS one mentioned above).
The 9 unissued, mergeable I/Os:

crash> request.start_time 0xffff883e826fdb00
  start_time = 4298776286
crash> request.start_time 0xffff883e40708780
  start_time = 4298780265
crash> request.start_time 0xffff8812ae0c0900
  start_time = 4298776286
crash> request.start_time 0xffff8812c96efa80
  start_time = 4298780264
crash> request.start_time 0xffff8813ee88b900
  start_time = 4298780289
crash> request.start_time 0xffff88014a437780
  start_time = 4298780258
crash> request.start_time 0xffff880149d39200
  start_time = 4298780285
crash> request.start_time 0xffff883e4341e280
  start_time = 4298780285
crash> request.start_time 0xffff883e826fcd80
  start_time = 4298776272

The 2 non-mergeable I/Os:
crash> request.hash ffff883e40708c00
  hash = {
    next = 0x0,
    pprev = 0x0
  }
crash> request.hash ffff8812ae0c0780
  hash = {
    next = 0x0,
    pprev = 0x0
  }
Then how do we track down the remaining 6 I/Os that have been dispatched for execution? In the request_queue's elevator, we can see there are 10 cfq_queues waiting to be scheduled:
crash> elevator_queue.elevator_data 0xffff883e86875000
  elevator_data = 0xffff883e86877c00
crash> cfq_data 0xffff883e86877c00
struct cfq_data {
  queue = 0xffff883fc1e28828,
  grp_service_tree = {
    rb = { rb_node = 0xffff883f36a09820 },
    left = 0xffff883f36a09820,
    count = 4294964171,
    min_vdisktime = 4707409920,
    ttime = { last_end_request = 4295195837, ttime_total = 0, ttime_samples = 0, ttime_mean = 0 }
  },
  root_group = 0xffff883f36a09800,
  serving_wl_class = BE_WORKLOAD,
  serving_wl_type = SYNC_WORKLOAD,
  workload_expires = 4298776270,   <-- jiffies = $1 = 4298781359; cfqd->workload_expires = jiffies + slice;
  serving_group = 0xffff883f36a09800,   <-- from the group currently being served, we can walk every cfq_queue it manages
  prio_trees = {{ rb_node = 0x0 }, { rb_node = 0x0 }, { rb_node = 0x0 }, { rb_node = 0x0 }, { rb_node = 0xffff881490a87250 }, { rb_node = 0x0 }, { rb_node = 0x0 }, { rb_node = 0x0 }},
  busy_queues = 10,   <-- 10 queues waiting to be scheduled
  busy_sync_queues = 10,
  rq_in_driver = 5,   <-- I/Os not yet completed
  rq_in_flight = {0, 5},
  rq_queued = 11,   <-- 11 requests held in the cfq_data, i.e. the sum of rq_queued over all its cfqqs
  hw_tag = 1,
  hw_tag_est_depth = 32,
  hw_tag_samples = 51,
  idle_slice_timer = {
    entry = { next = 0x0, prev = 0xdead000000200200 },
    expires = 4298772352,
    base = 0xffff881fd2fc8000,
    function = 0xffffffff812ef710 <cfq_idle_slice_timer>,
    data = 18446612400859216896,
    slack = -1,
    start_pid = -1,
    start_site = 0x0,
    start_comm = "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000"
  },
  unplug_work = {
    data = { counter = 32 },
    entry = { next = 0xffff883e86877d28, prev = 0xffff883e86877d28 },
    func = 0xffffffff812edef0 <cfq_kick_queue>
  },
  active_queue = 0xffff883e9efc03a0,   <-- the currently active queue
  active_cic = 0xffff883e47904f00,
  async_cfqq = {{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, {0x0, 0x0, 0x0, 0x0, 0xffff883f1f7c9878, 0x0, 0x0, 0x0}},   <-- async cfq_queues, indexed by I/O priority
  async_idle_cfqq = 0x0,
  last_position = 5943375904,
  cfq_quantum = 8,   <-- static const int cfq_quantum = 8;
  cfq_fifo_expire = {250, 125},   <-- static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
  cfq_back_penalty = 2,   <-- static const int cfq_back_penalty = 2;
  cfq_back_max = 16384,   <-- static const int cfq_back_max = 16 * 1024;
  cfq_slice = {40, 100},   <-- static int cfq_slice_async = HZ / 25; static const int cfq_slice_sync = HZ / 10;
  cfq_slice_async_rq = 2,   <-- static const int cfq_slice_async_rq = 2;
  cfq_slice_idle = 8,   <-- static int cfq_slice_idle = HZ / 125;
  cfq_group_idle = 8,   <-- static int cfq_group_idle = HZ / 125;
  cfq_latency = 1,   <-- cfqd->cfq_latency = 1;
  cfq_target_latency = 300,   <-- static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
  oom_cfqq = {
    ref = 1,
    flags = 64,
    cfqd = 0xffff883e86877c00,
    rb_node = { __rb_parent_color = 18446612400859217440, rb_right = 0x0, rb_left = 0x0 },
    rb_key = 0,
    p_node = { __rb_parent_color = 18446612400859217472, rb_right = 0x0, rb_left = 0x0 },
    p_root = 0x0,
    sort_list = { rb_node = 0x0 },
    next_rq = 0x0,
    queued = {0, 0},
    allocated = {0, 0},
    fifo = { next = 0xffff883e86877e80, prev = 0xffff883e86877e80 },
    dispatch_start = 0,
    allocated_slice = 0,
    slice_dispatch = 0,
    slice_start = 0,
    slice_end = 0,
    slice_resid = 0,
    prio_pending = 0,
    dispatched = 0,
    ioprio = 0,
    org_ioprio = 0,
    ioprio_class = 0,
    pid = 1,
    seek_history = 0,
    last_request_pos = 0,
    service_tree = 0x0,
    new_cfqq = 0x0,
    cfqg = 0xffff883f36a09800,
    nr_sectors = 0
  },
  last_delayed_sync = 4298780320
}
First find the current serving_group:

serving_group = 0xffff883f36a09800,

and from this cfq_group:
crash> struct -xo cfq_group.service_trees 0xffff883f36a09800
struct cfq_group {
  [ffff883f36a09878] struct cfq_rb_root service_trees[2][3];
}
crash> cfq_group.dispatched 0xffff883f36a09800
  dispatched = 5   <-- this dispatched count equals cfqd->rq_in_flight; they match
Given the current workload class and type:

serving_wl_class = BE_WORKLOAD,
serving_wl_type = SYNC_WORKLOAD,

we can determine the service_trees array indices:

enum wl_type_t {        /* column index */
        ASYNC_WORKLOAD = 0,
        SYNC_NOIDLE_WORKLOAD = 1,
        SYNC_WORKLOAD = 2
};
enum wl_class_t {       /* row index */
        BE_WORKLOAD = 0,
        RT_WORKLOAD = 1,
        IDLE_WORKLOAD = 2,
        CFQ_PRIO_NR,
};

The indices of interest are [0][1] and [0][2].
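In cfq these two indices select the service tree through an accessor like this (modeled on st_for() in 3.10-era block/cfq-iosched.c):

        static struct cfq_rb_root *st_for(struct cfq_group *cfqg,
                                          enum wl_class_t class,
                                          enum wl_type_t type)
        {
                if (class == IDLE_WORKLOAD)
                        return &cfqg->service_tree_idle;
                return &cfqg->service_trees[class][type];
        }

With the array at ffff883f36a09878 and sizeof(struct cfq_rb_root) = 0x40, entries [0][1] and [0][2] sit at 0xffff883f36a098b8 and 0xffff883f36a098f8, which is exactly where the two tree commands below point.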
10 queues in total:

crash> tree -t rb_tree 0xffff883f36a098f8
ffff883f061a0920
ffff883e6fb8c3b0
ffff883e9efc03b0
ffff881490a87230
ffff883e71bcdee0
ffff8801c8b61df8
ffff881577ee1c28
ffff88018a4577a0
crash> tree -t rb_tree 0xffff883f36a098b8
ffff88144e961318
ffff883ea2021400
That gives us the cfq_queues the current group is scheduling; walking each one reveals its requests:
crash> cfq_queue.rb_node
struct cfq_queue {
  [16] struct rb_node rb_node;
}

Since each cfq_queue is embedded, via its rb_node member, into the service_trees rb-tree of its cfq_group for the matching class and type, the cfq_queues can be read off the trees above. Only 11 queued I/Os in total, plus 2 dispatched:

crash> cfq_queue.allocated,ref,dispatched,prio_pending ffff883f061a0910
  allocated = {1, 0}
  ref = 2
  dispatched = 0
  prio_pending = 0
  queued = {0, 1}
  request = ffff88014a437780
crash> cfq_queue.allocated,ref,dispatched,prio_pending ffff883e6fb8c3a0   <-- effectively one request already in the request_queue or down in the driver, and one not yet dispatched
  allocated = {2, 0}
  ref = 4
  dispatched = 1   <-- its dispatched is 1
  prio_pending = 0
  queued = {0, 1}
  request = ffff883e826fcd80
crash> cfq_queue.allocated,ref,dispatched,prio_pending ffff883e9efc03a0   <-- the currently active queue; no requests left
  allocated = {1, 0}
  ref = 2
  dispatched = 1   <-- its dispatched is 1
  prio_pending = 0
  queued = {0, 0}
crash> cfq_queue.allocated,ref,dispatched,prio_pending ffff881490a87220
  allocated = {2, 0}
  ref = 3
  dispatched = 0
  prio_pending = 0
  queued = {0, 2}
  requests = ffff8812ae0c0780, ffff8812ae0c0900
    (ffff8812ae0c0780 is not on the hash chain:
     crash> request.hash ffff8812ae0c0780
       hash = { next = 0x0, pprev = 0x0 })
crash> cfq_queue.allocated,ref,dispatched,prio_pending ffff883e71bcded0
  allocated = {2, 0}
  ref = 3
  dispatched = 0
  prio_pending = 0
  queued = {0, 2}
  requests = ffff883e40708780, ffff883e40708c00
    (ffff883e40708c00 is not in the hash either:
     crash> request.hash ffff883e40708c00
       hash = { next = 0x0, pprev = 0x0 })
crash> cfq_queue.allocated,ref,dispatched,prio_pending ffff8801c8b61de8
  allocated = {1, 0}
  ref = 2
  dispatched = 0
  prio_pending = 0
  queued = {0, 1}
  request = ffff8812c96efa80
crash> cfq_queue.allocated,ref,dispatched,prio_pending ffff881577ee1c18
  allocated = {1, 0}
  ref = 2
  dispatched = 0
  prio_pending = 0
  queued = {0, 1}
  request = ffff883e4341e280
crash> cfq_queue.allocated,ref,dispatched,prio_pending ffff88018a457790
  allocated = {1, 0}
  ref = 2
  dispatched = 0
  prio_pending = 0
  queued = {0, 1}
  request = ffff8813ee88b900
crash> cfq_queue.allocated,ref,dispatched,prio_pending ffff88144e961308
  allocated = {1, 0}
  ref = 2
  dispatched = 0
  prio_pending = 0
  queued = {0, 1}
  request = ffff883e826fdb00
crash> cfq_queue.allocated,ref,dispatched,prio_pending ffff883ea20213f0
  allocated = {1, 0}
  ref = 2
  dispatched = 0
  prio_pending = 0
  queued = {0, 1}
  request = ffff880149d39200
All of the dispatched I/Os, meanwhile, can be found on the request_queue's timeout_list:
crash> struct -xo request_queue.timeout_list 0xffff883fc1e28828
struct request_queue {
  [ffff883fc1e28dc0] struct list_head timeout_list;
}
crash> list request.timeout_list -s request.__sector,queuelist,cmd_type,bio,start_time -H ffff883fc1e28dc0
ffff883e9d3f7d80
  __sector = 4090189560
  queuelist = { next = 0xffff883e9d3f7d80, prev = 0xffff883e9d3f7d80 }
  cmd_type = REQ_TYPE_FS
  bio = 0xffff883ebb2f2200
  start_time = 4298772344
ffff8804a8440d80
  __sector = 4151431904
  queuelist = { next = 0xffff8804a8440d80, prev = 0xffff8804a8440d80 }
  cmd_type = REQ_TYPE_FS
  bio = 0xffff88029f7a9f00
  start_time = 4298768456   <-- current jiffies is 4298781359, so about 13 s (12903 jiffies) have passed; note this req's scsi_cmnd has jiffies_at_alloc = 4298774713
ffff8802924f8a80   <-- this I/O is an ioctl issued by dfs
  __sector = 18446744073709551615
  queuelist = { next = 0xffff8802924f8a80, prev = 0xffff8802924f8a80 }
  cmd_type = REQ_TYPE_BLOCK_PC
  bio = 0x0
  start_time = 4298774713
ffff883e73374480
  __sector = 5860805840
  queuelist = { next = 0xffff883e73374480, prev = 0xffff883e73374480 }
  cmd_type = REQ_TYPE_FS
  bio = 0xffff883ea412bd00
  start_time = 4298776230
ffff883ed1c4ea00
  __sector = 5977690392
  queuelist = { next = 0xffff883ed1c4ea00, prev = 0xffff883ed1c4ea00 }
  cmd_type = REQ_TYPE_FS
  bio = 0xffff883f1e7b6300
  start_time = 4298776243
ffff883e9d24eb80
  __sector = 5943375896
  queuelist = { next = 0xffff883e9d24eb80, prev = 0xffff883e9d24eb80 }
  cmd_type = REQ_TYPE_FS
  bio = 0xffff883f1ae6ec00
  start_time = 4298776241
That is 6 I/Os here, plus the 11 not yet issued into the request_queue: 17 in total, all accounted for. Walking through all of this also made the kernel's I/O bookkeeping much more familiar, but that is a story for another day. The oldest of these requests has a scsi_cmnd allocated at jiffies_at_alloc = 4298774713, about 6 s before the current jiffies, and I/O coming down from the filesystem gets a 30 s timer by default.
Now let's turn back to the log:
[ 3272.940387] sd 5:0:4:0: [sde] FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_SENSE
[ 3272.940392] sd 5:0:4:0: [sde] Sense Key : Medium Error [current] [descriptor]
[ 3272.940395] sd 5:0:4:0: [sde] Add. Sense: Unrecovered read error
[ 3272.940397] sd 5:0:4:0: [sde] CDB: Read(16) 88 00 00 00 00 00 00 00 00 08 00 00 00 08 00 00
[ 3272.940399] blk_update_request: critical medium error, dev sde, sector 8
[ 3272.942369] XFS (sde): metadata I/O error: block 0x8 ("xfs_trans_read_buf_map") error 61 numblks 8
[ 3278.783813] sd 5:0:4:0: [sde] FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_SENSE
[ 3278.783823] sd 5:0:4:0: [sde] Sense Key : Medium Error [current] [descriptor]
[ 3278.783826] sd 5:0:4:0: [sde] Add. Sense: Unrecovered read error
[ 3278.783830] sd 5:0:4:0: [sde] CDB: Read(16) 88 00 00 00 00 00 00 00 00 08 00 00 00 08 00 00
[ 3278.783832] blk_update_request: critical medium error, dev sde, sector 8
[ 3278.785946] XFS (sde): metadata I/O error: block 0x8 ("xfs_trans_read_buf_map") error 61 numblks 8
[ 3820.434234] umount:start to umount target /run/user/989 .
[ 3820.434239] umount:according to target /run/user/989 , fstype tmpfs,device tmpfs
[ 3820.434287] umount:finish umount target :/run/user/989 ,retval:0 .
[ 4000.258607] umount:start to umount target /run/user/989 .
[ 4000.258613] umount:according to target /run/user/989 , fstype tmpfs,device tmpfs
[ 4000.258661] umount:finish umount target :/run/user/989 ,retval:0 .
[ 4109.680685] INFO: task fas_readwriter:46745 blocked for more than 1200 seconds.
[ 4109.682163] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 4109.683440] fas_readwriter  D ffff881fffd147c0     0 46745  43298 0x00000080
[ 4109.683444] ffff881f320bfc20 0000000000000082 ffff881f320bffd8 ffff881f320bffd8
[ 4109.683469] ffff881f320bffd8 00000000000147c0 ffff881fd321ae00 ffff881571308f20
[ 4109.683487] ffff881571308f24 ffff881f32186780 00000000ffffffff ffff881571308f28
[ 4109.683491] Call Trace:
Before the panic triggered, the log kept reporting read errors on sector 8. The call chain that prints these messages:

scsi_finish_command --> scsi_io_completion
    case ACTION_FAIL:
        --> scsi_print_result
        --> scsi_print_sense
        --> scsi_print_command
    --> scsi_end_request --> blk_update_request
Judging by the symptoms, together with the delay messages in the service log:

read file dee46cc8be5ab43982c75a835768773e delay 3511 ms, readsize(581795) count(581795) offset(0)>
My guess is that slow disk access caused the I/O backlog; perhaps retries on bad blocks? The disk's host error-handler (eh) thread is idle, and the corresponding scsi_device's reset count is 0, so from the sense data's point of view the disk is merely slow, not unresponsive. There are, however, error counts for its I/O:
  vendor = 0xffff883fc1e60b48 "SEAGATE ST4000NM0025 N003ZC18ASFP"   <-- sde
  iorequest_cnt = { counter = 4641 },
  iodone_cnt = { counter = 4635 },
  ioerr_cnt = { counter = 283 },
Now, from the request side we know the I/O is slow: the request we saw above has been outstanding about 13 s and is well on its way to timing out. So what made the average lock-hold time long enough for waiters to queue up this badly? A crash dump is only a snapshot. Judging by the error counters, even though the SCSI-level retry count is 0, the drive may well be retrying internally on errors; I cannot confirm that for now. Annoyingly, there are no abort messages here either, even though timeouts should normally be accompanied by aborts. In another, very similar crash I did see the abort records, and since the two crashes look alike, I infer this one, too, hung because timeouts made processes queue too long. From the same LUN, the other crash's log:
[10408827.934895] sd 5:0:4:0: task abort: SUCCESS scmd(ffff881ebda7b480)
[10408828.922082] sd 5:0:4:0: attempting task abort! scmd(ffff881e8f3e39c0)
[10408828.923772] sd 5:0:4:0: task abort: SUCCESS scmd(ffff881e8f3e39c0)
[10408828.923785] sd 5:0:4:0: attempting task abort! scmd(ffff881e8f3e16c0)
[10408828.925213] sd 5:0:4:0: task abort: SUCCESS scmd(ffff881e8f3e16c0)
[10409912.753509] sd 5:0:4:0: attempting task abort! scmd(ffff881e5c0fef40)
[10409912.754995] sd 5:0:4:0: task abort: SUCCESS scmd(ffff881e5c0fef40)
[10409912.755017] sd 5:0:4:0: attempting task abort! scmd(ffff881e5c0ffb80)
[10409912.756580] sd 5:0:4:0: task abort: SUCCESS scmd(ffff881e5c0ffb80)
The corresponding stack:
[10410980.277185] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[10410980.278502] nginx           D ffff881fffc547c0     0 48133  45181 0x00000080
[10410980.278506] ffff881fac9c7c70 0000000000000086 ffff881fac9c7fd8 ffff881fac9c7fd8
[10410980.278513] ffff881fac9c7fd8 00000000000147c0 ffff881fd3245080 ffff881e95d1cee0
[10410980.278517] ffff881e95d1cee4 ffff881fc5c28b80 00000000ffffffff ffff881e95d1cee8
[10410980.278521] Call Trace:
[10410980.278531] [<ffffffff8163f959>] schedule_preempt_disabled+0x29/0x70
[10410980.278534] [<ffffffff8163d415>] __mutex_lock_slowpath+0xc5/0x1c0
[10410980.278537] [<ffffffff8163c87f>] mutex_lock+0x1f/0x2f
[10410980.278541] [<ffffffff811edf1f>] do_last+0x28f/0x1270
[10410980.278546] [<ffffffff81274abd>] ? SYSC_semtimedop+0x8cd/0xd10
[10410980.278549] [<ffffffff811f0be2>] path_openat+0xc2/0x490
[10410980.278554] [<ffffffff811f23ab>] do_filp_open+0x4b/0xb0
[10410980.278570] [<ffffffff811fef47>] ? __alloc_fd+0xa7/0x130
[10410980.278574] [<ffffffff811dfd53>] do_sys_open+0xf3/0x1f0
[10410980.278576] [<ffffffff811dfe6e>] SyS_open+0x1e/0x20
[10410980.278579] [<ffffffff81649909>] system_call_fastpath+0x16/0x1b
The mutex it is waiting on:

crash> bt -f 48133
PID: 48133  TASK: ffff881fc5c28b80  CPU: 2  COMMAND: "nginx"
 #0 [ffff881fac9c7c10] __schedule at ffffffff8163df9b
    ffff881fac9c7c18: 0000000000000086 ffff881fac9c7fd8 ffff881fac9c7c28: ffff881fac9c7fd8 ffff881fac9c7fd8 ffff881fac9c7c38: 00000000000147c0 ffff881fd3245080 ffff881fac9c7c48: ffff881e95d1cee0 ffff881e95d1cee4 ffff881fac9c7c58: ffff881fc5c28b80 00000000ffffffff ffff881fac9c7c68: ffff881e95d1cee8 ffff881fac9c7c80 ffff881fac9c7c78: ffffffff8163f959
 #1 [ffff881fac9c7c78] schedule_preempt_disabled at ffffffff8163f959
    ffff881fac9c7c80: ffff881fac9c7ce0 ffffffff8163d415
 #2 [ffff881fac9c7c88] __mutex_lock_slowpath at ffffffff8163d415
    ffff881fac9c7c90: ffff881f78483c90 ffff883ef4567c90 ffff881fac9c7ca0: ffff881fc5c28b80 00000000e4ff06ea
    ffff881fac9c7cb0: 00000001ac9c7cc0 ffff881e95d1cee0   <-- the mutex in question
    ffff881fac9c7cc0: 0000000000008000 ffff881fac9c7df0 ffff881fac9c7cd0: 0000000000000001 ffff881fac9c7e50 ffff881fac9c7ce0: ffff881fac9c7cf8 ffffffff8163c87f
 #3 [ffff881fac9c7ce8] mutex_lock at ffffffff8163c87f
    ffff881fac9c7cf0: ffff881fac9c7f28 ffff881fac9c7da8
The mutex's owner:

crash> struct mutex.owner ffff881e95d1cee0
  owner = 0xffff881ea0d30000
The owner's PID:

crash> task 0xffff881ea0d30000
PID: 25156  TASK: ffff881ea0d30000  CPU: 5  COMMAND: "fas_duper"
And the owner's stack:
crash> bt -f 25156
PID: 25156  TASK: ffff881ea0d30000  CPU: 5  COMMAND: "fas_duper"
 #0 [ffff881ea0d3b840] __schedule at ffffffff8163df9b
    ffff881ea0d3b848: 0000000000000082 ffff881ea0d3bfd8 ffff881ea0d3b858: ffff881ea0d3bfd8 ffff881ea0d3bfd8 ffff881ea0d3b868: 00000000000147c0 ffff881fd3247300 ffff881ea0d3b878: ffff881eed49f0d8 ffff881eed49f0e0 ffff881ea0d3b888: 7fffffffffffffff ffff881ea0d30000 ffff881ea0d3b898: ffff881fd29e1740 ffff881ea0d3b8b0 ffff881ea0d3b8a8: ffffffff8163e879
 #1 [ffff881ea0d3b8a8] schedule at ffffffff8163e879
    ffff881ea0d3b8b0: ffff881ea0d3b958 ffffffff8163c329
 #2 [ffff881ea0d3b8b8] schedule_timeout at ffffffff8163c329
    ffff881ea0d3b8c0: 7fffffffffffffff ffff881ea0d3b950 ffff881ea0d3b8d0: 0000000000000000 0000000000000000 ffff881ea0d3b8e0: 0000000000000000 ffff881eed49f000 ffff881ea0d3b8f0: ffff881ea0d3b908 ffffffff812cea44 ffff881ea0d3b900: 0000000000000004 ffff881ea0d3b9b8 ffff881ea0d3b910: ffffffffa07fe1c4 00004000a0d3b928 ffff881ea0d3b920: 0000000000000000 0000000033cf16ff ffff881ea0d3b930: ffff881eed49f0d8 ffff881eed49f0e0 ffff881ea0d3b940: 7fffffffffffffff ffff881ea0d30000 ffff881ea0d3b950: ffff881fd29e1740 ffff881ea0d3b9b8 ffff881ea0d3b960: ffffffff8163ec46
 #3 [ffff881ea0d3b960] wait_for_completion at ffffffff8163ec46
    ffff881ea0d3b968: 0000000000000001 ffff881ea0d30000 ffff881ea0d3b978: ffffffff810b9930 ffff881eed49f0e8 ffff881ea0d3b988: ffff881eed49f0e8 0000000033cf16ff ffff881ea0d3b998: ffff881eed49f000 0000000000020001 ffff881ea0d3b9a8: ffffffffa07ffce3 0000000000020000 ffff881ea0d3b9b8: ffff881ea0d3b9e0 ffffffffa07ffb74
 #4 [ffff881ea0d3b9c0] xfs_buf_submit_wait at ffffffffa07ffb74 [xfs]
    ffff881ea0d3b9c8: ffff881eed49f000 0000000000020001 ffff881ea0d3b9d8: ffffffffa082c829 ffff881ea0d3b9f0 ffff881ea0d3b9e8: ffffffffa07ffce3
 #5 [ffff881ea0d3b9e8] _xfs_buf_read at ffffffffa07ffce3 [xfs]
    ffff881ea0d3b9f0: ffff881ea0d3ba30 ffffffffa07ffdda
 #6 [ffff881ea0d3b9f8] xfs_buf_read_map at ffffffffa07ffdda [xfs]
    ffff881ea0d3ba00: ffffffffa083ff70 0000000000000000 ffff881ea0d3ba10: ffff881e962a5000 ffff881e962a5000 ffff881ea0d3ba20: ffff881ea0d3ba98 ffff881fd29e1740 ffff881ea0d3ba30: ffff881ea0d3ba70 ffffffffa082c829
 #7 [ffff881ea0d3ba38] xfs_trans_read_buf_map at ffffffffa082c829 [xfs]
    ffff881ea0d3ba40: 00020000a0d3ba50 ffff883fafef1dc0 ffff881ea0d3ba50: ffff881e962a5000 ffff883fafef1de8 ffff881ea0d3ba60: ffff881ea0d3bb08 ffff881ea0d3bb00 ffff881ea0d3ba70: ffff881ea0d3bad8 ffffffffa07f3b3e
 #8 [ffff881ea0d3ba78] xfs_imap_to_bp at ffffffffa07f3b3e [xfs]
    ffff881ea0d3ba80: ffff881ea0d3ba90 ffffffffa083ff70 ffff881ea0d3ba90: 0000000000000000 00000000e91fb540 ffff881ea0d3baa0: 0000000000000020 0000000033cf16ff ffff881ea0d3bab0: ffff883fafef1dc0 ffff881e962a5000 ffff881ea0d3bac0: 0000000000000000 0000000000000000 ffff881ea0d3bad0: ffff883fafef1de8 ffff881ea0d3bb40 ffff881ea0d3bae0: ffffffffa07f403e
 #9 [ffff881ea0d3bae0] xfs_iread at ffffffffa07f403e [xfs]
    ffff881ea0d3bae8: ffff881e00000000 ffffffffa081d1b7 ffff881ea0d3baf8: ffff883fafef1dc0 ffff881e962a5000 ffff881ea0d3bb08: 00000001003f55e2 0000000033cf16ff ffff881ea0d3bb18: ffff883fafef1dc0 ffff881e962a5000 ffff881ea0d3bb28: 0000000000000000 00000001003f55e2 ffff881ea0d3bb38: 000000000000001f ffff881ea0d3bbd8 ffff881ea0d3bb48: ffffffffa0808158
 #10 [ffff881ea0d3bb48] xfs_iget at ffffffffa0808158 [xfs]
    ffff881ea0d3bb50: 0000000000015500 003f55e2811c2a2d ffff881ea0d3bb60: ffff881ea0d3bc30 0000000000015500 ffff881ea0d3bb70: ffff881eb69b75c0 0000000000015500 ffff881ea0d3bb80: 0000000000000000 ffff881e962a5000 ffff881ea0d3bb90: 0000000000000000 ffff881ea0d3bbe8 ffff881ea0d3bba0: 00000000003f55e2 ffff881eb69b7610 ffff881ea0d3bbb0: 0000000000000008 ffff881e95d1cc80 ffff881ea0d3bbc0: 0000000000000000 0000000000000000 ffff881ea0d3bbd0: ffff881ea0d3bc30 ffff881ea0d3bc20 ffff881ea0d3bbe0: ffffffffa0810c37
 #11 [ffff881ea0d3bbe0] xfs_lookup at ffffffffa0810c37 [xfs]
    ffff881ea0d3bbe8: 00000001003f55e2 0000000033cf16ff ffff881ea0d3bbf8: ffff881fc30c9140 ffff881e95d14c00 ffff881ea0d3bc08: ffff881e95d14c00 00000000ffffff9c ffff881ea0d3bc18: ffff881ea0d3bef0 ffff881ea0d3bc58 ffff881ea0d3bc28: ffffffffa080da0b
 #12 [ffff881ea0d3bc28] xfs_vn_lookup at ffffffffa080da0b [xfs]
    ffff881ea0d3bc30: ffff881ea0d3bdd0 ffff881fc30c9178 ffff881ea0d3bc40: 000000000000001b 0000000033cf16ff ffff881ea0d3bc50: ffff881fc30c9140 ffff881ea0d3bc78 ffff881ea0d3bc60: ffffffff811eac3d
 #13 [ffff881ea0d3bc60] lookup_real at ffffffff811eac3d
    ffff881ea0d3bc68: 00000000ffffff9c 0000000000000001 ffff881ea0d3bc78: ffff881ea0d3bca8 ffffffff811eb5b2
 #14 [ffff881ea0d3bc80] __lookup_hash at ffffffff811eb5b2
    ffff881ea0d3bc88: 01ff881ea0d3bef0 0000000033cf16ff ffff881ea0d3bc98: ffff881ea0d3bdc0 ffff881ea0d3bd28 ffff881ea0d3bca8: ffff881ea0d3bce0 ffffffff81636244
 #15 [ffff881ea0d3bcb0] lookup_slow at ffffffff81636244
    ffff881ea0d3bcb8: 0000000000000000 0000000000000001 ffff881ea0d3bcc8: 0000000000000041 ffff881ea0d3bdc0 ffff881ea0d3bcd8: 00000000ffffff9c ffff881ea0d3bd78 ffff881ea0d3bce8: ffffffff811f0483
 #16 [ffff881ea0d3bce8] path_lookupat at ffffffff811f0483
    ffff881ea0d3bcf0: ffff881ea0d3bda8 ffff881ea0d3bd60 ffff881ea0d3bd00: 0000000000000004 ffff881ea0d3bf58 ffff881ea0d3bd10: 0000000000000000 ffff881ea0d3bf68 ffff881ea0d3bd20: ffffffff812748eb ffff881e00000001 ffff881ea0d3bd30: 0000000000000000 ffffffff811c3345 ffff881ea0d3bd40: ffffffff811f117f 0000000033cf16ff ffff881ea0d3bd50: ffff881ea0d3bdc0 ffff881e7b38b000 ffff881ea0d3bd60: 0000000000000001 00000000ffffff9c ffff881ea0d3bd70: ffff881ea0d3bef0 ffff881ea0d3bdb0 ffff881ea0d3bd80: ffffffff811f04db
 #17 [ffff881ea0d3bd80] filename_lookup at ffffffff811f04db
    ffff881ea0d3bd88: 00007fa6c6f00980 ffff881e7b38b000 ffff881ea0d3bd98: 0000000000000001 ffff881ea0d3bea0 ffff881ea0d3bda8: 00000000ffffff9c ffff881ea0d3be80 ffff881ea0d3bdb8: ffffffff811f22a7
 #18 [ffff881ea0d3bdb8] user_path_at_empty at ffffffff811f22a7
    ffff881ea0d3bdc0: ffff881fccf81420 ffff881e95d14c00 ffff881ea0d3bdd0: 0000001b67eb798a ffff881e7b38b031 ffff881ea0d3bde0: ffff883fca04d920 ffff883fc1e99740 ffff881ea0d3bdf0: ffff881e95d1ce38 0000000200000001 ffff881ea0d3be00: 0000000000000000 ffff881fffcb47c0 ffff881ea0d3be10: ffff881fffcb47c0 ffff881eb683a280 ffff881ea0d3be20: ffff881ea0d3be38 ffffffff810b6a15 ffff881ea0d3be30: ffff881eb683a280 ffff881ea0d3be60 ffff881ea0d3be40: ffffffff810b6a59 ffff881eb683a280 ffff881ea0d3be50: ffff881fffcb47c0 0000000033cf16ff ffff881ea0d3be60: 00007fa6c6f00980 0000000000000001 ffff881ea0d3be70: 00000000ffffff9c 00007fa6c6f00b40 ffff881ea0d3be80: ffff881ea0d3be90 ffffffff811f2311
 #19 [ffff881ea0d3be88] user_path_at at ffffffff811f2311
    ffff881ea0d3be90: ffff881ea0d3bee0 ffffffff811e5b53
 #20 [ffff881ea0d3be98] vfs_fstatat at ffffffff811e5b53
    ffff881ea0d3bea0: ffff881eb683a280 ffff881e2fec5180 ffff881ea0d3beb0: 0000000033cf16ff 00007fa6c6f00980 ffff881ea0d3bec0: 00007fa6c6f00980 000000000073ad70 ffff881ea0d3bed0: 00007fafbe2f10c8 000000003db2f41a ffff881ea0d3bee0: ffff881ea0d3bf68 ffffffff811e60be
 #21 [ffff881ea0d3bee8] SYSC_newstat at ffffffff811e60be
    ffff881ea0d3bef0: 0000000000000000 0000000100000001 ffff881ea0d3bf00: ffff881fd165d080 ffffffff8101c899 ffff881ea0d3bf10: 00000000a0d3bf38 ffff881ea0ce4000 ffff881ea0d3bf20: ffff881ea0ce4248 ffff881ea0ce4000 ffff881ea0d3bf30: ffff881ea0ce4248 ffff881ea0d30000 ffff881ea0d3bf40: ffff881ea0d3bf78 ffffffff8110d336 ffff881ea0d3bf50: 00007fafbe2f1000 0000000033cf16ff ffff881ea0d3bf60: 00007fa6c6efff50 ffff881ea0d3bf78 ffff881ea0d3bf70: ffffffff811e639e
 #22 [ffff881ea0d3bf70] sys_newstat at ffffffff811e639e
    ffff881ea0d3bf78: 00007fa6c6f00b40 ffffffff81649909
 #23 [ffff881ea0d3bf80] system_call_fastpath at ffffffff81649909
    RIP: 00007fb0e1d905a5  RSP: 00007fa6c6f00638  RFLAGS: 00003202
    RAX: 0000000000000004  RBX: ffffffff81649909  RCX: 0000000000020000
    RDX: 00007fa6c6f00980  RSI: 00007fa6c6f00980  RDI: 00007fa6c6f00b40
    RBP: 00007fa6c6f00b40  R8: 0000000000000000  R9: 00007fa6c6f00c40
    R10: 0000000000000000  R11: 0000000000003246  R12: ffffffff811e639e
    R13: ffff881ea0d3bf78  R14: 00007fa6c6efff50  R15: 0000000033cf16ff
    ORIG_RAX: 0000000000000004  CS: 0033  SS: 002b
The I/O it submitted:

crash> request.start_time ffff881eed49f600
  start_time = 14715925391
crash> p jiffies
jiffies = $3 = 14715950507
Against the 30 s timeout, this I/O is already about 25 s old, so it will very likely need an abort as well.
The backlogged I/O:

crash> request_queue.in_flight 0xffff883fc2be6a08
  in_flight = {0, 15}
The disk's I/O error counters:

crash> scsi_device.iorequest_cnt,iodone_cnt,ioerr_cnt 0xffff881fcf17f800
  iorequest_cnt = { counter = 56035857 }
  iodone_cnt = { counter = 56035683 }
  ioerr_cnt = { counter = 15749 }   <-- error count
I/O completion is normally fast, so whoever takes the mutex releases it quickly and later arrivals never queue this long. Given what this crash shows, there is good reason to believe the first crash is the same story.
Conclusions:

1. Accessing certain sectors may cause repeated retries and therefore slow I/O. If new I/O keeps arriving it backs up, and if the lock wait queue grows long enough, some process may fail to get the lock for so long that it gets flagged as hung.

2. The hung-task detector prints the first task it finds that meets the criteria. But another task may have been waiting for more than two detection periods without ever being chosen, simply because it sits later in the scan. Or at khungtaskd's previous pass nobody had reached the threshold yet, several tasks qualify by the next pass, and only the first is picked. A task can even cross the threshold between two passes, then obtain the resource it was waiting on and go back to running before it is ever seen. In other words, hung-task detection is approximate, not exact.
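That behavior follows directly from how the detector works; a simplified sketch of 3.10-era kernel/hung_task.c (details elided):

        static void check_hung_task(struct task_struct *t, unsigned long timeout)
        {
                unsigned long switch_count = t->nvcsw + t->nivcsw;

                /* a task that scheduled at all since the last scan is "not hung";
                 * the saved count is refreshed only when khungtaskd runs */
                if (switch_count != t->last_switch_count) {
                        t->last_switch_count = switch_count;
                        return;
                }

                /* ... print "blocked for more than N seconds" plus the stack ... */
                if (sysctl_hung_task_panic)
                        panic("hung_task: blocked tasks");  /* first offender wins */
        }

        /* caller: scan D-state tasks once per detection period */
        do_each_thread(g, t) {
                if (t->state == TASK_UNINTERRUPTIBLE)
                        check_hung_task(t, timeout);
        } while_each_thread(g, t);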