linux的crash之hardlock排查記錄


3.10.0-327的內核,crash記錄如下:

KERNEL: vmlinux
DUMPFILE: vmcore [PARTIAL DUMP]
CPUS: 48
DATE: Wed Oct 18 20:37:18 2017
UPTIME: 1 days, 09:43:06
LOAD AVERAGE: 13.42, 10.66, 9.48
TASKS: 7329
NODENAME: host-10-229-143-10
RELEASE: 3.10.0-327.22.2.el7.x86_64
VERSION: #1 SMP Fri Sep 29 15:13:08 CST 2017
MACHINE: x86_64 (2199 Mhz)
MEMORY: 383.6 GB
PANIC: "Kernel panic - not syncing: Watchdog detected hard LOCKUP on cpu 10"
PID: 24023
COMMAND: "fas_readwriter"
TASK: ffff882f460a2e00 [THREAD_INFO: ffff882f10c44000]
CPU: 10
STATE: TASK_RUNNING (PANIC)-----------------------------------------R狀態死鎖,進程長時間處於TASK_RUNNING 狀態搶占CPU而不發生切換,一般是,進程關搶占后一直執行任務,或者進程關搶占后處於死循環或者睡眠,此時往往會導致多個CPU互鎖,整個系統異常。

crash> bt
PID: 24023 TASK: ffff882f460a2e00 CPU: 10 COMMAND: "fas_readwriter"
#0 [ffff882fbfd459c8] machine_kexec at ffffffff81051c5b
#1 [ffff882fbfd45a28] crash_kexec at ffffffff810f3ec2
#2 [ffff882fbfd45af8] panic at ffffffff816326d1
#3 [ffff882fbfd45b78] watchdog_overflow_callback at ffffffff8111d0e2
#4 [ffff882fbfd45b88] __perf_event_overflow at ffffffff811608d1
#5 [ffff882fbfd45c00] perf_event_overflow at ffffffff811613a4
#6 [ffff882fbfd45c10] intel_pmu_handle_irq at ffffffff81032628
#7 [ffff882fbfd45e60] perf_event_nmi_handler at ffffffff81642bcb
#8 [ffff882fbfd45e80] nmi_handle at ffffffff81642319
#9 [ffff882fbfd45ec8] do_nmi at ffffffff81642430
#10 [ffff882fbfd45ef0] end_repeat_nmi at ffffffff81641753
[exception RIP: put_compound_page+336]-----------------------------------------這個RIP
RIP: ffffffff81178b60 RSP: ffff882f10c47d80 RFLAGS: 00000006
RAX: 006016c60138402c RBX: ffffea0123302a40 RCX: 0000000000000022
RDX: 0000000000000246 RSI: 000000000a6a9000 RDI: ffffea0123300000
RBP: ffff882f10c47d98 R8: ffff882f10c47dc8 R9: ffff882f10c47d74
R10: ffff880000000298 R11: 000000000a6aa000 R12: ffffea0123300000
R13: 0000000000000246 R14: 0000000000000000 R15: ffffea0123302a40
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
--- <NMI exception stack> ---
#11 [ffff882f10c47d80] put_compound_page at ffffffff81178b60
#12 [ffff882f10c47da0] put_page at ffffffff81178bac
#13 [ffff882f10c47db0] get_futex_key at ffffffff810e3c86
#14 [ffff882f10c47e08] futex_wake at ffffffff810e3f1a
#15 [ffff882f10c47e70] do_futex at ffffffff810e6a12
#16 [ffff882f10c47f08] sys_futex at ffffffff810e6f20
#17 [ffff882f10c47f80] system_call_fastpath at ffffffff81649909

 

首先,一般hardlock的觸發原因是因為關中斷時間太長,那么需要查找對應的堆棧中是否有如此處理,而關中斷的常見函數,如spinlock,irq_disable等。

根據堆棧,get_futex_key有一段代碼如下:

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
page_head = page;
if (unlikely(PageTail(page))) {
put_page(page);
/* serialize against __split_huge_page_splitting() */
local_irq_disable();-------------------------------------------------------------------------關中斷
if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) {------------------調用了__get_user_pages_fast
page_head = compound_head(page);
/*
* page_head is valid pointer but we must pin
* it before taking the PG_lock and/or
* PG_compound_lock. The moment we re-enable
* irqs __split_huge_page_splitting() can
* return and the head page can be freed from
* under us. We can't take the PG_lock and/or
* PG_compound_lock on a page that could be
* freed from under us.
*/
if (page != page_head) {
get_page(page_head);
put_page(page);
}
local_irq_enable();
} else {
local_irq_enable();
goto again;
}
}
#else
page_head = compound_head(page);
if (page != page_head) {
get_page(page_head);
put_page(page);
}
#endif

確定下CONFIG_TRANSPARENT_HUGEPAGE 是否配置了:

grep CONFIG_TRANSPARENT_HUGEPAGE /boot/config-3.10.0-327.22.2.el7.x86_64
CONFIG_TRANSPARENT_HUGEPAGE=y

說明已經配置了,反匯編get_futex_key確認下,通過簡單搜索__get_user_pages_fast是否編譯確認了確實開啟了透明巨頁。

下一步,需要分析,為什么put_page調用put_compound_page會長時間不返回。

void put_page(struct page *page)
{
if (unlikely(PageCompound(page)))
put_compound_page(page);
else if (put_page_testzero(page))
__put_single_page(page);
}

首先需要獲取page的值,這個是個指針,通過反匯編get_futex_key函數,

0xffffffff810e3b53 <get_futex_key+227>: je 0xffffffff810e3c10 <get_futex_key+416>
0xffffffff810e3b59 <get_futex_key+233>: lea -0x38(%rbp),%rcx
0xffffffff810e3b5d <get_futex_key+237>: mov $0x1,%edx
0xffffffff810e3b62 <get_futex_key+242>: mov $0x1,%esi
0xffffffff810e3b67 <get_futex_key+247>: mov %r12,%rdi
0xffffffff810e3b6a <get_futex_key+250>: callq 0xffffffff810653c0 <get_user_pages_fast>
0xffffffff810e3b6f <get_futex_key+255>: cmp $0xfffffff2,%eax
0xffffffff810e3b72 <get_futex_key+258>: jne 0xffffffff810e3b00 <get_futex_key+144>
0xffffffff810e3b74 <get_futex_key+260>: cmpb $0x0,-0x39(%rbp)
0xffffffff810e3b78 <get_futex_key+264>: je 0xffffffff810e3b00 <get_futex_key+144>
0xffffffff810e3b7a <get_futex_key+266>: lea -0x38(%rbp),%rcx -------------取page的地址,然后傳參數
0xffffffff810e3b7e <get_futex_key+270>: xor %edx,%edx
0xffffffff810e3b80 <get_futex_key+272>: mov $0x1,%esi
0xffffffff810e3b85 <get_futex_key+277>: mov %r12,%rdi
0xffffffff810e3b88 <get_futex_key+280>: mov $0x1,%r14d
0xffffffff810e3b8e <get_futex_key+286>: callq 0xffffffff810653c0 <get_user_pages_fast>
0xffffffff810e3b93 <get_futex_key+291>: test %eax,%eax
0xffffffff810e3b95 <get_futex_key+293>: jns 0xffffffff810e3b08 <get_futex_key+152>

 在調用get_user_pages_fast前,需要獲取page指針用來傳參數,所以 -0x38(%rbp)肯定是page指針存放的地址,注意是page指針這個變量的地址,不是page指向的變量的地址。

查看堆棧:bt -f查看堆棧信息:

#13 [ffff882f10c47db0] get_futex_key at ffffffff810e3c86
ffff882f10c47db8: ffff885f706ae400 01ffc9004039e688
ffff882f10c47dc8: ffffea0123302a40 00000000dfe458bb
ffff882f10c47dd8: 000000000a6a9b28 00000000ffffffff
ffff882f10c47de8: 0000000000000001 000000000a6a9b28
ffff882f10c47df8: 0000000000000001 ffff882f10c47e68 ---------------------rbp就存放在這,因為它是除了返回地址之外第一個壓棧的。
ffff882f10c47e08: ffffffff810e3f1a
#14 [ffff882f10c47e08] futex_wake at ffffffff810e3f1a---------------------這個是get_futex_key 返回上一層函數的地址,
ffff882f10c47e10: ffffc9004039e680 ffffc9004039e684
ffff882f10c47e20: 0000000000000000 0000000000000000
ffff882f10c47e30: 0000000000000b28 00000000dfe458bb
ffff882f10c47e40: 000000000a6a9b28 0000000000000001
ffff882f10c47e50: 0000000000000035 000000000a6a9b28

rbp存放在ffff882f10c47e00就是rbp存放的地方,然后根據:

0xffffffff810e3a75 <get_futex_key+5>: push %rbp
0xffffffff810e3a76 <get_futex_key+6>: mov %rsp,%rbp-----------------此時rsp就是rbp的值,而rsp此時指向棧頂,即ffff882f10c47e00.

crash> p /x 0xffff882f10c47e00-0x38
$3 = 0xffff882f10c47dc8

通過堆棧可以看出0xffff882f10c47dc8 位置的內容為ffffea0123302a40,它其實就是我們尋找的page指針。

crash> struct page -x ffffea0123302a40
struct page {
flags = 0x6fffff00008000,---------------這個標志,剛好包含tail標志。

使用kmem確定下:

crash> kmem ffffea0123302a40
PAGE PHYSICAL MAPPING INDEX CNT FLAGS
ffffea0123302a40 48cc0a9000 0 0 0 6fffff00008000 tail-----------------有tail標志

獲取位於head的page

crash> struct page.first_page -x ffffea0123302a40
first_page = 0xffffea0123300000
crash> struct page -x 0xffffea0123300000
struct page {
flags = 0x6016c60138402c,

crash> kmem 0xffffea0123300000------------------確認當時head的標志如下:
PAGE PHYSICAL MAPPING INDEX CNT FLAGS
ffffea0123300000 48cc000000 ffff882f74a96701 a600 74 6016c60138402c referenced,uptodate,lru,head,swapbacked,unevictable,mlocked,compound_lock

 

下面開始查看put_compound_page的代碼。

static void put_compound_page(struct page *page)
{
struct page *page_head;

/*
* We see the PageCompound set and PageTail not set, so @page maybe:
* 1. hugetlbfs head page, or
* 2. THP head page.
*/
if (likely(!PageTail(page))) {
if (put_page_testzero(page)) {
/*
* By the time all refcounts have been released
* split_huge_page cannot run anymore from under us.
*/
if (PageHead(page))
__put_compound_page(page);
else
__put_single_page(page);
}
return;
}

/*
* We see the PageCompound set and PageTail set, so @page maybe:
* 1. a tail hugetlbfs page, or
* 2. a tail THP page, or
* 3. a split THP page.
*
* Case 3 is possible, as we may race with
* __split_huge_page_refcount tearing down a THP page.
*/
page_head = compound_head_by_tail(page);page_head 即為:0xffffea0123300000
if (!__compound_tail_refcounted(page_head))-------------將首頁傳入作為參數,
put_unrefcounted_compound_page(page_head, page);
else
put_refcounted_compound_page(page_head, page);-----------------最終走的這個流程
}

從代碼可以看出,他有兩個流程,我們屬於第二種,tail頁。

解析來,需要判斷是走 put_unrefcounted_compound_page 還是走 put_refcounted_compound_page。

static inline bool __compound_tail_refcounted(struct page *page)
{
return !PageSlab(page) && !PageHeadHuge(page);
}

首頁的slab標志沒有,需要查看PageHeadHuge ,代碼如下:

int PageHeadHuge(struct page *page_head)
{
compound_page_dtor *dtor;

if (!PageHead(page_head))
return 0;

dtor = get_compound_page_dtor(page_head);-----------

return dtor == free_huge_page;
}

 

static inline compound_page_dtor *get_compound_page_dtor(struct page *page)--------------獲取page的析構函數
{
return (compound_page_dtor *)page[1].lru.next;
}

crash> p ((struct page*)0xffffea0123300000 +1)
$5 = (struct page *) 0xffffea0123300040-------------先偏移一個指針

crash> struct page.lru.next 0xffffea0123300040
lru.next = 0xffffffff81173140 <free_compound_page>,

free_compound_page與free_huge_page 不是同一個函數,所以 PageHeadHuge 返回false,

那么 __compound_tail_refcounted返回true,所以下面需要分析 put_refcounted_compound_page 流程的代碼。

 

void put_refcounted_compound_page(struct page *page_head, struct page *page)
{
if (likely(page != page_head && get_page_unless_zero(page_head))) {
unsigned long flags;

 

/*
* Try to grab a ref unless the page has a refcount of zero, return false if
* that is the case.
*/
static inline int get_page_unless_zero(struct page *page)
{
return atomic_inc_not_zero(&page->_count);---------------page_head->_count為74
}

crash> struct page._count 0xffffea0123300000
_count = {
counter = 74
}

那么,當時函數到底執行到哪一行被nmi狗咬死的呢?我們回過頭看這個RIP,

[exception RIP: put_compound_page+336]-----------------------------------------這個RIP

反匯編結果:

/usr/src/debug/kernel-3.10.0-327.22.2.el7/linux-3.10.0-327.22.2.el7.x86_64/arch/x86/include/asm/processor.h: 689
0xffffffff81178b5a <put_compound_page+330>: pause
/usr/src/debug/kernel-3.10.0-327.22.2.el7/linux-3.10.0-327.22.2.el7.x86_64/arch/x86/include/asm/bitops.h: 319
0xffffffff81178b5c <put_compound_page+332>: mov (%r12),%rax
/usr/src/debug/kernel-3.10.0-327.22.2.el7/linux-3.10.0-327.22.2.el7.x86_64/include/linux/bit_spinlock.h: 30
0xffffffff81178b60 <put_compound_page+336>: test $0x1000000,%eax

說明調用了bit_spin_lock。

根據之前打印的flag:

crash> kmem 0xffffea0123300000------------------確認當時head的標志如下:
PAGE PHYSICAL MAPPING INDEX CNT FLAGS
ffffea0123300000 48cc000000 ffff882f74a96701 a600 74 6016c60138402c referenced,uptodate,lru,head,swapbacked,unevictable,mlocked,compound_lock

說明PG_compound_lock已經被設置,假設已經被設置的鎖,這次再次自旋,豈不是死鎖了?倒是挺符合故障現象的,那么,怎么確定這把鎖是進來自來獲取的,

還是進來之后獲取的呢?也就是RIP執行的時候,到底執行到put_refcounted_compound_page下面的哪一行呢?

 

通過走查其他的堆棧,發現很多線程都是一模一樣的堆棧,都在等待這把鎖,唯獨有一個不一樣:

收集堆棧信息:

crash> bt -a >caq_bt_all.txt 

過濾put_compound_page 打印如下:

#11 [ffff882f10c47d80] put_compound_page at ffffffff81178b60
[exception RIP: put_compound_page+336]
#4 [ffff882ef1d1bd80] put_compound_page at ffffffff81178b60
[exception RIP: put_compound_page+336]
#4 [ffff882ef29fbd80] put_compound_page at ffffffff81178b60
[exception RIP: put_compound_page+336]
#4 [ffff882ef2d97d80] put_compound_page at ffffffff81178b60
[exception RIP: put_compound_page+332]
#4 [ffff882ef6d93d80] put_compound_page at ffffffff81178b5c
[exception RIP: put_compound_page+332]
#4 [ffff882ef2cb3d80] put_compound_page at ffffffff81178b5c
[exception RIP: put_compound_page+336]
#4 [ffff882ef1eafd80] put_compound_page at ffffffff81178b60
[exception RIP: put_compound_page+332]

然后獲取堆棧具體內容:

[root@host-10-229-143-10 127.0.0.1-2017-10-18-20:38:02]# grep put_compound_page -B 10 -A 5 caq_bt_all.txt |grep fas_readwriter |wc -l
34
[root@host-10-229-143-10 127.0.0.1-2017-10-18-20:38:02]# grep put_compound_page -B 10 -A 5 caq_bt_all.txt |grep put_compound_page |wc -l
69

說明有一個堆棧是使用了put_compound_page ,但是rip不是put_compound_page ,找到這個task堆棧如下:

crash> bt 24194
PID: 24194 TASK: ffff882ef4e2e780 CPU: 3 COMMAND: "fas_readwriter"
#0 [ffff882ef3927c00] __schedule at ffffffff8163df9b------------------------------------被調度出去了。
#1 [ffff882ef3927c38] put_compound_page at ffffffff81178af5  ---------------------被調度出去的返回地址
#2 [ffff882ef3927c60] put_page at ffffffff81178bac
#3 [ffff882ef3927c70] get_futex_key at ffffffff810e3cfa
#4 [ffff882ef3927cc8] futex_wait_setup at ffffffff810e46fd
#5 [ffff882ef3927d28] futex_wait at ffffffff810e4940
#6 [ffff882ef3927e70] do_futex at ffffffff810e69ee
#7 [ffff882ef3927f08] sys_futex at ffffffff810e6f20
#8 [ffff882ef3927f80] system_call_fastpath at ffffffff81649909
RIP: 00007f544487379b RSP: 00007f4a59874868 RFLAGS: 00003206
RAX: 00000000000000ca RBX: ffffffff81649909 RCX: 0000000000010000
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 000000000a6a9b28
RBP: 000000000a6a9b28 R8: 0000000000000000 R9: 0000000000005e82
R10: 0000000000000000 R11: 0000000000003246 R12: 00007f5311b49000
R13: 00007f4a59874cd0 R14: 00007f5301dfb000 R15: fffffffeffffffff
ORIG_RAX: 00000000000000ca CS: 0033 SS: 002b

crash> dis -l ffffffff81178af5 4    -----查看從哪被調度出去的,從下面信息看,還是自旋鎖,和上面hardlock的鎖一樣。
/usr/src/debug/kernel-3.10.0-327.22.2.el7/linux-3.10.0-327.22.2.el7.x86_64/include/linux/mm.h: 376
0xffffffff81178af5 <put_compound_page+229>: mov %rax,%r13
/usr/src/debug/kernel-3.10.0-327.22.2.el7/linux-3.10.0-327.22.2.el7.x86_64/arch/x86/include/asm/bitops.h: 199
0xffffffff81178af8 <put_compound_page+232>: lock btsl $0x18,(%r12)
0xffffffff81178aff <put_compound_page+239>: sbb %eax,%eax
/usr/src/debug/kernel-3.10.0-327.22.2.el7/linux-3.10.0-327.22.2.el7.x86_64/include/linux/bit_spinlock.h: 26
0xffffffff81178b01 <put_compound_page+241>: test %eax,%eax

crash> struct task_struct.stack ffff882ef4e2e780
stack = 0xffff882ef3924000

stack變量存儲的是thread_info信息

crash> struct thread_info -x 0xffff882ef3924000
struct thread_info {
task = 0xffff882ef4e2e780,
exec_domain = 0xffffffff8197be20 <default_exec_domain>,
flags = 0x88,--------------------換算成2進制就是0x10001000

#define TIF_NEED_RESCHED 3 /* rescheduling necessary */說明設置了這個標志。

#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ 說明設置了這個標志。

 在__schedule函數中,會調用clear_tsk_need_resched來清除這個標志,說明當時還沒走到這個。

 

經os同事文洋查看新的內核代碼:

commit ddc58f27f9eee9117219936f77e90ad5b2e00e96
Author: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Date: Fri Jan 15 16:52:56 2016 -0800

mm: drop tail page refcounting

Tail page refcounting is utterly complicated and painful to support.

It uses ->_mapcount on tail pages to store how many times this page is
pinned. get_page() bumps ->_mapcount on tail page in addition to
->_count on head. This information is required by split_huge_page() to
be able to distribute pins from head of compound page to tails during
the split.

We will need ->_mapcount to account PTE mappings of subpages of the
compound page. We eliminate need in current meaning of ->_mapcount in
tail pages by forbidding split entirely if the page is pinned.

The only user of tail page refcounting is THP which is marked BROKEN for
now.

Let's drop all this mess. It makes get_page() and put_page() much
simpler.

 代碼被重構了,問題記錄到此結束。


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM