Keywords: std::thread(), pthread_create(), mmap(), ENOMEM, EAGAIN, TASK_UNMAPPED_BASE, TASK_SIZE, and so on.
This article describes a process that hits "Resource temporarily unavailable", which is gradually traced back to a failed std::thread() creation. The analysis then works from the kernel down to libpthread.so and, with the help of /proc/<pid>/maps, finds that mmap() has run out of virtual address space.
The problem is solved by modifying the kernel's mmap region.
1. Problem description
After the program had been running for a while, it printed "std::system_error what(): Resource temporarily unavailable" and then exited abnormally; at the time, the program was also in the middle of taking a coredump.
2. Problem analysis
While the program was dumping core, the following script was used to periodically capture related information.
It records maps/status/stack/syscall for every thread of the process.
#!/bin/bash
# Usage: ./iterate_threads_info.sh <process_name> <interval_in_seconds>

function iterate_threads_info()
{
    pids=`pidof $1`
    output_file="$1.info"
    echo -e "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\nRecord start: "$pids >> $output_file
    for pid in $pids
    do
        cat /proc/$pid/maps >> $output_file
        for tid in `ls /proc/$pid/task/`
        do
            echo -e "################################################################################" >> $output_file
            echo "Pid:" $pid "->" $tid >> $output_file
            echo "************************** status ********************************" >> $output_file
            cat /proc/$pid/task/$tid/status >> $output_file
            echo "************************** stack ********************************" >> $output_file
            cat /proc/$pid/task/$tid/stack >> $output_file
            echo "************************** syscall ********************************" >> $output_file
            cat /proc/$pid/task/$tid/syscall >> $output_file
            echo -e "\n\n" >> $output_file
        done
        echo -e "\n\n" >> $output_file
    done
}

while true
do
    iterate_threads_info $1
    sleep $2
done
From the per-thread stacks, the thread that was dumping core was identified, which confirmed which thread caused the coredump.
It then turned out that only 3 of the threads created by this code existed, where there should have been 4, so thread creation was suspected to have failed.
The thread-creation code is:
void xxx::proc(void* rx_task, int32_t task_id)
{
    std::thread([this, rx_task, res_handle]() {
        ...
    }).detach();
}
Then a try...catch block was added:
void xxx::proc(void* rx_task, int32_t task_id)
{
    try {
        std::thread([this, rx_task, res_handle]() {
            ...
        }).detach();
    } catch (const std::system_error &e) {
        std::cout << thread_count << " : " << e.code() << " meaning " << e.what() << std::endl;
    }
}
Running again showed that std::thread() failed while creating the thread, with error code EAGAIN.
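For reference, EAGAIN can also be identified portably by comparing the exception's error code against std::errc::resource_unavailable_try_again. The following is only a minimal, self-contained sketch (the real worker body and surrounding class are omitted):

#include <iostream>
#include <system_error>
#include <thread>

static void spawn_worker()
{
    try {
        std::thread([] { /* worker body */ }).detach();
    } catch (const std::system_error &e) {
        if (e.code() == std::errc::resource_unavailable_try_again)
            std::cerr << "EAGAIN from thread creation: " << e.what() << std::endl;
        else
            std::cerr << "other error " << e.code() << ": " << e.what() << std::endl;
    }
}

int main() { spawn_worker(); return 0; }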
3. Tracing the root cause
From top to bottom, thread creation goes std::thread() -> pthread_create() -> clone() -> do_fork().
The analysis below starts at the bottom and works out exactly who returns EAGAIN.
3.1 Kernel fork() analysis
The first suspicion was that EAGAIN was produced inside the kernel's fork() path.
In the kernel, the fork() system call goes do_fork() -> _do_fork() -> copy_process(); the focus here is on the places that can return EAGAIN.
long _do_fork(unsigned long clone_flags, unsigned long stack_start,
              unsigned long stack_size, int __user *parent_tidptr,
              int __user *child_tidptr, unsigned long tls)
{
    ...
    p = copy_process(clone_flags, stack_start, stack_size,
                     child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
    add_latent_entropy();

    if (!IS_ERR(p)) {
        ...
    } else {
        nr = PTR_ERR(p);
    }
    return nr;
}

static __latent_entropy struct task_struct *copy_process(
                    unsigned long clone_flags,
                    unsigned long stack_start,
                    unsigned long stack_size,
                    int __user *child_tidptr,
                    struct pid *pid,
                    int trace,
                    unsigned long tls,
                    int node)
{
    int retval;
    struct task_struct *p;
    ...
    retval = security_task_create(clone_flags);
    if (retval)
        goto fork_out;

    retval = -ENOMEM;
    p = dup_task_struct(current, node);
    if (!p)
        goto fork_out;

    ftrace_graph_init_task(p);

    rt_mutex_init_task(p);

#ifdef CONFIG_PROVE_LOCKING
    DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
    DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
    retval = -EAGAIN;
    if (atomic_read(&p->real_cred->user->processes) >=
            task_rlimit(p, RLIMIT_NPROC)) {
        if (p->real_cred->user != INIT_USER &&
            !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
            /* Debug print added for this investigation. */
            printk("%s, processes=%d, RLIMIT_NPROC=%lu\n", __func__,
                   atomic_read(&p->real_cred->user->processes),
                   task_rlimit(p, RLIMIT_NPROC));
            goto bad_fork_free;
        }
    }
    current->flags &= ~PF_NPROC_EXCEEDED;

    retval = copy_creds(p, clone_flags);
    if (retval < 0)
        goto bad_fork_free;

    retval = -EAGAIN;
    if (nr_threads >= max_threads) {
        /* Debug print added for this investigation. */
        printk("%s, nr_threads=%d, max_threads=%d\n", __func__,
               nr_threads, max_threads);
        goto bad_fork_cleanup_count;
    }
    ...
    recalc_sigpending();
    if (signal_pending(current)) {
        retval = -ERESTARTNOINTR;
        goto bad_fork_cancel_cgroup;
    }
    if (unlikely(!(ns_of_pid(pid)->nr_hashed & PIDNS_HASH_ADDING))) {
        retval = -ENOMEM;
        goto bad_fork_cancel_cgroup;
    }
    ...
    return p;
    ...
fork_out:
    /* Debug print added for this investigation. */
    if (retval < 0)
        printk("%s comm=%s pid=%d", __func__, current->comm, current->pid);
    return ERR_PTR(retval);
}
With these debug prints, any failure inside the kernel fork() path would leave a trace in the kernel log.
When the test was re-run, the application again caught Resource temporarily unavailable, but the kernel produced no matching log, so the EAGAIN does not come from the kernel's fork() path.
3.2 Thread creation std::thread() -> pthread_create(): allocating the thread stack
pthread_create() lives in libpthread.so and is defined in glibc/nptl/pthread_create.c.
versioned_symbol (libpthread, __pthread_create_2_1, pthread_create, GLIBC_2_1);

int
__pthread_create_2_1 (pthread_t *newthread, const pthread_attr_t *attr,
                      void *(*start_routine) (void *), void *arg)
{
  STACK_VARIABLES;

  const struct pthread_attr *iattr = (struct pthread_attr *) attr;
  struct pthread_attr default_attr;
  bool free_cpuset = false;
  bool c11 = (attr == ATTR_C11_THREAD);
  if (iattr == NULL || c11)     /* C11 threads (and a NULL attr) have no attr set; a default attr is used. */
    {
      ...
    }

  struct pthread *pd = NULL;
  int err = ALLOCATE_STACK (iattr, &pd);    /* Allocate a stack for the thread; the size comes from the
                                               system rlimit (RLIMIT_STACK), 8 MB by default. */
  int retval = 0;

  if (__glibc_unlikely (err != 0))
    {
      /* Debug print added for this investigation: an 8 MB allocation can easily fail here.
         The observed return value is 0xc, i.e. ENOMEM. */
      printf("%s(%d): ALLOCATE_STACK failed err=%x.\n", __func__, __LINE__, err);
      retval = err == ENOMEM ? EAGAIN : err;
      goto out;
    }
  ...
  pd->start_routine = start_routine;
  pd->arg = arg;
  pd->c11 = c11;

  /* Copy the thread attribute flags.  */
  struct pthread *self = THREAD_SELF;
  pd->flags = ((iattr->flags & ~(ATTR_FLAG_SCHED_SET | ATTR_FLAG_POLICY_SET))
               | (self->flags & (ATTR_FLAG_SCHED_SET | ATTR_FLAG_POLICY_SET)));
  pd->joinid = iattr->flags & ATTR_FLAG_DETACHSTATE ? pd : NULL;
  pd->eventbuf = self->eventbuf;
  pd->schedpolicy = self->schedpolicy;
  pd->schedparam = self->schedparam;
  ...
  bool stopped_start = false;
  bool thread_ran = false;

  /* Start the thread; create_thread() ends up in the kernel's clone() system call.  */
  if (__glibc_unlikely (report_thread_creation (pd)))
    {
      stopped_start = true;

      retval = create_thread (pd, iattr, &stopped_start,
                              STACK_VARIABLES_ARGS, &thread_ran);
      if (retval == 0)
        {
          assert (stopped_start);
          assert (pd->stopped_start);

          pd->eventbuf.eventnum = TD_CREATE;
          pd->eventbuf.eventdata = pd;

          do
            pd->nextevent = __nptl_last_event;
          while (atomic_compare_and_exchange_bool_acq (&__nptl_last_event,
                                                       pd, pd->nextevent)
                 != 0);

          __nptl_create_event ();
        }
    }
  else
    retval = create_thread (pd, iattr, &stopped_start,
                            STACK_VARIABLES_ARGS, &thread_ran);

  if (__glibc_unlikely (retval != 0))
    {
      if (thread_ran)
        assert (stopped_start);
      else
        {
          atomic_decrement (&__nptl_nthreads);

          if (__glibc_unlikely (atomic_exchange_acq (&pd->setxid_futex, 0) == -2))
            futex_wake (&pd->setxid_futex, 1, FUTEX_PRIVATE);

          /* Free the resources.  */
          __deallocate_stack (pd);
        }

      /* We have to translate error codes.  */
      /* Debug print added for this investigation.  */
      printf("%s(%d): create_thread() failed retval=%x.\n", __func__, __LINE__, retval);
      if (retval == ENOMEM)
        retval = EAGAIN;
    }
  else
    {
      if (stopped_start)
        lll_unlock (pd->lock, LLL_PRIVATE);

      THREAD_SETMEM (THREAD_SELF, header.multiple_threads, 1);
    }

 out:
  if (__glibc_unlikely (free_cpuset))
    free (default_attr.cpuset);

  return retval;
}

# define ALLOCATE_STACK(attr, pd) \
  allocate_stack (attr, pd, &stackaddr, &stacksize)

static int
allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
                ALLOCATE_STACK_PARMS)
{
  struct pthread *pd;
  size_t size;
  size_t pagesize_m1 = __getpagesize () - 1;

  assert (powerof2 (pagesize_m1 + 1));
  assert (TCB_ALIGNMENT >= STACK_ALIGN);

  /* Get the stack size from the attribute if it is set.  Otherwise we
     use the default we determined at start time.  */
  if (attr->stacksize != 0)
    size = attr->stacksize;
  else
    {
      lll_lock (__default_pthread_attr_lock, LLL_PRIVATE);
      size = __default_pthread_attr.stacksize;
      lll_unlock (__default_pthread_attr_lock, LLL_PRIVATE);
    }

  /* Get memory for the stack.  */
  if (__glibc_unlikely (attr->flags & ATTR_FLAG_STACKADDR))
    {
      /* ATTR_FLAG_STACKADDR means the caller supplies the stack memory;
         this is normally not used.  */
      ...
    }
  else
    {
      /* Allocate some anonymous memory with mmap().  If possible use the cache.  */
      size_t guardsize;
      size_t reqsize;
      void *mem;
      const int prot = (PROT_READ | PROT_WRITE
                        | ((GL(dl_stack_flags) & PF_X) ? PROT_EXEC : 0));

      /* Adjust the stack size for alignment.  */
      size &= ~__static_tls_align_m1;
      assert (size != 0);

      guardsize = (attr->guardsize + pagesize_m1) & ~pagesize_m1;
      if (guardsize < attr->guardsize || size + guardsize < guardsize)
        /* Arithmetic overflow.  */
        return EINVAL;
      size += guardsize;    /* The mmap() covers both stack and guard, hence the 0x00801000 seen below. */
      if (__builtin_expect (size < ((guardsize + __static_tls_size
                                     + MINIMAL_REST_STACK + pagesize_m1)
                                    & ~pagesize_m1),
                            0))
        /* The stack is too small (or the guard too large).  */
        return EINVAL;

      /* Try to get a stack from the cache.  */
      reqsize = size;
      pd = get_cached_stack (&size, &mem);
      if (pd == NULL)
        {
#if MULTI_PAGE_ALIASING != 0
          if ((size % MULTI_PAGE_ALIASING) == 0)
            size += pagesize_m1 + 1;
#endif
          /* Get a block of 'size' bytes from the kernel with the mmap() system call.  */
          mem = __mmap (NULL, size, (guardsize == 0) ? prot : PROT_NONE,
                        MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);

          if (__glibc_unlikely (mem == MAP_FAILED))
            {
              /* Debug print added for this investigation.  The observed failure log is:
                   allocate_stack(604):size=0x00801000, guardsize=0x00001000, prot=0x00000003, Cannot allocate memory
                 prot asks for read/write but not execute permission, while the mapping itself
                 is requested with PROT_NONE because guardsize != 0.  */
              printf("%s(%d):size=0x%08x, guardsize=0x%08x, prot=0x%08x, %s\n",
                     __func__, __LINE__, size, guardsize, prot, strerror(errno));
              /* Instrumentation hook: capture /proc/self/maps at the failure point (see 3.3.1).  */
              dump_self_maps();
              return errno;
            }

          assert (mem != NULL);

          /* Place the thread descriptor at the end of the stack.  */
#if TLS_TCB_AT_TP
          pd = (struct pthread *) ((char *) mem + size) - 1;
#elif TLS_DTV_AT_TP
          pd = (struct pthread *) ((((uintptr_t) mem + size - __static_tls_size)
                                    & ~__static_tls_align_m1)
                                   - TLS_PRE_TCB_SIZE);
#endif

          if (__glibc_likely (guardsize > 0))
            {
              char *guard = guard_position (mem, size, guardsize, pd, pagesize_m1);
              /* mprotect() the guard area and make the stack itself read/write,
                 so the stack shows up as rw-p and the guard as ---p in maps.  */
              if (setup_stack_prot (mem, size, guard, guardsize, prot) != 0)
                {
                  __munmap (mem, size);
                  printf("%s(%d):%s\n", __func__, __LINE__, strerror(errno));
                  return errno;
                }
            }

          pd->stackblock = mem;               /* Start address of the stack block.  */
          pd->stackblock_size = size;         /* Size of the stack block.  */
          pd->guardsize = guardsize;          /* Size of the guard area.  */
          pd->specific[0] = pd->specific_1stblock;
          pd->header.multiple_threads = 1;
#ifndef TLS_MULTIPLE_THREADS_IN_TCB
          __pthread_multiple_threads = *__libc_multiple_threads_ptr = 1;
#endif
#ifdef NEED_DL_SYSINFO
          SETUP_THREAD_SYSINFO (pd);
#endif
          pd->setxid_futex = -1;
          ...
          /* Prepare to modify global data.  */
          lll_lock (stack_cache_lock, LLL_PRIVATE);

          /* And add to the list of stacks in use.  */
          stack_list_add (&pd->list, &stack_used);

          lll_unlock (stack_cache_lock, LLL_PRIVATE);

          if (__builtin_expect ((GL(dl_stack_flags) & PF_X) != 0
                                && (prot & PROT_EXEC) == 0, 0))
            {
              int err = change_stack_perm (pd
#ifdef NEED_SEPARATE_REGISTER_STACK
                                           , ~pagesize_m1
#endif
                                           );
              if (err != 0)
                {
                  /* Free the stack memory we just allocated.  */
                  (void) __munmap (mem, size);
                  printf("%s(%d):%s\n", __func__, __LINE__, strerror(errno));
                  return err;
                }
            }
        }
      ...
      pd->reported_guardsize = guardsize;
    }

  /* Initialize the lock.  We have to do this unconditionally since the
     stillborn thread could be canceled while the lock is taken.  */
  pd->lock = LLL_LOCK_INITIALIZER;

  /* The robust mutex lists also need to be initialized
     unconditionally because the cleanup for the previous stack owner
     might have happened in the kernel.  */
  pd->robust_head.futex_offset = (offsetof (pthread_mutex_t, __data.__lock)
                                  - offsetof (pthread_mutex_t, __data.__list.__next));
  pd->robust_head.list_op_pending = NULL;
#if __PTHREAD_MUTEX_HAVE_PREV
  pd->robust_prev = &pd->robust_head;
#endif
  pd->robust_head.list = &pd->robust_head;

  /* We place the thread descriptor at the end of the stack.  */
  *pdp = pd;
  ...
  return 0;
}
So pthread_create() obtains the stack + guard memory from the kernel with mmap(), and then hands stackaddr and stacksize to the kernel when the thread is actually created.
In other words, the stack of a thread created through the kernel is allocated from user-space memory.
Testing confirmed that allocate_stack() did indeed return ENOMEM, and that ENOMEM was then translated into EAGAIN.
That is why std::thread() ends up seeing EAGAIN.
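This failure mode can be reproduced outside the real program. The sketch below is only an illustration under the assumption of a 32-bit process with the default 8 MB thread stack: it keeps creating detached threads that never exit, and on such a system it typically stops with EAGAIN after a couple of hundred threads, long before physical memory is exhausted.

#include <chrono>
#include <cstdio>
#include <system_error>
#include <thread>

int main()
{
    int created = 0;
    try {
        for (;;) {
            /* Each detached thread keeps its ~8 MB stack mapping alive. */
            std::thread([] { std::this_thread::sleep_for(std::chrono::hours(24)); }).detach();
            ++created;
        }
    } catch (const std::system_error &e) {
        std::printf("thread #%d failed: %s\n", created + 1, e.what());
    }
    return 0;
}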
3.3 Root cause of the stack allocation failure
top and /proc/meminfo showed that memory still had plenty of headroom, yet ENOMEM appeared at exactly this point. So the investigation turned to the maps.
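The metric that actually matters here is virtual address space, not free physical memory. A small sketch (my own addition, not part of the original instrumentation) that prints the VmPeak/VmSize lines from /proc/self/status makes this visible: in a 32-bit process these values approach the ~2 GB user address space limit when mmap() starts to fail.

#include <fstream>
#include <iostream>
#include <string>

void print_vm_usage()
{
    std::ifstream status("/proc/self/status");
    std::string line;
    while (std::getline(status, line)) {
        /* VmPeak: peak virtual size, VmSize: current virtual size. */
        if (line.compare(0, 6, "VmPeak") == 0 || line.compare(0, 6, "VmSize") == 0)
            std::cout << line << std::endl;
    }
}

int main() { print_vm_usage(); return 0; }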
3.3.1 Capturing maps at the moment the stack allocation fails
After allocate_stack() fails, the following function is called to capture the current process's maps.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/prctl.h>

#define SELF_MAPS "/proc/self/maps"

void dump_self_maps(void)
{
    FILE *self_fd, *output_fd;
    char buf[1024], output_name[32], process_name[16];

    memset(buf, 0x0, sizeof(buf));
    memset(output_name, 0x0, sizeof(output_name));
    prctl(PR_GET_NAME, process_name);
    snprintf(output_name, sizeof(output_name), "/tmp/maps_%s_%d.txt", process_name, getpid());

    self_fd = fopen(SELF_MAPS, "r");
    if (self_fd == NULL) {
        perror("open file");
        exit(0);
    }

    output_fd = fopen(output_name, "w");
    if (output_fd == NULL) {
        perror("open file");
        exit(0);
    }

    while (fgets(buf, sizeof(buf), self_fd) != NULL) {
        fputs(buf, output_fd);
        //printf("%s", buf);
    }

    fclose(output_fd);
    fclose(self_fd);
}
For a detailed walk-through of maps, see the separate note "A brief record of /proc/xxx/maps".
3.3.2 Analyzing why the allocation seen in maps failed
Throughout the process, /proc/meminfo and top were used to watch total system memory and the process's virtual memory, and both still showed plenty of headroom.
So the maps themselves have to be examined to see why the allocation failed.
00008000-0001d000 r-xp 00000000 b3:01 1102       /usr/bin/xchip_runtime
0001e000-00020000 r--p 00015000 b3:01 1102       /usr/bin/xchip_runtime
00020000-00021000 rw-p 00017000 b3:01 1102       /usr/bin/xchip_runtime
00021000-00cdd000 rwxp 00000000 00:00 0          [heap]
2aaa8000-2aac5000 r-xp 00000000 b3:01 957        /lib/ld-2.28.9000.so
2aac5000-2aac6000 r--p 0001c000 b3:01 957        /lib/ld-2.28.9000.so
2aac6000-2aac7000 rw-p 0001d000 b3:01 957        /lib/ld-2.28.9000.so
2aac7000-2aac8000 r-xp 00000000 00:00 0          [vdso]
2aac8000-2aaca000 rw-p 00000000 00:00 0
2aaca000-2ab0a000 r-xp 00000000 b3:01 1104       /usr/lib/libnncv_engine.so
2ab0a000-2ab0b000 ---p 00040000 b3:01 1104       /usr/lib/libnncv_engine.so
2ab0b000-2ab0e000 r--p 00040000 b3:01 1104       /usr/lib/libnncv_engine.so
...
7ea00000-7eaff000 rw-p 00000000 00:00 0
7eaff000-7eb00000 ---p 00000000 00:00 0
7eb00000-7ebff000 rw-p 00000000 00:00 0
7ebff000-7ec00000 ---p 00000000 00:00 0
7ec00000-7ecfa000 rw-p 00000000 00:00 0
7ecfa000-7ed00000 ---p 00000000 00:00 0
7ee00000-7eef6000 rw-p 00000000 00:00 0
7eef6000-7ef00000 ---p 00000000 00:00 0
7ef00000-7ef01000 ---p 00000000 00:00 0
7ef01000-7f701000 rw-p 00000000 00:00 0
7f800000-7f900000 rw-p 00000000 00:00 0
7fe58000-7fe79000 rwxp 00000000 00:00 0          [stack]
The following script summarizes the maps and makes them easier to read.
import re
import pandas as pd

maps_list = []
maps_file = open('./maps_aie_thd_165.txt', 'rb')
maps_columns = ["start", "end", "size(KB)", "filename"]
maps_process_end = '80000000'
pre_end = 0

for line in maps_file:
    #00008000-0000b000 r-xp 00000000 b3:01 1023       /root/pidmax
    #0000b000-0000c000 r--p 00002000 b3:01 1023       /root/pidmax
    #0000c000-0000d000 rw-p 00003000 b3:01 1023       /root/pidmax
    maps_line_fmt = '(?P<start>.{8})-(?P<end>.{8}) (?P<permission>.{4}) (?P<size>.{8}) (?P<major>.{2}):(?P<minor>.{2}) (?P<handle>[0-9]*) *(?P<filename>.*)'
    m = re.match(maps_line_fmt, line)
    if(not m):
        continue

    start = m.group('start')
    end = m.group('end')
    permission = m.group('permission')
    #size = m.group('size')
    #major = m.group('major')
    #minor = m.group('minor')
    #handle = m.group('handle')
    filename = m.group('filename')

    start_int = int(start, 16)
    end_int = int(end, 16)

    if(pre_end != start_int):
        maps_list.append(["{:0>8x}".format(pre_end), start, (start_int - pre_end)/1024, "NOT USED"])

    #print start+','+end+','+size+','+permission+','+filename
    maps_list.append([start, end, (end_int - start_int)/1024, filename])
    pre_end = end_int

maps_file.close()

maps_list.append([end, maps_process_end, (int(maps_process_end, 16) - end_int)/1024, "NOT USED"])

maps_pd = pd.DataFrame(columns=maps_columns, data=maps_list)
maps_pd.to_csv("maps.csv", encoding='gb2312')

print 'Total memory =', maps_pd['size(KB)'].sum()/1024, '(MB)'
The parsed result (the generated maps.csv) shows that roughly 1340 MB of virtual address space is in use; against the 2 GB user address space, 600+ MB should still be left.
It also shows that one large memory region is not used at all.
Closer inspection shows that this unused region sits between the heap and the shared libraries; it is the area reserved for heap (brk) growth.
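A quick calculation against the maps above illustrates the size of this gap (assuming a 2 GB user address space ending at TASK_SIZE = 0x7fff8000):

    heap ends at      0x00cdd000
    ld.so starts at   0x2aaa8000
    gap             = 0x2aaa8000 - 0x00cdd000 = 0x29dcb000, roughly 670 MB

So roughly 670 MB of the "free" virtual address space lies below the start of the mmap region and can never be used for thread stacks; the usable mmap window [0x2aaa8000, 0x7fff8000) is only about 1365 MB, which is why mmap() fails even though the process has mapped "only" about 1340 MB in total.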
This region can be shrunk, and the test can then be repeated.
4. Solving the problem (enlarging the mmap region)
To enlarge the mmap region, we first need to understand how the kernel lays it out.
#define TASK_SIZE           0x7fff8000UL
#define TASK_UNMAPPED_BASE  (TASK_SIZE / 3)
TASK_UNMAPPED_BASE turns out to be TASK_SIZE / 3, i.e. 0x7fff8000 / 3 = 0x2AAA8000.
So mmap() allocations start at 0x2AAA8000, which is also why there is such a large unused gap between the heap and ld.so.
When a new process is exec'd, setup_new_exec() sets up the mmap region.
void setup_new_exec(struct linux_binprm * bprm)
{
    arch_pick_mmap_layout(current->mm);

    /* This is the point of no return */
    current->sas_ss_sp = current->sas_ss_size = 0;
    ...
}

void arch_pick_mmap_layout(struct mm_struct *mm)
{
    mm->mmap_base = TASK_UNMAPPED_BASE;
    mm->get_unmapped_area = arch_get_unmapped_area;
}

unsigned long
arch_get_unmapped_area(struct file *filp, unsigned long addr,
                       unsigned long len, unsigned long pgoff, unsigned long flags)
{
    struct mm_struct *mm = current->mm;
    struct vm_area_struct *vma, *prev;
    struct vm_unmapped_area_info info;

    if (len > TASK_SIZE - mmap_min_addr)
        return -ENOMEM;

    if (flags & MAP_FIXED)
        return addr;

    if (addr) {
        addr = PAGE_ALIGN(addr);
        vma = find_vma_prev(mm, addr, &prev);
        if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
            (!vma || addr + len <= vm_start_gap(vma)) &&
            (!prev || addr >= vm_end_gap(prev)))
            return addr;
    }

    info.flags = 0;
    info.length = len;
    info.low_limit = mm->mmap_base;   /* The mmap region is [mm->mmap_base, TASK_SIZE]. */
    info.high_limit = TASK_SIZE;
    info.align_mask = 0;
    return vm_unmapped_area(&info);
}

unsigned long
get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
                  unsigned long pgoff, unsigned long flags)
{
    unsigned long (*get_area)(struct file *, unsigned long,
                              unsigned long, unsigned long, unsigned long);

    unsigned long error = arch_mmap_check(addr, len, flags);
    if (error)
        return error;

    /* Careful about overflows.. */
    if (len > TASK_SIZE)
        return -ENOMEM;

    get_area = current->mm->get_unmapped_area;
    if (file) {
        if (file->f_op->get_unmapped_area)
            get_area = file->f_op->get_unmapped_area;
    } else if (flags & MAP_SHARED) {
        pgoff = 0;
        get_area = shmem_get_unmapped_area;
    }

    addr = get_area(file, addr, len, pgoff, flags);
    if (IS_ERR_VALUE(addr))
        return addr;

    if (addr > TASK_SIZE - len)
        return -ENOMEM;
    if (offset_in_page(addr))
        return -EINVAL;

    error = security_mmap_addr(addr);
    return error ? error : addr;
}
So the mmap region is fixed when the process is created, and the same limits are enforced again on every mmap() call.
Therefore it is enough to modify TASK_UNMAPPED_BASE.
#define TASK_UNMAPPED_BASE  0x10000000    /* Changed from 0x2aaa8000 (TASK_SIZE / 3) to 0x10000000. */
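With these example values, the change enlarges the mmap window considerably:

    old window: [0x2aaa8000, 0x7fff8000) = 0x55550000 bytes, roughly 1365 MB
    new window: [0x10000000, 0x7fff8000) = 0x6fff8000 bytes, roughly 1792 MB

That is about 426 MB of additional address space for thread stacks and other mmap() allocations, at the cost of a smaller maximum brk()/heap range.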
After rebuilding the kernel and re-running, the mmap region is much larger, thread creation succeeds, and the problem is solved.
4.1 Reducing stack usage by changing the pthread stacksize
The stack size can be specified for pthread_create(), but threads created with std::thread() offer no interface for changing the stacksize.
So threads created with std::thread() use the default stack size reported by ulimit -s. This default comes from the system and can be changed with ulimit -s <size>.
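If rebuilding the kernel is not an option, another angle (my own suggestion, not part of the original investigation) is to shrink the default stack size that std::thread() inherits. This sketch assumes glibc 2.18 or newer, which provides pthread_setattr_default_np(); since std::thread passes a NULL attribute to pthread_create(), it picks up this process-wide default:

#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <cstddef>
#include <pthread.h>
#include <thread>

static void shrink_default_thread_stack(size_t bytes)
{
    pthread_attr_t attr;
    pthread_getattr_default_np(&attr);        /* start from the current defaults */
    pthread_attr_setstacksize(&attr, bytes);  /* e.g. 512 KB instead of 8 MB */
    pthread_setattr_default_np(&attr);        /* later pthread_create(..., NULL, ...) uses this */
    pthread_attr_destroy(&attr);
}

int main()
{
    shrink_default_thread_stack(512 * 1024);
    std::thread t([] { /* runs on a 512 KB stack */ });
    t.join();
    return 0;
}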
With raw pthreads, pthread_attr_setstacksize() changes the stack size of a newly created thread:
void* foo(void* arg);
...
pthread_attr_t attribute;
pthread_t thread;

pthread_attr_init(&attribute);
pthread_attr_setstacksize(&attribute, 64 * 1024);   /* Set the thread stack size; it must be at least PTHREAD_STACK_MIN. */
pthread_create(&thread, &attribute, foo, 0);
pthread_join(thread, 0);
In C++, boost::thread::attributes can be used to set the stack size of the created thread:

boost::thread::attributes attrs;
attrs.set_size(4096 * 10);
boost::thread myThread(attrs, fooFunction, 42);
References: "c++ - pthread - thread stack size limit", "How to set the stacksize with C++11 std::thread".