在不保留core文件的情況下,如何獲取程序崩潰時候的上下文信息(主要是函數調用棧)
1.coredump原理
當程序發生內存越界訪問等行為時,會觸發OS的保護機制,此時OS會產生一個信號(signal)發送給對應的進程。當進程從內核態到用戶態切換時,該進程會處理這個信號。此類信號(比如SEGV)的默認處理行為是生成一個coredump文件。
這里會涉及以下幾個問題:
1. 保存的core文件在什么地方?
2. core文件,具體會把進程地址空間的哪些內容保存下來?
3. 如何控制core文件的大小?
4. 如果在處理信號的時候,又產生了新的同類信號,該如何處理?
5. 處理信號的代碼,是運行在用戶態還是內核態?
6. 在一個多線程的程序中,是由哪個線程在處理這個信號?
`/proc/sys/kernel/core_uses_pid` 取值是0或者1,表示是否在core文件名字后面加上進程號
`/proc/$pid/coredump_filter` 設置哪些內存會被dump出來
- (bit 0) anonymous private memory
- (bit 1) anonymous shared memory
- (bit 2) file-backed private memory
- (bit 3) file-backed shared memory
- (bit 4) ELF header pages in file-backed private memory areas (it is effective only if the bit 2 is cleared)
- (bit 5) hugetlb private memory
- (bit 6) hugetlb shared memory
- (bit 7) DAX private memory
- (bit 8) DAX shared memory
2.自定義信號處理函數
需要在自定義的信號處理函數中打印出程序崩潰時候的活躍函數堆棧信息。
這里我們有兩種方式:
1.使用backtrace等方法,讀取進程堆棧上的信息;
2.在函數調用的同時,用戶自己維護一套數據結構,用於保存函數調用鏈,在信號處理函數中,將這個函數調用鏈打印出來

/*
 * Abridged example from the article: a crash-report signal handler.
 * The listing is illustrative and does not compile as shown. It contains
 * placeholders (the `xxxx` size arguments, the dashed line inside the
 * unwind loop) and uses names that are not defined in this listing
 * (signal_func, foreach_stack_rang, save_stacktrace, rname_index, logpath,
 * logfile, stackinfo, buglogpath, bug_process_name, bug_signal_cb, fd).
 *
 * bugsignal_register(sig)
 *     Installs signal_core_bugreport for `sig` via sigaction() with
 *     SA_SIGINFO; returns non-zero when sigaction() did not return -1.
 * signal_core_bugreport(sig, info, ptr)
 *     Dispatches on the signal number. SIGSEGV/SIGABRT/SIGFPE/SIGILL/SIGBUS:
 *     calls write_stack_msg(), restores SIG_DFL, then sends the same signal
 *     to its own pid with kill(). SIGTERM/SIGINT/SIGQUIT: calls
 *     bugreport_def_return(). SIGUSR1/SIGUSR2: only call write_stack_msg().
 * bugreport_def_return(sig)
 *     Resets SIGTERM to SIG_DFL (the `|| true` makes this unconditional)
 *     and calls exit(0).
 * write_stack_msg(sig, info, ptr, logfile_suffix)
 *     Formats the siginfo_t fields into stackinfo, dumps uc_mcontext word by
 *     word, loops while bp and ip are non-null resolving ip with dladdr(),
 *     passes arm_sp to save_stacktrace(), closes fd and returns 0.
 *     It reads arm_pc / arm_fp / arm_sp from uc_mcontext.
 * signal_bugreport_setup()
 *     Calls bug_signal_cb() for SIGSEGV, SIGABRT, SIGFPE, SIGINT, SIGBUS,
 *     SIGILL, SIGQUIT and SIGTERM, and bug_signal_inore() for SIGHUP and
 *     SIGPIPE; returns 0. Presumably bug_signal_cb wraps
 *     bugsignal_register -- TODO confirm.
 * bugreport_def_term(sig)
 *     Prints an exit message, resets SIGTERM to SIG_DFL only when sig is
 *     SIGTERM, and calls exit(0).
 * dump_trace(Signal)
 *     Prints up to 200 frames from backtrace()/backtrace_symbols(), frees the
 *     symbol array and calls exit(0).
 *
 * NOTE(review): items to confirm against the real source:
 *   - `bugreporteak;` is presumably a mangled `break;`.
 *   - bugreport_def_return passes stderr as snprintf's buffer; fprintf was
 *     presumably intended.
 *   - `int f = 0;` is declared while the code writes to `fd`; `fd` is given
 *     to both write() and fprintf().
 *   - "\nStack trace:\n\n" is written with the strlen of a shorter literal.
 *   - `bugreport_def_return(sig)` has no terminating semicolon.
 *   - `::backtrace` is C++ scope syntax.
 *   - In this collapsed one-line layout each `//` comment runs to the end of
 *     the physical line, so the code after it on that line is lexically
 *     commented out.
 */ int bugsignal_register(const int sig) { struct sigaction action; memset(&action, 0, sizeof(action)); action.sa_sigaction = signal_core_bugreport; action.sa_flags = SA_SIGINFO; return (-1 != sigaction(sig, &action, NULL)); } inline static void signal_core_bugreport(const int sig, siginfo_t * info, void * ptr) { switch (sig) { case SIGSEGV: case SIGABRT: case SIGFPE: case SIGILL: case SIGBUS: { signal(sig, signal_func); write_stack_msg(sig, info, ptr,"txt"); signal(sig, SIG_DFL); kill(getpid(),sig);//re-send the signal so that a core file is generated } break; case SIGTERM: case SIGINT: case SIGQUIT: { bugreport_def_return(sig) } break; case SIGUSR1: { write_stack_msg(sig, info, ptr,"sigusr1"); break; } case SIGUSR2: { write_stack_msg(sig, info, ptr,"sigusr2"); break; } default: break; } } static inline void bugreport_def_return(const int sig) { snprintf(stderr, 255, " normally exit , pid:%d, sig:%d\n", getpid(), sig); if (SIGTERM == sig || true) { signal(SIGTERM, SIG_DFL); } exit(0); } static inline int write_stack_msg(const int sig, siginfo_t * info, void * ptr,const char* logfile_suffix) { static const char * si_codes[3] = {"", "SEGV_MAPERR", "SEGV_ACCERR"}; size_t i = 0; ucontext_t * ucontext = (ucontext_t *)ptr; unsigned long stack_start = 0; unsigned long stack_end = 0; snprintf(logpath, "xxxxxxxxxxxxxxxx", buglogpath); foreach_stack_rang(gettid(), "/proc/getpid()/maps",&stack_start, &stack_end); umask(0); snprintf(logfile, xxxx, "%s/%s_time().txt", buglogpath, bug_process_name); int f = 0; Dl_info dl_info; void ** bp = 0; void * ip = 0; if (info->si_code >= 0 && info->si_code < 3) { snprintf(stackinfo, xxxxxx, "Segmentation Fault!\n" "info.si_signo = %d\n" "info.si_errno = %d\n" "info.si_code = %d (%s)\n" "info.si_pid = %d\n" "info.si_addr = %p\n", sig, info->si_errno, info->si_code, si_codes[info->si_code], info->si_pid, info->si_addr ); } else { snprintf(stackinfo,xxxxx, "Segmentation Fault!\n" "info.si_signo = %d\n" "info.si_errno = %d\n" "info.si_code = %d\n" "info.si_pid = %d\n" 
"info.si_addr = %p\n", sig, info->si_errno, info->si_code, info->si_pid, info->si_addr ); } write(fd, stackinfo, strlen(stackinfo)); ip = (void *)ucontext->uc_mcontext.arm_pc; bp = (void **)ucontext->uc_mcontext.arm_fp; write(fd, "REG:\n", strlen("REG:\n")); for (i = 0; i < sizeof(ucontext->uc_mcontext)/sizeof(unsigned long); i++) { fprintf(fd, "\t%s: 0x%08lx", rname_index[i], ((unsigned long*)&ucontext->uc_mcontext)[i]); } write(fd, "\nStack trace:\n\n", strlen("Stack trace:\n\n")); while (bp && ip) { if (!dladdr(ip, &dl_info)) { bugreporteak; } const char * symname = dl_info.dli_sname; fprintf(fd, "stack #%02d: bp:%p %s [%p->%p] <%s+%ld>\n", ++f,bp, dl_info.dli_fname, ip, (void*)((intptr_t)ip - (intptr_t)dl_info.dli_fbase), symname, (intptr_t)ip - (intptr_t)dl_info.dli_saddr ); ------------------------- } write(fd, "End of stack trace\n", strlen("End of stack trace\n")); save_stacktrace(fd, ucontext->uc_mcontext.arm_sp); close(fd); return 0; } int signal_bugreport_setup() { bug_signal_cb(SIGSEGV); bug_signal_cb(SIGABRT); bug_signal_cb(SIGFPE); bug_signal_cb(SIGINT); bug_signal_cb(SIGBUS); bug_signal_cb(SIGILL); bug_signal_cb(SIGQUIT); bug_signal_cb(SIGTERM); bug_signal_inore()(SIGHUP); bug_signal_inore()(SIGPIPE); //SIGCHLD must not be ignored: ignoring it invalidates the return value of system() return 0; } static inline void bugreport_def_term(const int sig) { char log[256]; snprintf(log, 255, "Exit Normally, pid:%d, sig:%d\n", getpid(), sig); printf("%s", log); if (SIGTERM == sig) { signal(SIGTERM, SIG_DFL); } exit(0); } void dump_trace(int Signal) { const int len = 200; void* buffer[len]; printf("dump_trace\n"); int nptrs = ::backtrace(buffer, len); printf("backtrace\n"); char** buffer_array = ::backtrace_symbols(buffer, nptrs); printf("sig:%d nptrs:%d\n", Signal, nptrs); if (buffer_array) { for (int i = 0; i < nptrs; ++i) { printf("frame=%d||trace_back=%s||\n", i, buffer_array[i]); } free(buffer_array); } exit(0); }
https://www.man7.org/linux/man-pages/man2/sigaction.2.html
The siginfo_t argument to a SA_SIGINFO handler When the SA_SIGINFO flag is specified in act.sa_flags, the signal handler address is passed via the act.sa_sigaction field. This handler takes three arguments, as follows: void handler(int sig, siginfo_t *info, void *ucontext) { ... } These three arguments are as follows: sig The number of the signal that caused invocation of the handler. info A pointer to a siginfo_t, which is a structure containing further information about the signal, as described below. ucontext This is a pointer to a ucontext_t structure, cast to void *. The structure pointed to by this field contains signal context information that was saved on the user-space stack by the kernel; for details, see sigreturn(2). Further information about the ucontext_t structure can be found in getcontext(3). Commonly, the handler function doesn't make any use of the third argument. The siginfo_t data type is a structure with the following fields: siginfo_t { int si_signo; /* Signal number */ int si_errno; /* An errno value */ int si_code; /* Signal code */ int si_trapno; /* Trap number that caused hardware-generated signal (unused on most architectures) */ pid_t si_pid; /* Sending process ID */ uid_t si_uid; /* Real user ID of sending process */ int si_status; /* Exit value or signal */ clock_t si_utime; /* User time consumed */ clock_t si_stime; /* System time consumed */ sigval_t si_value; /* Signal value */ int si_int; /* POSIX.1b signal */ void *si_ptr; /* POSIX.1b signal */ int si_overrun; /* Timer overrun count; POSIX.1b timers */ int si_timerid; /* Timer ID; POSIX.1b timers */ void *si_addr; /* Memory location which caused fault */ long si_band; /* Band event (was int in glibc 2.3.2 and earlier) */ int si_fd; /* File descriptor */ short si_addr_lsb; /* Least significant bit of address (since Linux 2.6.32) */ void *si_lower; /* Lower bound when address violation occurred (since Linux 3.19) */ void *si_upper; /* Upper bound when address violation 
occurred (since Linux 3.19) */ int si_pkey; /* Protection key on PTE that caused fault (since Linux 4.6) */ void *si_call_addr; /* Address of system call instruction (since Linux 3.5) */ int si_syscall; /* Number of attempted system call (since Linux 3.5) */ unsigned int si_arch; /* Architecture of attempted system call (since Linux 3.5) */ } si_signo, si_errno and si_code are defined for all signals. (si_errno is generally unused on Linux.) The rest of the struct may be a union, so that one should read only the fields that are meaningful for the given signal:
#include <execinfo.h> int backtrace(void **buffer, int size); char **backtrace_symbols(void *const *buffer, int size); void backtrace_symbols_fd(void *const *buffer, int size, int fd);
backtrace函數通過指針數組buffer返回調用程序的回溯信息,也就是所謂的函數調用棧。buffer數組中的元素是void*類型,也就是棧中保存的返回地址。
size參數指定buffer中可以保存的地址的最大個數。如果實際的回溯信息大於size,則只返回最近的size個地址。
backtrace函數返回buffer中保存的地址個數,返回值不會大於size。如果返回值小於size,則說明所有的回溯信息都已經返回了,如果等於size,則有可能被截斷了。
backtrace函數在buffer數組中返回的都是一些虛擬地址,不適於分析。backtrace_symbols函數可以將backtrace返回的buffer中的地址,根據符號表中的信息,轉換為字符串(函數名+偏移地址)。size參數指明了buffer中的地址個數。
backtrace_symbols返回字符串數組的首地址,該字符串是在backtrace_symbols中通過malloc分配的,因此,調用者必須使用free釋放內存。如果發生了錯誤,則backtrace_symbols返回NULL
backtrace_symbols_fd類似於backtrace_symbols,只不過它是把字符串信息寫到文件描述符fd所表示的文件中。backtrace_symbols_fd不會調用malloc函數
來自網上轉載的
#ifndef _GNU_SOURCE
#define _GNU_SOURCE /* must come before the first system header; glibc then exposes siginfo_t and the unprefixed mcontext_t fields even under strict -std=c11 */
#endif
#include <signal.h>
#include <execinfo.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <ucontext.h>

#define BTSIZE 100

/*
 * Return the program counter saved in the signal context `uc`, i.e. the
 * address of the instruction that was executing when the signal arrived.
 *
 * Returns NULL when `uc` is NULL or when the platform/architecture is not
 * covered below. Every preprocessor path ends in a return statement, so the
 * function never falls off its end.
 */
static void *getMcontextEip(ucontext_t *uc)
{
    if (uc == NULL) {
        return NULL;
    }
#if defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
    /* OSX < 10.6 */
#if defined(__x86_64__)
    return (void *)uc->uc_mcontext->__ss.__rip;
#elif defined(__i386__)
    return (void *)uc->uc_mcontext->__ss.__eip;
#else
    return (void *)uc->uc_mcontext->__ss.__srr0;
#endif
#elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
    /* OSX >= 10.6 */
#if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
    return (void *)uc->uc_mcontext->__ss.__rip;
#else
    return (void *)uc->uc_mcontext->__ss.__eip;
#endif
#elif defined(__linux__)
    /* Linux */
#if defined(__i386__)
    /* Linux 32: index 14 is REG_EIP in glibc's <sys/ucontext.h> */
    return (void *)(uintptr_t)uc->uc_mcontext.gregs[14];
#elif defined(__X86_64__) || defined(__x86_64__)
    /* Linux 64: index 16 is REG_RIP in glibc's <sys/ucontext.h> */
    return (void *)(uintptr_t)uc->uc_mcontext.gregs[16];
#elif defined(__ia64__)
    /* Linux IA64 */
    return (void *)uc->uc_mcontext.sc_ip;
#elif defined(__aarch64__)
    /* Linux AArch64 */
    return (void *)(uintptr_t)uc->uc_mcontext.pc;
#elif defined(__arm__)
    /* Linux 32-bit ARM */
    return (void *)(uintptr_t)uc->uc_mcontext.arm_pc;
#else
    /* Unknown Linux architecture: report "not available". */
    return NULL;
#endif
#else
    return NULL;
#endif
}

/*
 * SA_SIGINFO-style handler that prints a symbolised backtrace on SIGSEGV.
 *
 * sig    - signal number being delivered.
 * info   - siginfo_t supplied by the kernel; si_signo and si_addr are printed.
 * secret - the ucontext_t pointer (passed as void *) describing the
 *          interrupted context.
 *
 * For SIGSEGV the process is terminated with exit(0) after printing, or with
 * exit(EXIT_FAILURE) if backtrace_symbols() fails. For any other signal the
 * handler only prints the header lines and returns.
 *
 * NOTE(review): printf(), backtrace_symbols() (which calls malloc) and exit()
 * are not async-signal-safe. This is kept as in the original demo; production
 * code should prefer backtrace_symbols_fd(), write() and _exit().
 */
static void sig_handler(int sig, siginfo_t *info, void *secret)
{
    ucontext_t *uc = (ucontext_t *)secret;
    void *buffer[BTSIZE];

    printf("in sig_handler\n");
    printf("sig is %d, SIGSEGV is %d\n", sig, SIGSEGV);
    printf("info.si_signo is %d, info.si_addr is %p\n", info->si_signo, info->si_addr);

    if (sig != SIGSEGV) {
        return;
    }

    int nptrs = backtrace(buffer, BTSIZE);
    printf("backtrace() returned %d addresses\n", nptrs);

    /*
     * Slot 1 is presumably the signal trampoline frame; replace it with the
     * faulting instruction address when one is available. Only do so when
     * backtrace() actually produced that slot.
     */
    void *fault_pc = getMcontextEip(uc);
    if (fault_pc != NULL && nptrs > 1) {
        buffer[1] = fault_pc;
    }

    char **strings = backtrace_symbols(buffer, nptrs);
    if (strings == NULL) {
        perror("backtrace_symbols");
        exit(EXIT_FAILURE);
    }

    printf("backtrace: \n");
    for (int j = 0; j < nptrs; j++) {
        printf("[%d]%s\n", j, strings[j]);
    }

    free(strings); /* backtrace_symbols() allocates a single block */
    exit(0);
}
#ifdef CONFIG_ARM_UNWIND static inline void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) { unwind_backtrace(regs, tsk); } #else static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) { unsigned int fp, mode; int ok = 1; printk("Backtrace: "); if (!tsk) tsk = current; if (regs) { fp = frame_pointer(regs); mode = processor_mode(regs); } else if (tsk != current) { fp = thread_saved_fp(tsk); mode = 0x10; } else { asm("mov %0, fp" : "=r" (fp) : : "cc"); mode = 0x10; } if (!fp) { pr_cont("no frame pointer"); ok = 0; } else if (verify_stack(fp)) { pr_cont("invalid frame pointer 0x%08x", fp); ok = 0; } else if (fp < (unsigned long)end_of_stack(tsk)) pr_cont("frame pointer underflow"); pr_cont("\n"); if (ok) c_backtrace(fp, mode); }
注意,編譯器的優化策略,可能導致得到的回溯信息不准確。而且,對於GNU編譯器而言,必須使用-rdynamic鏈接選項( -rdynamic可用來通知鏈接器將所有符號添加到動態符號表中),才能正確解析出符號名。此時可以使用unwind方法回溯
coredump文件本身主要的格式也是ELF格式,因此,我們可以通過readelf命令進行判斷。
get_signal 這里會判斷信號是不是要觸發core dump,然后調用do_coredump
最后會調用elf_core_dump。下面以內核代碼elf_core_dump函數為入口,分析core文件是怎么生成的: