專題:Linux進程管理專題
目錄:
關鍵詞:swapper、init_task、fork。
Linux內核通常把進程叫作任務,進程控制塊(PCB, Process Control Block)用struct task_struct表示。
線程是輕量級進程,是操作系統的最小調度單元,一個進程可以擁有多個線程。
線程之所以被稱為輕量級,是因為共享進程的資源空間。線程和進程使用相同的進程PCB數據結構。
內核使用clone方法創建線程,類似於fork方法,但會確定哪些資源和父進程共享,哪些資源為線程獨享。
1. init進程
init進程也稱為swapper進程或者idle進程,是Linux啟動時的第一個進程。
idle進程在內核啟動(start_kernel())時靜態創建,所有的核心數據結構都靜態賦值。
當系統沒有進程需要調度時,調度器就會執行idle進程。
start_kernel ->rest_init ->cpu_startup_entry ->cpu_idle_loop
1.1 init_task
init_task進程的task_struct數據結構通過INIT_TASK宏來賦值。
/* Initial task structure */ struct task_struct init_task = INIT_TASK(init_task); EXPORT_SYMBOL(init_task);
INIT_TASK用來填充init_task數據結構。
#define INIT_TASK(tsk) \ { \ .state = 0, \ .stack = &init_thread_info, \-------#define init_thread_info (init_thread_union.thread_info) .usage = ATOMIC_INIT(2), \ .flags = PF_KTHREAD, \----------表明是一個內核線程 .prio = MAX_PRIO-20, \----------MAX_PRIO為140,此處prio為120,對應的nice值為0.關於prio和nice參考:prio和nice之間的關系。 .static_prio = MAX_PRIO-20, \ .normal_prio = MAX_PRIO-20, \ .policy = SCHED_NORMAL, \-------調度策略是SCHED_NORMAL。 .cpus_allowed = CPU_MASK_ALL, \ .nr_cpus_allowed= NR_CPUS, \ .mm = NULL, \ .active_mm = &init_mm, \------------idle進程的內存管理結構數據 .restart_block = { \ .fn = do_no_restart_syscall, \ }, \ .se = { \ .group_node = LIST_HEAD_INIT(tsk.se.group_node), \ }, \ .rt = { \ .run_list = LIST_HEAD_INIT(tsk.rt.run_list), \ .time_slice = RR_TIMESLICE, \ }, \ .tasks = LIST_HEAD_INIT(tsk.tasks), \ INIT_PUSHABLE_TASKS(tsk) \ INIT_CGROUP_SCHED(tsk) \ .ptraced = LIST_HEAD_INIT(tsk.ptraced), \ .ptrace_entry = LIST_HEAD_INIT(tsk.ptrace_entry), \ .real_parent = &tsk, \ .parent = &tsk, \ .children = LIST_HEAD_INIT(tsk.children), \ .sibling = LIST_HEAD_INIT(tsk.sibling), \ .group_leader = &tsk, \ RCU_POINTER_INITIALIZER(real_cred, &init_cred), \ RCU_POINTER_INITIALIZER(cred, &init_cred), \ .comm = INIT_TASK_COMM, \ .thread = INIT_THREAD, \ .fs = &init_fs, \ .files = &init_files, \ .signal = &init_signals, \ .sighand = &init_sighand, \ .nsproxy = &init_nsproxy, \ .pending = { \ .list = LIST_HEAD_INIT(tsk.pending.list), \ .signal = {{0}}}, \ .blocked = {{0}}, \ .alloc_lock = __SPIN_LOCK_UNLOCKED(tsk.alloc_lock), \ .journal_info = NULL, \ .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \ .timer_slack_ns = 50000, /* 50 usec default slack */ \ .pids = { \ [PIDTYPE_PID] = INIT_PID_LINK(PIDTYPE_PID), \ [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID), \ [PIDTYPE_SID] = INIT_PID_LINK(PIDTYPE_SID), \ }, \ .thread_group = LIST_HEAD_INIT(tsk.thread_group), \ .thread_node = LIST_HEAD_INIT(init_signals.thread_head), \ INIT_IDS \ INIT_PERF_EVENTS(tsk) \ 
INIT_TRACE_IRQFLAGS \ INIT_LOCKDEP \ INIT_FTRACE_GRAPH \ INIT_TRACE_RECURSION \ INIT_TASK_RCU_PREEMPT(tsk) \ INIT_TASK_RCU_TASKS(tsk) \ INIT_CPUSET_SEQ(tsk) \ INIT_RT_MUTEXES(tsk) \ INIT_PREV_CPUTIME(tsk) \ INIT_VTIME(tsk) \ INIT_NUMA_BALANCING(tsk) \ INIT_KASAN(tsk) \ }
1.2 thread_info、thread_union、task_struct關系
thread_union包括thread_info和內核棧;
task_struct的stack指向init_thread_union.thread_info。
1.2.1 init_thread_info
init_thread_info被__init_task_data修飾,所以它會被固定在.data..init_task段中。
/* * Initial thread structure. Alignment of this is handled by a special * linker map entry. */ union thread_union init_thread_union __init_task_data = { INIT_THREAD_INFO(init_task) }; #define __init_task_data __attribute__((__section__(".data..init_task")))
下面看看.data..init_task段,在vmlinux.lds.S鏈接文件中定義了大小和位置。
可以看出在_data開始的地方保留了一塊2頁大小的空間,存放init_thread_union(其中包含init_thread_info和idle進程的內核棧)。
SECTIONS { ... .data : AT(__data_loc) { _data = .; /* address in memory */ _sdata = .; /* * first, the init task union, aligned * to an 8192 byte boundary. */ INIT_TASK_DATA(THREAD_SIZE)------------------------------存放在_data開始地方,2頁大小,即8KB。 ... _edata = .; } _edata_loc = __data_loc + SIZEOF(.data); ... } #define INIT_TASK_DATA(align) \ . = ALIGN(align); \ *(.data..init_task) #define THREAD_SIZE_ORDER 1 #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) #define THREAD_START_SP (THREAD_SIZE - 8)
init_thread_union是thread_union聯合體,被固定為8KB大小。
union thread_union {
struct thread_info thread_info;
unsigned long stack[THREAD_SIZE/sizeof(long)]; };
init_thread_info中包含了struct thread_info類型數據結構,它是由INIT_THREAD_INFO進行初始化。
struct thread_info { unsigned long flags; /* low level flags */ int preempt_count; /* 0 => preemptable, <0 => bug */ mm_segment_t addr_limit; /* address limit */ struct task_struct *task; /* main task structure */ struct exec_domain *exec_domain; /* execution domain */ __u32 cpu; /* cpu */ __u32 cpu_domain; /* cpu domain */ struct cpu_context_save cpu_context; /* cpu context */ __u32 syscall; /* syscall number */ __u8 used_cp[16]; /* thread used copro */ unsigned long tp_value[2]; /* TLS registers */ #ifdef CONFIG_CRUNCH struct crunch_state crunchstate; #endif union fp_state fpstate __attribute__((aligned(8))); union vfp_state vfpstate; #ifdef CONFIG_ARM_THUMBEE unsigned long thumbee_state; /* ThumbEE Handler Base register */ #endif }; #define INIT_THREAD_INFO(tsk) \ { \ .task = &tsk, \ .exec_domain = &default_exec_domain, \ .flags = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ .cpu_domain = domain_val(DOMAIN_USER, DOMAIN_MANAGER) | \ domain_val(DOMAIN_KERNEL, DOMAIN_MANAGER) | \ domain_val(DOMAIN_IO, DOMAIN_CLIENT), \ }
1.2.2 init_task內核棧
ARM32處理器從匯編跳轉到C語言的入口點start_kernel()函數之前,設置了SP寄存器指向8KB內核棧頂部區域,其中預留了8B空洞。
/* * The following fragment of code is executed with the MMU on in MMU mode, * and uses absolute addresses; this is not position independent. * * r0 = cp#15 control register * r1 = machine ID * r2 = atags/dtb pointer * r9 = processor ID */ __INIT __mmap_switched: adr r3, __mmap_switched_data ldmia r3!, {r4, r5, r6, r7} ... ARM( ldmia r3, {r4, r5, r6, r7, sp}) THUMB( ldmia r3, {r4, r5, r6, r7} ) THUMB( ldr sp, [r3, #16] ) ... b start_kernel------------------------------------------------跳轉到start_kernel函數 ENDPROC(__mmap_switched) .align 2 .type __mmap_switched_data, %object __mmap_switched_data: .long __data_loc @ r4 .long _sdata @ r5 .long __bss_start @ r6 .long _end @ r7 .long processor_id @ r4 .long __machine_arch_type @ r5 .long __atags_pointer @ r6 #ifdef CONFIG_CPU_CP15 .long cr_alignment @ r7 #else .long 0 @ r7 #endif .long init_thread_union + THREAD_START_SP @ sp-----------------定義了SP寄存器的值,指向8KB棧空間頂部。 .size __mmap_switched_data, . - __mmap_switched_data
1.2.3 從sp到current逆向查找
內核中用一個current宏獲取當前進程的task_struct數據結構,從sp到current的流程如下:
- 通過SP寄存器獲取當前內核棧指針。
- 棧指針對齊后獲取struct thread_info數據結構指針
- 通過thread_info->task成員獲取task_struct數據結構
可以和內核棧示意圖結合看。
#define get_current() (current_thread_info()->task) #define current get_current() /* * how to get the current stack pointer in C */ register unsigned long current_stack_pointer asm ("sp"); /* * how to get the thread information struct from C */ static inline struct thread_info *current_thread_info(void) __attribute_const__; static inline struct thread_info *current_thread_info(void) { return (struct thread_info *) (current_stack_pointer & ~(THREAD_SIZE - 1)); }
2. fork
Linux通過fork、vfork、clone等系統調用來建立線程或進程,在內核中這三個系統調用都通過一個函數來實現,即do_fork()。也包括內核線程kernel_thread。
do_fork定義在fork.c中,下面四個封裝接口的區別就在於其傳遞的參數。
/* * Create a kernel thread. */ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) { return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, (unsigned long)arg, NULL, NULL); } SYSCALL_DEFINE0(fork) { return do_fork(SIGCHLD, 0, 0, NULL, NULL); } SYSCALL_DEFINE0(vfork) { return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, 0, NULL, NULL); } SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, int __user *, parent_tidptr, int, tls_val, int __user *, child_tidptr) { return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr); }
fork只使用了SIGCHLD標志位,在子進程終止后發送SIGCHLD信號通知父進程。fork是重量級調用,為子進程建立了一個基於父進程的完整副本,然后子進程基於此運行。
但是采用了COW技術,子進程只復制父進程頁表,而不復制頁面內容。當子進程需要寫入內容時才觸發寫時復制機制,為子進程創建一個副本。
vfork比fork多了兩個標志位:CLONE_VFORK表示父進程會被掛起,直至子進程釋放虛擬內存資源;CLONE_VM表示父子進程運行在相同的內存空間中。
在fork實現COW技術后,vfork意義已經不大。
clone用於創建線程,並且參數通過寄存器從用戶空間傳遞下來,通常會指定新的棧地址newsp。借助clone_flags,clone給了用戶更大的選擇空間,它既可以實現fork/vfork的功能,也可以和父進程共用資源。
kernel_thread用於創建內核線程,CLONE_VM表示和父進程共享內存資源;CLONE_UNTRACED表示線程不能被設置CLONE_PTRACE。
簡單來說:fork重量級,vfork趨於淘汰,clone輕量級,kernel_thread用於創建內核線程。
2.1 do_fork及其參數解釋
do_fork有5個參數:
- clone_flags:創建進程的標志位集合
- stack_start:用戶態棧的起始地址
- stack_size:用戶態棧的大小
- parent_tidptr和child_tidptr:指向用戶空間地址的兩個指針,分別指向父子進程PID。
其中clone_flags是影響do_fork行為的重要參數:
/* * cloning flags: */ #define CSIGNAL 0x000000ff /* signal mask to be sent at exit */ #define CLONE_VM 0x00000100 /* set if VM shared between processes */-------------------------父子進程運行在同一個虛擬空間 #define CLONE_FS 0x00000200 /* set if fs info shared between processes */--------------------父子進程共享文件系統信息 #define CLONE_FILES 0x00000400 /* set if open files shared between processes */--------------父子進程共享文件描述符表 #define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */-----父子進程共享信號處理函數表 #define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */---------父進程被跟蹤ptrace,子進程也會被跟蹤。 #define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */----在創建子進程時啟動完成機制completion,wait_for_completion()會使父進程進入睡眠等待,知道子進程調用execve()或exit()釋放虛擬內存資源。 #define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */------------新創建的進程是兄弟關系,而不是父子關系。 #define CLONE_THREAD 0x00010000 /* Same thread group? */ #define CLONE_NEWNS 0x00020000 /* New mount namespace group */------------父子進程不共享mount namespace #define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */-- #define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */ #define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */ #define CLONE_CHILD_CLEARTID 0x00200000 /* clear the TID in the child */ #define CLONE_DETACHED 0x00400000 /* Unused, ignored */ #define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */ #define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */ /* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state) and is now available for re-use. 
*/ #define CLONE_NEWUTS 0x04000000 /* New utsname namespace */ #define CLONE_NEWIPC 0x08000000 /* New ipc namespace */ #define CLONE_NEWUSER 0x10000000 /* New user namespace */----------子進程要創建新的User Namespace。 #define CLONE_NEWPID 0x20000000 /* New pid namespace */------------創建一個新的PID namespace。 #define CLONE_NEWNET 0x40000000 /* New network namespace */ #define CLONE_IO 0x80000000 /* Clone io context */
主要函數調用路徑如下:
do_fork------------------------------------------ ->copy_process--------------------------------- ->dup_task_struct---------------------------- ->sched_fork--------------------------------- ->copy_files ->copy_fs ->copy_sighand ->copy_signal ->copy_mm------------------------------------ ->dup_mm----------------------------------- ->copy_namespaces ->copy_io ->copy_thread--------------------------------
do_fork()先對CLONE_UNTRACED進行簡單檢查,主要將工作交給copy_process進行處理,最后喚醒創建的進程。
/* * Ok, this is the main fork-routine. * * It copies the process, and if successful kick-starts * it and waits for it to finish using the VM if required. */ long do_fork(unsigned long clone_flags, unsigned long stack_start, unsigned long stack_size, int __user *parent_tidptr, int __user *child_tidptr) { struct task_struct *p; int trace = 0; long nr; /* * Determine whether and which event to report to ptracer. When * called from kernel_thread or CLONE_UNTRACED is explicitly * requested, no event is reported; otherwise, report if the event * for the type of forking is enabled. */ if (!(clone_flags & CLONE_UNTRACED)) { if (clone_flags & CLONE_VFORK) trace = PTRACE_EVENT_VFORK; else if ((clone_flags & CSIGNAL) != SIGCHLD) trace = PTRACE_EVENT_CLONE; else trace = PTRACE_EVENT_FORK; if (likely(!ptrace_event_enabled(current, trace))) trace = 0; } p = copy_process(clone_flags, stack_start, stack_size, child_tidptr, NULL, trace); /* * Do this prior waking up the new thread - the thread pointer * might get invalid after that point, if the thread exits quickly. */ if (!IS_ERR(p)) { struct completion vfork; struct pid *pid; trace_sched_process_fork(current, p); pid = get_task_pid(p, PIDTYPE_PID); nr = pid_vnr(pid); if (clone_flags & CLONE_PARENT_SETTID) put_user(nr, parent_tidptr); if (clone_flags & CLONE_VFORK) {------------------對於CLONE_VFORK標志位,初始化vfork完成量 p->vfork_done = &vfork; init_completion(&vfork); get_task_struct(p); } wake_up_new_task(p);------------------------------喚醒新創建的進程p,也即把進程加入調度器里接受調度執行。 /* forking complete and child started to run, tell ptracer */ if (unlikely(trace)) ptrace_event_pid(trace, pid); if (clone_flags & CLONE_VFORK) { if (!wait_for_vfork_done(p, &vfork))---------等待子進程釋放p->vfork_done完成量 ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid); } put_pid(pid); } else { nr = PTR_ERR(p); } return nr; }
2.2 copy_process
include/linux/sched.h中定義了進程標志位:
/* * Per process flags */ #define PF_EXITING 0x00000004 /* getting shut down */ #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ #define PF_MCE_PROCESS 0x00000080 /* process policy on mce errors */ #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ #define PF_DUMPCORE 0x00000200 /* dumped core */ #define PF_SIGNALED 0x00000400 /* killed by a signal */ #define PF_MEMALLOC 0x00000800 /* Allocating memory */ #define PF_NPROC_EXCEEDED 0x00001000 /* set_user noticed that RLIMIT_NPROC was exceeded */ #define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */ #define PF_USED_ASYNC 0x00004000 /* used async_schedule*(), used by module init */ #define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */ #define PF_FROZEN 0x00010000 /* frozen for system suspend */ #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ #define PF_KSWAPD 0x00040000 /* I am kswapd */ #define PF_MEMALLOC_NOIO 0x00080000 /* Allocating memory without IO involved */ #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */ #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ #define PF_SUSPEND_TASK 0x80000000 /* this thread called freeze_processes and should not be frozen */
copy_process借助current獲取當前進程的task_struct數據結構,然后創建新進程數據結構task_struct並復制父進程內容,繼續初始化進程主要部分,比如內存空間、文件句柄、文件系統、IO、等等。
/* * This creates a new process as a copy of the old one, * but does not actually start it yet. * * It copies the registers, and all the appropriate * parts of the process environment (as per the clone * flags). The actual kick-off is left to the caller. */ static struct task_struct *copy_process(unsigned long clone_flags, unsigned long stack_start, unsigned long stack_size, int __user *child_tidptr, struct pid *pid, int trace) { int retval; struct task_struct *p; if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) return ERR_PTR(-EINVAL); if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))---------------CLONE_FS(父子進程共享文件系統)和CLONE_NEWNS/CLONE_NEWUSER(父子進程不共享mount/user namespace)沖突, return ERR_PTR(-EINVAL); /* * Thread groups must share signals as well, and detached threads * can only be started up within the thread group. */ if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))--------------------線程組共享信號處理函數 return ERR_PTR(-EINVAL); /* * Shared signal handlers imply shared VM. By way of the above, * thread groups also imply shared VM. Blocking this case allows * for various simplifications in other code. */ if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))----------------------共享信號處理函數需要共享內存空間 return ERR_PTR(-EINVAL); /* * Siblings of global init remain as zombies on exit since they are * not reaped by their parent (swapper). To solve this and to avoid * multi-rooted process trees, prevent global and container-inits * from creating siblings. */ if ((clone_flags & CLONE_PARENT) && current->signal->flags & SIGNAL_UNKILLABLE)-----------------------------init是所有用戶空間進程父進程,如果和init兄弟關系,那么進程將無法被回收,從而變成僵屍進程。 return ERR_PTR(-EINVAL); /* * If the new process will be in a different pid or user namespace * do not allow it to share a thread group or signal handlers or * parent with the forking task. 
*/ if (clone_flags & CLONE_SIGHAND) {---------------------------------------------------新的pid或user命名空間和共享信號處理以及線程組沖突,因為他們在namespace中訪問隔離。 if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) || (task_active_pid_ns(current) != current->nsproxy->pid_ns_for_children)) return ERR_PTR(-EINVAL); } retval = security_task_create(clone_flags); if (retval) goto fork_out; retval = -ENOMEM; p = dup_task_struct(current);-------------------------------------------------------分配一個task_struct實例,將當前進程current作為母板。 if (!p) goto fork_out; ftrace_graph_init_task(p); rt_mutex_init_task(p); #ifdef CONFIG_PROVE_LOCKING DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif retval = -EAGAIN; if (atomic_read(&p->real_cred->user->processes) >= task_rlimit(p, RLIMIT_NPROC)) { if (p->real_cred->user != INIT_USER && !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) goto bad_fork_free; } current->flags &= ~PF_NPROC_EXCEEDED; retval = copy_creds(p, clone_flags); if (retval < 0) goto bad_fork_free; /* * If multiple threads are within copy_process(), then this check * triggers too late. This doesn't hurt, the check is only there * to stop root fork bombs. 
*/ retval = -EAGAIN; if (nr_threads >= max_threads)----------------------------------------------max_threads是系統允許最多線程個數,nr_threads是系統當前進程個數。 goto bad_fork_cleanup_count; if (!try_module_get(task_thread_info(p)->exec_domain->module)) goto bad_fork_cleanup_count; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);---------------------------------告訴系統不使用超級用戶權限,並且不是workqueue內核線程。 p->flags |= PF_FORKNOEXEC;--------------------------------------------------執行fork但不立即執行 INIT_LIST_HEAD(&p->children);-----------------------------------------------新進程的子進程鏈表 INIT_LIST_HEAD(&p->sibling);------------------------------------------------新進程的兄弟進程鏈表 rcu_copy_process(p); p->vfork_done = NULL; spin_lock_init(&p->alloc_lock); init_sigpending(&p->pending); p->utime = p->stime = p->gtime = 0; p->utimescaled = p->stimescaled = 0; #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE p->prev_cputime.utime = p->prev_cputime.stime = 0; #endif #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN seqlock_init(&p->vtime_seqlock); p->vtime_snap = 0; p->vtime_snap_whence = VTIME_SLEEPING; #endif #if defined(SPLIT_RSS_COUNTING) memset(&p->rss_stat, 0, sizeof(p->rss_stat)); #endif p->default_timer_slack_ns = current->timer_slack_ns; task_io_accounting_init(&p->ioac); acct_clear_integrals(p); posix_cpu_timers_init(p); p->start_time = ktime_get_ns(); p->real_start_time = ktime_get_boot_ns(); p->io_context = NULL; p->audit_context = NULL; if (clone_flags & CLONE_THREAD) threadgroup_change_begin(current); cgroup_fork(p); #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); if (IS_ERR(p->mempolicy)) { retval = PTR_ERR(p->mempolicy); p->mempolicy = NULL; goto bad_fork_cleanup_threadgroup_lock; } #endif... #ifdef CONFIG_BCACHE p->sequential_io = 0; p->sequential_io_avg = 0; #endif /* Perform scheduler related setup. Assign this task to a CPU. 
*/ retval = sched_fork(clone_flags, p);-----------------------------------------初始化進程調度相關數據結構,將進程指定到某一CPU上。 if (retval) goto bad_fork_cleanup_policy; retval = perf_event_init_task(p); if (retval) goto bad_fork_cleanup_policy; retval = audit_alloc(p); if (retval) goto bad_fork_cleanup_perf; /* copy all the process information */ shm_init_task(p); retval = copy_semundo(clone_flags, p); if (retval) goto bad_fork_cleanup_audit; retval = copy_files(clone_flags, p);-----------------------------------------復制父進程打開的文件信息 if (retval) goto bad_fork_cleanup_semundo; retval = copy_fs(clone_flags, p);--------------------------------------------復制父進程fs_struct信息 if (retval) goto bad_fork_cleanup_files; retval = copy_sighand(clone_flags, p); if (retval) goto bad_fork_cleanup_fs; retval = copy_signal(clone_flags, p); if (retval) goto bad_fork_cleanup_sighand; retval = copy_mm(clone_flags, p);--------------------------------------------復制父進程的內存管理相關信息 if (retval) goto bad_fork_cleanup_signal; retval = copy_namespaces(clone_flags, p); if (retval) goto bad_fork_cleanup_mm; retval = copy_io(clone_flags, p);--------------------------------------------復制父進程的io_context上下文信息 if (retval) goto bad_fork_cleanup_namespaces; retval = copy_thread(clone_flags, stack_start, stack_size, p); if (retval) goto bad_fork_cleanup_io; if (pid != &init_struct_pid) { retval = -ENOMEM; pid = alloc_pid(p->nsproxy->pid_ns_for_children); if (!pid) goto bad_fork_cleanup_io; } p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; /* * Clear TID on mm_release()? */ p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? 
child_tidptr : NULL; #ifdef CONFIG_BLOCK p->plug = NULL; #endif #ifdef CONFIG_FUTEX p->robust_list = NULL; #ifdef CONFIG_COMPAT p->compat_robust_list = NULL; #endif INIT_LIST_HEAD(&p->pi_state_list); p->pi_state_cache = NULL; #endif /* * sigaltstack should be cleared when sharing the same VM */ if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM) p->sas_ss_sp = p->sas_ss_size = 0; /* * Syscall tracing and stepping should be turned off in the * child regardless of CLONE_PTRACE. */ user_disable_single_step(p); clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE); #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); #endif clear_all_latency_tracing(p); /* ok, now we should be set up.. */ p->pid = pid_nr(pid);-------------------------------------------------------獲取新進程的pid if (clone_flags & CLONE_THREAD) { p->exit_signal = -1; p->group_leader = current->group_leader; p->tgid = current->tgid; } else { if (clone_flags & CLONE_PARENT) p->exit_signal = current->group_leader->exit_signal; else p->exit_signal = (clone_flags & CSIGNAL); p->group_leader = p; p->tgid = p->pid; } p->nr_dirtied = 0; p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); p->dirty_paused_when = 0; p->pdeath_signal = 0; INIT_LIST_HEAD(&p->thread_group); p->task_works = NULL; /* * Make it visible to the rest of the system, but dont wake it up yet. * Need tasklist lock for parent etc handling! */ write_lock_irq(&tasklist_lock); /* CLONE_PARENT re-uses the old parent */ if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { p->real_parent = current->real_parent; p->parent_exec_id = current->parent_exec_id; } else { p->real_parent = current; p->parent_exec_id = current->self_exec_id; } spin_lock(¤t->sighand->siglock); /* * Copy seccomp details explicitly here, in case they were changed * before holding sighand lock. */ copy_seccomp(p); /* * Process group and session signals need to be delivered to just the * parent before the fork or both the parent and the child after the * fork. 
Restart if a signal comes in before we add the new process to * it's process group. * A fatal signal pending means that current will exit, so the new * thread can't slip out of an OOM kill (or normal SIGKILL). */ recalc_sigpending(); if (signal_pending(current)) { spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); retval = -ERESTARTNOINTR; goto bad_fork_free_pid; } if (likely(p->pid)) { ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); init_task_pid(p, PIDTYPE_PID, pid); if (thread_group_leader(p)) { init_task_pid(p, PIDTYPE_PGID, task_pgrp(current)); init_task_pid(p, PIDTYPE_SID, task_session(current)); if (is_child_reaper(pid)) { ns_of_pid(pid)->child_reaper = p; p->signal->flags |= SIGNAL_UNKILLABLE; } p->signal->leader_pid = pid; p->signal->tty = tty_kref_get(current->signal->tty); list_add_tail(&p->sibling, &p->real_parent->children); list_add_tail_rcu(&p->tasks, &init_task.tasks); attach_pid(p, PIDTYPE_PGID); attach_pid(p, PIDTYPE_SID); __this_cpu_inc(process_counts); } else { current->signal->nr_threads++; atomic_inc(¤t->signal->live); atomic_inc(¤t->signal->sigcnt); list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); list_add_tail_rcu(&p->thread_node, &p->signal->thread_head); } attach_pid(p, PIDTYPE_PID); nr_threads++;---------------------------------------------------------當前進程計數遞增 } total_forks++; spin_unlock(¤t->sighand->siglock); syscall_tracepoint_update(p); write_unlock_irq(&tasklist_lock); proc_fork_connector(p); cgroup_post_fork(p); if (clone_flags & CLONE_THREAD) threadgroup_change_end(current); perf_event_fork(p); trace_task_newtask(p, clone_flags); uprobe_copy_process(p, clone_flags); return p;----------------------------------------------------------------成功返回新進程的task_struct。 ...return ERR_PTR(retval);---------------------------------------------------各種錯誤處理 }
dup_task_struct從父進程復制task_struct和thread_info。
static struct task_struct *dup_task_struct(struct task_struct *orig) { struct task_struct *tsk; struct thread_info *ti; int node = tsk_fork_get_node(orig); int err; tsk = alloc_task_struct_node(node);-------------------------------------------------分配一個task_struct結構體 if (!tsk) return NULL; ti = alloc_thread_info_node(tsk, node);---------------------------------------------分配一個thread_info結構體 if (!ti) goto free_tsk; err = arch_dup_task_struct(tsk, orig);----------------------------------------------將父進程的task_struct拷貝到新進程tsk if (err) goto free_ti; tsk->stack = ti;--------------------------------------------------------------------將新進程的棧指向創建的thread_info。 #ifdef CONFIG_SECCOMP /* * We must handle setting up seccomp filters once we're under * the sighand lock in case orig has changed between now and * then. Until then, filter must be NULL to avoid messing up * the usage counts on the error path calling free_task. */ tsk->seccomp.filter = NULL; #endif setup_thread_stack(tsk, orig);------------------------------------------------------將父進程的thread_info復制到子進程thread_info,並將子進程thread_info->task指向子進程 clear_user_return_notifier(tsk); clear_tsk_need_resched(tsk); set_task_stack_end_magic(tsk); ...return tsk; ...
}
進程相關運行狀態有:
#define TASK_RUNNING 0
#define TASK_INTERRUPTIBLE 1 #define TASK_UNINTERRUPTIBLE 2 #define __TASK_STOPPED 4 #define __TASK_TRACED 8
sched_fork的主要任務交給__sched_fork(),然后根據優先級選擇調度sched_class類,並執行其task_fork。
最后設置新進程運行的CPU,如果不是當前CPU則需要遷移過來。
/* * fork()/clone()-time setup: */ int sched_fork(unsigned long clone_flags, struct task_struct *p) { unsigned long flags; int cpu = get_cpu();-------------------------------------------------------首先關閉內核搶占,然后獲取當前CPU id。 __sched_fork(clone_flags, p);----------------------------------------------填充sched_entity數據結構,初始化調度相關設置。 /* * We mark the process as running here. This guarantees that * nobody will actually run it, and a signal or other external * event cannot wake it up and insert it on the runqueue either. */ p->state = TASK_RUNNING;---------------------------------------------------設置為運行狀態,雖然還沒有實際運行。 /* * Make sure we do not leak PI boosting priority to the child. */ p->prio = current->normal_prio;--------------------------------------------繼承父進程normal_prio作為子進程prio /* * Revert to default priority/policy on fork if requested. */ if (unlikely(p->sched_reset_on_fork)) { if (task_has_dl_policy(p) || task_has_rt_policy(p)) { p->policy = SCHED_NORMAL; p->static_prio = NICE_TO_PRIO(0); p->rt_priority = 0; } else if (PRIO_TO_NICE(p->static_prio) < 0) p->static_prio = NICE_TO_PRIO(0); p->prio = p->normal_prio = __normal_prio(p); set_load_weight(p); /* * We don't need the reset flag anymore after the fork. It has * fulfilled its duty: */ p->sched_reset_on_fork = 0; } if (dl_prio(p->prio)) {---------------------------------------------------SCHED_DEADLINE優先級應該是負值,即小於0。 put_cpu(); return -EAGAIN; } else if (rt_prio(p->prio)) {--------------------------------------------SCHED_RT優先級為0-99 p->sched_class = &rt_sched_class; } else {------------------------------------------------------------------SCHED_FAIR優先級為100-139 p->sched_class = &fair_sched_class; } if (p->sched_class->task_fork) p->sched_class->task_fork(p); /* * The child is not yet in the pid-hash so no cgroup attach races, * and the cgroup is pinned to this child due to cgroup_fork() * is ran before sched_fork(). * * Silence PROVE_RCU. 
*/ raw_spin_lock_irqsave(&p->pi_lock, flags); set_task_cpu(p, cpu);------------------------------------------------------重要一點就是檢查p->stack->cpu是不是當期CPU,如果不是則需要進行遷移。遷移函數使用之前確定的sched_class->migrate_task_rq。 raw_spin_unlock_irqrestore(&p->pi_lock, flags); #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif #if defined(CONFIG_SMP) p->on_cpu = 0; #endif init_task_preempt_count(p); #ifdef CONFIG_SMP plist_node_init(&p->pushable_tasks, MAX_PRIO); RB_CLEAR_NODE(&p->pushable_dl_tasks); #endif put_cpu();-----------------------------------------------------------------再次允許內核搶占。 return 0; }
copy_mm首先設置MM相關參數,然后使用dup_mm來分配mm_struct數據結構,並從父進程復制到新進程mm_struct。
最后將創建的mm_struct復制給task_struct->mm。
static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) { struct mm_struct *mm, *oldmm; int retval; tsk->min_flt = tsk->maj_flt = 0; tsk->nvcsw = tsk->nivcsw = 0; #ifdef CONFIG_DETECT_HUNG_TASK tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw; #endif tsk->mm = NULL; tsk->active_mm = NULL; /* * Are we cloning a kernel thread? * * We need to steal a active VM for that.. */ oldmm = current->mm; if (!oldmm)-----------------------------------------------如果current->mm為NULL,表示是內核線程。 return 0; /* initialize the new vmacache entries */ vmacache_flush(tsk); if (clone_flags & CLONE_VM) {----------------------------CLONE_VM表示父子進程共享內存空間,依次沒必要新建內存空間,直接使用oldmm。 atomic_inc(&oldmm->mm_users); mm = oldmm; goto good_mm; } retval = -ENOMEM; mm = dup_mm(tsk);---------------------------------------為子進程單獨創建一個新的內存空間mm_struct。 if (!mm) goto fail_nomem; good_mm: tsk->mm = mm;-------------------------------------------對新進程內存空間進行賦值。 tsk->active_mm = mm; return 0; fail_nomem: return retval; }
dup_mm從父進程復制mm_struct,然後進行初始化等操作,將完成的mm_struct返回給copy_mm。
/* * Allocate a new mm structure and copy contents from the * mm structure of the passed in task structure. */ static struct mm_struct *dup_mm(struct task_struct *tsk) { struct mm_struct *mm, *oldmm = current->mm; int err; mm = allocate_mm();-----------------------------------分配一個mm_struct數據結構 if (!mm) goto fail_nomem; memcpy(mm, oldmm, sizeof(*mm));-----------------------將父進程mm_struct復制到新進程mm_struct。 if (!mm_init(mm, tsk))--------------------------------主要對子進程的mm_struct成員進行初始化,雖然從父進程復制了相關數據,但是對於子進程需要重新進行初始化。 goto fail_nomem; dup_mm_exe_file(oldmm, mm); err = dup_mmap(mm, oldmm);----------------------------將父進程種所有VMA對應的pte頁表項內容都復制到子進程對應的PTE頁表項中。 if (err) goto free_pt; mm->hiwater_rss = get_mm_rss(mm); mm->hiwater_vm = mm->total_vm; if (mm->binfmt && !try_module_get(mm->binfmt->module)) goto free_pt; return mm; ... }
對ARM體系結構,Linux內核棧頂存放着ARM通用寄存器struct pt_regs。
struct pt_regs { unsigned long uregs[18]; }; #define ARM_cpsr uregs[16] #define ARM_pc uregs[15] #define ARM_lr uregs[14] #define ARM_sp uregs[13] #define ARM_ip uregs[12] #define ARM_fp uregs[11] #define ARM_r10 uregs[10] #define ARM_r9 uregs[9] #define ARM_r8 uregs[8] #define ARM_r7 uregs[7] #define ARM_r6 uregs[6] #define ARM_r5 uregs[5] #define ARM_r4 uregs[4] #define ARM_r3 uregs[3] #define ARM_r2 uregs[2] #define ARM_r1 uregs[1] #define ARM_r0 uregs[0] #define ARM_ORIG_r0 uregs[17]
關於pt_regs在內核棧的位置,可以看出首先通過task_stack_page(p)找到內核棧起始地址,即底部。
然后加上地址THREAD_START_SP,即THREAD_SIZE兩個頁面8KB減去8字節空洞。
所以childregs指向的位置是棧頂部。
/*
 * task_pt_regs(p): pointer to the saved register frame (struct pt_regs)
 * of task p.  task_stack_page(p) yields the base (lowest address) of the
 * kernel stack; adding THREAD_START_SP reaches the top of the stack, and
 * "- 1" steps back one struct pt_regs to where the frame is stored.
 * NOTE(review): on ARM, THREAD_START_SP is presumably THREAD_SIZE - 8
 * (two pages minus an 8-byte gap) -- confirm against arch headers.
 */
#define task_pt_regs(p) \ ((struct pt_regs *)(THREAD_START_SP + task_stack_page(p)) - 1)
copy_thread首先獲取棧頂pt_regs位置,然后填充thread_info->cpu_context進程上下文。
asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); int copy_thread(unsigned long clone_flags, unsigned long stack_start, unsigned long stk_sz, struct task_struct *p) { struct thread_info *thread = task_thread_info(p);--------------------------獲取當前進程的thread_info。 struct pt_regs *childregs = task_pt_regs(p);-------------------------------獲取當前進程的pt_regs memset(&thread->cpu_context, 0, sizeof(struct cpu_context_save));----------cpu_context中保存了進程上下文相關的通用寄存器。 if (likely(!(p->flags & PF_KTHREAD))) {------------------------------------內核線程處理 *childregs = *current_pt_regs(); childregs->ARM_r0 = 0; if (stack_start) childregs->ARM_sp = stack_start; } else {-------------------------------------------------------------------普通線程處理,r4等於stk_sz,r5指向start_start。 memset(childregs, 0, sizeof(struct pt_regs)); thread->cpu_context.r4 = stk_sz; thread->cpu_context.r5 = stack_start; childregs->ARM_cpsr = SVC_MODE; } thread->cpu_context.pc = (unsigned long)ret_from_fork;---------------------cpu_context中pc指向ret_from_fork thread->cpu_context.sp = (unsigned long)childregs;-------------------------cpu_context中sp指向新進程的內核棧 clear_ptrace_hw_breakpoint(p); if (clone_flags & CLONE_SETTLS) thread->tp_value[0] = childregs->ARM_r3; thread->tp_value[1] = get_tpuser(); thread_notify(THREAD_NOTIFY_COPY, thread); return 0; }
3. 關於fork()、vfork()、clone()測試
3.1 fork()嵌套打印
3.1.1 代碼
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>	/* fork(), getpid(), getppid() -- missing in original */
#include <sys/wait.h>	/* wait() -- missing in original */

/*
 * fork() inside a loop: after iteration i there are 2^(i+1) processes,
 * and every live process prints once per iteration it executes, so a
 * total of 2 + 4 = 6 lines are printed across all processes.
 */
int main(void)
{
	int i;

	for (i = 0; i < 2; i++) {
		fork();
		printf("_%d-%d-%d\n", getppid(), getpid(), i);
	}

	/* reap up to two direct children */
	wait(NULL);
	wait(NULL);
	return 0;
}
3.1.2 執行程序,記錄log
執行輸出結果如下:
sudo trace-cmd record -e all ./fork
/sys/kernel/tracing/events/*/filter
Current:4293-i=0
Current:4293-i=1
Current:4294-i=0
Current:4294-i=1
Current:4295-i=1
Current:4296-i=1
相關Trace記錄在trace.dat中。
3.1.3 流程分析
使用kernelshark trace.dat,過濾sched_process_fork/sys_enter_write/sys_enter_wait4后結果如下。
其中sched_process_fork對應fork,sys_enter_write對應printf,sys_enter_wait4對應wait開始,sys_exit_wait4對應wait結束。
下圖是不同進程的流程:
將fork進程關系流程圖畫出如下:
參考文檔:《linux中fork()函數詳解(原創!!實例講解)》
3.2 fork()、vfork()、clone()對比
對於fork()、vfork()、clone()三者的區別,前面已經有介紹,下面通過實例來看他們之間的區別。
3.2.1 fork()和vfork()對比
#include "stdio.h" int main() { int count = 1; int child; printf("Father, initial count = %d, pid = %d\n", count, getpid()); if(!(child = fork())) { int i; for(i = 0; i < 2; i++) { printf("Son, count = %d pid = %d\n", ++count, getpid()); } exit(1); } else {
sleep(1); printf("Father, count = %d pid = %d child = %d\n", count, getpid(), child); } } #include "stdio.h" int main() { int count = 1; int child; printf("Father, initial count = %d, pid = %d\n", count, getpid()); if(!(child = vfork())) { int i; for(i = 0; i < 2; i++) { printf("Son, count = %d pid = %d\n", ++count, getpid()); } exit(1); } else { printf("Father, count = %d pid = %d child = %d\n", count, getpid(), child); } }
fork輸出結果如下:
Father, initial count = 1, pid = 4721
Father, count = 1 pid = 4721 child = 4722
Son, count = 2 pid = 4722
Son, count = 3 pid = 4722
vfork輸出結果如下:
Father, initial count = 1, pid = 4726
Son, count = 2 pid = 4727
Son, count = 3 pid = 4727
Father, count = 3 pid = 4726 child = 4727
將fork代碼加sleep(1);之后結果如下:
Father, initial count = 1, pid = 4858
Son, count = 2 pid = 4859
Son, count = 3 pid = 4859
Father, count = 1 pid = 4858 child = 4859
1. 可以看出vfork父進程在等待子進程結束,然后繼續執行。
2. vfork父子進程之間共享地址空間,父進程的count被子進程修改。
3. fork將父進程打印延時后,可以看出主進程仍然打印count=1,說明父子進程空間獨立。
3.2.2 clone不同flag對比
clone的flag決定了clone的行為,比如是否共享空間、是否vfork等
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>	/* malloc(), free(), exit() -- missing in original */
#include <sched.h>
#include <signal.h>
#include <unistd.h>	/* getpid() -- missing in original */

#define FIBER_STACK 8192

int count;
void *stack;	/* child's stack; the child frees it before exiting */

/*
 * Child entry point.  clone() expects int (*fn)(void *), so the argument
 * is taken (and ignored) explicitly instead of relying on an unchecked
 * function-pointer cast as the original did.
 */
static int do_something(void *arg)
{
	int i;

	(void)arg;
	for (i = 0; i < 2; i++)
		printf("Son, pid = %d, count = %d\n", getpid(), ++count);
	/*
	 * Freeing here is valid only because CLONE_VM shares the parent's
	 * heap.  BUG FIX: the original declared a second, local "stack" in
	 * main() that shadowed this global, so the child freed the global
	 * NULL pointer (a no-op) and the allocated stack leaked.
	 */
	free(stack);
	exit(1);
}

int main(void)
{
	count = 1;

	/* allocate the child's stack; it grows downward, so pass the top */
	stack = malloc(FIBER_STACK);
	if (!stack) {
		printf("The stack failed\n");
		exit(0);
	}
	printf("Father, initial count = %d, pid = %d\n", count, getpid());

	clone(do_something, (char *)stack + FIBER_STACK,
	      CLONE_VM | CLONE_VFORK, NULL);

	printf("Father, pid = %d count = %d\n", getpid(), count);
	exit(1);
}
下面是不同flag組合的輸出結果:
1. CLONE_VM|CLONE_VFORK
父子進程共享內存空間,並且父進程要等待子進程結束。
所以4968在4969結束之后才繼續運行,並且count=3。
Father, initial count = 1, pid = 4968
Son, pid = 4969, count = 2
Son, pid = 4969, count = 3
Father, pid = 4968 count = 3
2. CLONE_VM
父子進程共享內存空間,但是父進程結束時強制子進程退出。
Father, initial count = 1, pid = 5017
Father, pid = 5017 count = 1
將父進程printf前加一個sleep(1),讓子進程先運行,可以看出父進程count=3,說明CLONE_VM下父子進程共享count。
Father, initial count = 1, pid = 5065
Son, pid = 5066, count = 2
Son, pid = 5066, count = 3
Father, pid = 5065 count = 3
3. CLONE_VFORK
這里沒有共享內存空間,但是父進程要等待子進程結束。
所以父進程在子進程結束后才打印,但由於不共享內存空間,父進程的count仍為1。
Father, initial count = 1, pid = 4998
Son, pid = 4999, count = 2
Son, pid = 4999, count = 3
Father, pid = 4998 count = 1
4. 0
父子進程不共享內存,父進程也不等待子進程,從輸出可見父進程先打印並退出,子進程隨后繼續運行。
這里看不出count是否共享。
Father, initial count = 1, pid = 5174
Father, pid = 5174 count = 1
Son, pid = 5175, count = 2
Son, pid = 5175, count = 3
在父進程printf之前加sleep(1),結果如下:
和預期一樣,主進程count是單獨一份,而沒有和子進程共用。
Father, initial count = 1, pid = 5257
Son, pid = 5258, count = 2
Son, pid = 5258, count = 3
Father, pid = 5257 count = 1
參考文檔:linux系統調用fork, vfork, clone