本文以Linux3.14版本源碼為例分析其啟動流程。各版本啟動代碼略有不同,但核心流程與思想萬變不離其宗。
內核映像被加載到內存並獲得控制權之后,內核啟動流程開始。通常,內核映像以壓縮形式存儲,並不是一個可以執行的內核。因此,內核階段的首要工作是自解壓內核映像。
內核編譯生成vmlinux后,通常會對其進行壓縮,得到zImage(小內核,小於512KB)或bzImage(大內核,大於512KB)。在它們的頭部嵌有解壓縮程序。
通過linux/arch/arm/boot/compressed目錄下的Makefile尋找到vmlinux文件的鏈接腳本(vmlinux.lds),從中查找系統啟動入口函數。
$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/$(HEAD) $(obj)/piggy.$(suffix_y).o \
$(addprefix $(obj)/, $(OBJS)) $(lib1funcs) $(ashldi3) \
$(bswapsdi2) FORCE
@$(check_for_multiple_zreladdr)
$(call if_changed,ld)
@$(check_for_bad_syms)
vmlinux.lds(linux/arch/arm/kernel/vmlinux.lds)鏈接腳本開頭內容
OUTPUT_ARCH(arm)
ENTRY(stext)
jiffies = jiffies_64;
SECTIONS
{
。
。
。
得到內核入口函數為 stext(linux/arch/arm/kernel/head.S)
內核引導階段
ENTRY(stext)
。
。
。
bl __lookup_processor_type @ r5=procinfo r9=cpuid //處理器是否支持
movs r10, r5 @ invalid processor (r5=0)?
THUMB( it eq ) @ force fixup-able long branch encoding
beq __error_p @ yes, error 'p' //不支持則打印錯誤信息
。
。
。
bl __create_page_tables //創建頁表
/*
* The following calls CPU specific code in a position independent
* manner. See arch/arm/mm/proc-*.S for details. r10 = base of
* xxx_proc_info structure selected by __lookup_processor_type
* above. On return, the CPU will be ready for the MMU to be
* turned on, and r0 will hold the CPU control register value.
*/
ldr r13, =__mmap_switched @ address to jump to after //保存MMU使能后跳轉地址
@ mmu has been enabled
adr lr, BSYM(1f) @ return (PIC) address
mov r8, r4 @ set TTBR1 to swapper_pg_dir
ARM( add pc, r10, #PROCINFO_INITFUNC )
THUMB( add r12, r10, #PROCINFO_INITFUNC )
THUMB( mov pc, r12 )
1: b __enable_mmu //使能MMU后跳轉到__mmap_switched
查找標簽__mmap_switched所在位置:/linux/arch/arm/kernel/head-common.S
__mmap_switched:
/*
* The following fragment of code is executed with the MMU on in MMU mode,
* and uses absolute addresses; this is not position independent.
*
* r0 = cp#15 control register
* r1 = machine ID
* r2 = atags/dtb pointer
* r9 = processor ID
*/
//保存設備信息、設備樹及啟動參數存儲地址
。
。
。
b start_kernel
內核初始化階段
從start_kernel函數開始,內核進入C語言部分,完成內核的大部分初始化工作。
函數所在位置:/linux/init/main.c
start_kernel涉及大量初始化工作,只例舉重要的初始化工作。
asmlinkage void __init start_kernel(void)
{
…… //類型判斷
smp_setup_processor_id(); //smp相關,返回啟動CPU號
……
local_irq_disable(); //關閉當前CPU中斷
early_boot_irqs_disabled = true;
/*
* Interrupts are still disabled. Do necessary setups, then
* enable them
*/
boot_cpu_init();
page_address_init(); //初始化頁地址
pr_notice("%s", linux_banner); //顯示內核版本信息
setup_arch(&command_line);
mm_init_owner(&init_mm, &init_task);
mm_init_cpumask(&init_mm);
setup_command_line(command_line);
setup_nr_cpu_ids();
setup_per_cpu_areas();
smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
build_all_zonelists(NULL, NULL);
page_alloc_init(); //頁內存申請初始化
pr_notice("Kernel command line: %s\n", boot_command_line); //打印內核啟動命令行參數
parse_early_param();
parse_args("Booting kernel", static_command_line, __start___param,
__stop___param - __start___param,
-1, -1, &unknown_bootoption);
……
/*
* Set up the scheduler prior starting any interrupts (such as the
* timer interrupt). Full topology setup happens at smp_init()
* time - but meanwhile we still have a functioning scheduler.
*/
sched_init(); //進程調度器初始化
/*
* Disable preemption - early bootup scheduling is extremely
* fragile until we cpu_idle() for the first time.
*/
preempt_disable(); //禁止內核搶占
if (WARN(!irqs_disabled(), "Interrupts were enabled *very* early, fixing it\n"))
local_irq_disable(); //檢查關閉CPU中斷
/*大量初始化內容 見名知意*/
idr_init_cache();
rcu_init();
tick_nohz_init();
context_tracking_init();
radix_tree_init();
/* init some links before init_ISA_irqs() */
early_irq_init();
init_IRQ();
tick_init();
init_timers();
hrtimers_init();
softirq_init();
timekeeping_init();
time_init();
sched_clock_postinit();
perf_event_init();
profile_init();
call_function_init();
WARN(!irqs_disabled(), "Interrupts were enabled early\n");
early_boot_irqs_disabled = false;
local_irq_enable(); //本地中斷可以使用了
kmem_cache_init_late();
/*
* HACK ALERT! This is early. We're enabling the console before
* we've done PCI setups etc, and console_init() must be aware of
* this. But we do want output early, in case something goes wrong.
*/
console_init(); //初始化控制台,可以使用printk了
if (panic_later)
panic("Too many boot %s vars at `%s'", panic_later,
panic_param);
lockdep_info();
/*
* Need to run this when irqs are enabled, because it wants
* to self-test [hard/soft]-irqs on/off lock inversion bugs
* too:
*/
locking_selftest();
#ifdef CONFIG_BLK_DEV_INITRD
if (initrd_start && !initrd_below_start_ok &&
page_to_pfn(virt_to_page((void *)initrd_start)) < min_low_pfn) {
pr_crit("initrd overwritten (0x%08lx < 0x%08lx) - disabling it.\n",
page_to_pfn(virt_to_page((void *)initrd_start)),
min_low_pfn);
initrd_start = 0;
}
#endif
page_cgroup_init();
debug_objects_mem_init();
kmemleak_init();
setup_per_cpu_pageset();
numa_policy_init();
if (late_time_init)
late_time_init();
sched_clock_init();
calibrate_delay();
pidmap_init();
anon_vma_init();
acpi_early_init();
#ifdef CONFIG_X86
if (efi_enabled(EFI_RUNTIME_SERVICES))
efi_enter_virtual_mode();
#endif
#ifdef CONFIG_X86_ESPFIX64
/* Should be run before the first non-init thread is created */
init_espfix_bsp();
#endif
thread_info_cache_init();
cred_init();
fork_init(totalram_pages); //初始化fork
proc_caches_init();
buffer_init();
key_init();
security_init();
dbg_late_init();
vfs_caches_init(totalram_pages); //虛擬文件系統初始化
signals_init();
/* rootfs populating might need page-writeback */
page_writeback_init();
#ifdef CONFIG_PROC_FS
proc_root_init();
#endif
cgroup_init();
cpuset_init();
taskstats_init_early();
delayacct_init();
check_bugs();
sfi_init_late();
if (efi_enabled(EFI_RUNTIME_SERVICES)) {
efi_late_init();
efi_free_boot_services();
}
ftrace_init();
/* Do the rest non-__init'ed, we're now alive */
rest_init();
}
函數最后調用rest_init()函數
/*
 * Main job: create the kernel_init thread and carry on with the rest of
 * the initialization.
 *
 * Spawns kernel_init first and kthreadd second, stores kthreadd's task in
 * kthreadd_task, signals kthreadd_done, and finally passes the calling
 * (boot) context to cpu_startup_entry().
 */
static noinline void __init_refok rest_init(void)
{
int pid; /* pid returned when kthreadd is spawned below */
rcu_scheduler_starting();
/*
* We need to spawn init first so that it obtains pid 1, however
* the init task will end up wanting to create kthreads, which, if
* we schedule it before we create kthreadd, will OOPS.
*/
kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND); /* create the kernel_init thread */
numa_default_policy();
pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
/* The pid -> task lookup is done between rcu_read_lock()/rcu_read_unlock(). */
rcu_read_lock();
kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns);
rcu_read_unlock();
/* Signal kthreadd_done now that kthreadd_task has been set. */
complete(&kthreadd_done);
/*
* The boot idle thread must execute schedule()
* at least once to get things moving:
*/
init_idle_bootup_task(current);
schedule_preempt_disabled();
/* Call into cpu_idle with preempt disabled */
/*
 * Article author's note: cpu_idle is the idle loop that runs while the
 * system has nothing to do, reducing power use and heat. According to the
 * author, this function does not return past this point; the remaining
 * work is driven from the kernel_init thread.
 */
cpu_startup_entry(CPUHP_ONLINE);
}
kernel_init函數將完成設備驅動程序的初始化,並通過run_init_process/try_to_run_init_process啟動第一個用戶進程。
部分書籍介紹的內核啟動流程基於經典的2.6版本:在該版本中,kernel_init函數還會調用init_post函數專門負責_init進程的啟動;在現版本中,這部分邏輯已經被整合到kernel_init函數之中。
/*
 * Thread function passed to kernel_thread() by rest_init(): finishes the
 * remaining initialization, then tries to start the first user-space
 * program.
 *
 * @unused: not referenced in the body.
 *
 * Returns 0 as soon as one candidate init program has been started
 * successfully. If every candidate fails, panic() is called.
 */
static int __ref kernel_init(void *unused)
{
int ret;
kernel_init_freeable(); /* per the article: SMP bring-up, driver init, shared-memory init, etc. happen in here */
/* need to finish all async __init code before freeing the memory */
async_synchronize_full();
free_initmem(); /* end of initialization: discard init-only data that is no longer needed */
mark_rodata_ro();
system_state = SYSTEM_RUNNING;
numa_default_policy();
flush_delayed_fput();
/* First choice: the ramdisk init command, when one is set. */
if (ramdisk_execute_command) {
ret = run_init_process(ramdisk_execute_command);
if (!ret)
return 0;
pr_err("Failed to execute %s (error %d)\n",
ramdisk_execute_command, ret);
}
/*
* We try each of these until one succeeds.
*
* The Bourne shell can be used instead of init if we are
* trying to recover a really broken machine.
*
* Article author's note: look for an init program and start it as
* process 1, "_init" (the first user-space process).
*/
if (execute_command) {
ret = run_init_process(execute_command);
if (!ret)
return 0;
pr_err("Failed to execute %s (error %d). Attempting defaults...\n",
execute_command, ret);
}
/* Default candidates, tried in order; a zero return from any one ends the search. */
if (!try_to_run_init_process("/sbin/init") ||
!try_to_run_init_process("/etc/init") ||
!try_to_run_init_process("/bin/init") ||
!try_to_run_init_process("/bin/sh"))
return 0;
panic("No working init found. Try passing init= option to kernel. "
"See Linux Documentation/init.txt for guidance.");
}
/*
 * NOTE(review): this listing is an exact repeat of the kernel_init listing
 * directly above it in the article; it looks like an accidental duplicate.
 *
 * Thread function passed to kernel_thread() by rest_init(): finishes the
 * remaining initialization, then tries to start the first user-space
 * program.
 *
 * @unused: not referenced in the body.
 *
 * Returns 0 as soon as one candidate init program has been started
 * successfully. If every candidate fails, panic() is called.
 */
static int __ref kernel_init(void *unused)
{
int ret;
kernel_init_freeable(); /* per the article: SMP bring-up, driver init, shared-memory init, etc. happen in here */
/* need to finish all async __init code before freeing the memory */
async_synchronize_full();
free_initmem(); /* end of initialization: discard init-only data that is no longer needed */
mark_rodata_ro();
system_state = SYSTEM_RUNNING;
numa_default_policy();
flush_delayed_fput();
/* First choice: the ramdisk init command, when one is set. */
if (ramdisk_execute_command) {
ret = run_init_process(ramdisk_execute_command);
if (!ret)
return 0;
pr_err("Failed to execute %s (error %d)\n",
ramdisk_execute_command, ret);
}
/*
* We try each of these until one succeeds.
*
* The Bourne shell can be used instead of init if we are
* trying to recover a really broken machine.
*
* Article author's note: look for an init program and start it as
* process 1, "_init" (the first user-space process).
*/
if (execute_command) {
ret = run_init_process(execute_command);
if (!ret)
return 0;
pr_err("Failed to execute %s (error %d). Attempting defaults...\n",
execute_command, ret);
}
/* Default candidates, tried in order; a zero return from any one ends the search. */
if (!try_to_run_init_process("/sbin/init") ||
!try_to_run_init_process("/etc/init") ||
!try_to_run_init_process("/bin/init") ||
!try_to_run_init_process("/bin/sh"))
return 0;
panic("No working init found. Try passing init= option to kernel. "
"See Linux Documentation/init.txt for guidance.");
}
到此,內核初始化已經接近尾聲,所有的初始化函數都已經調用,因此free_initmem函數可以舍棄內存的__init_begin至__init_end之間的數據。
當內核被引導並進行初始化后,內核啟動了自己的第一個用戶空間應用程序_init,這是調用的第一個使用標准C庫編譯的程序,其進程編號始終為1。
_init負責觸發其他必需的進程,以使系統進入整體可用的狀態。
以下為內核啟動流程圖:
start_kernel()--->setup_arch()--->do_initcalls()--->customize_machine()--->mini6410_machine_init()
原文鏈接:https://blog.csdn.net/perfect1t/article/details/81741531