Keywords: kthread, irq, ksoftirqd, kworker, workqueues
When you inspect threads with ps, quite a few show names wrapped in [...]; unlike the other threads, these are kernel threads.
For most kernel threads the name alone reveals the main function.
Examples are the irq kernel threads used for threaded interrupt handling, the ksoftirqd kernel threads used by softirqs, and the kworker kernel threads that execute work items.
This article first takes a quick look at which kernel threads Linux has, then analyzes the API for creating kernel threads.
Next it discusses how kernel threads differ from ordinary threads.
Finally it covers how the main kernel threads (irq, ksoftirqd, kworker) are created and what they do.
1. A first look at Linux kernel threads with ps
ps -a shows output like the following; kernel threads are all marked with [...].
The init process (pid=1) is the parent of every user-space process, and the kthreadd kernel thread (pid=2) is the parent of every kernel thread.
Kernel threads fall into a few broad groups: softirq, kworker, irq, and the rest.
PID   USER     TIME   COMMAND
    1 0        0:01   {linuxrc} init
    2 0        0:00   [kthreadd]
    3 0        0:00   [ksoftirqd/0]
    4 0        0:00   [kworker/0:0]
    5 0        0:00   [kworker/0:0H]
    6 0        0:00   [kworker/u8:0]
    7 0        0:00   [rcu_sched]
    8 0        0:00   [rcu_bh]
    9 0        0:00   [migration/0]
   10 0        0:00   [migration/1]
   11 0        0:00   [ksoftirqd/1]
   12 0        0:00   [kworker/1:0]
   13 0        0:00   [kworker/1:0H]
   14 0        0:00   [migration/2]
   15 0        0:00   [ksoftirqd/2]
   16 0        0:00   [kworker/2:0]
   17 0        0:00   [kworker/2:0H]
   18 0        0:00   [migration/3]
   19 0        0:00   [ksoftirqd/3]
   20 0        0:00   [kworker/3:0]
   21 0        0:00   [kworker/3:0H]
   22 0        0:00   [khelper]
   23 0        0:00   [kdevtmpfs]
   24 0        0:00   [perf]
   25 0        0:00   [kworker/u8:1]
  279 0        0:00   [khungtaskd]
  280 0        0:00   [writeback]
  281 0        0:00   [kintegrityd]
  282 0        0:00   [kworker/0:1]
  284 0        0:00   [bioset]
  286 0        0:00   [kblockd]
  294 0        0:00   [ata_sff]
  408 0        0:00   [rpciod]
  409 0        0:00   [kworker/2:1]
  410 0        0:00   [kworker/1:1]
  412 0        0:00   [kswapd0]
  416 0        0:00   [fsnotify_mark]
  429 0        0:00   [nfsiod]
  449 0        0:00   [kworker/3:1]
  527 0        0:00   [kpsmoused]
  537 0        0:00   [kworker/1:2]
  613 0        0:00   [deferwq]
2. kthreadd and the APIs for creating kernel threads
2.1 kthreadd: creating the kthreadd kernel thread
The creation of every other kernel thread goes through kthreadd; the kthreadd thread is the parent of all other kernel threads.
It is spawned from start_kernel-->rest_init as follows:
static noinline void __init_refok rest_init(void)
{
	int pid;

	rcu_scheduler_starting();
	/*
	 * We need to spawn init first so that it obtains pid 1, however
	 * the init task will end up wanting to create kthreads, which, if
	 * we schedule it before we create kthreadd, will OOPS.
	 */
	kernel_thread(kernel_init, NULL, CLONE_FS);--------------------------------Create the first user-space process: init.
	numa_default_policy();
	pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);---------------Create the first kernel thread: kthreadd.
	rcu_read_lock();
	kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns);--------------------kthreadd_task points to kthreadd's task_struct.
	rcu_read_unlock();
	complete(&kthreadd_done);--------------------------------------------------kernel_init-->kernel_init_freeable waits on kthreadd_done.
	/*
	 * The boot idle thread must execute schedule()
	 * at least once to get things moving:
	 */
	init_idle_bootup_task(current);
	schedule_preempt_disabled();
	/* Call into cpu_idle with preempt disabled */
	cpu_startup_entry(CPUHP_ONLINE);
}
kernel_init is started before kthreadd, yet much of kernel_init's work depends on kthreadd, so at its very beginning kernel_init waits on the kthreadd_done completion that rest_init signals.
This is because many initializations in kernel_init-->kernel_init_freeable-->do_basic_setup-->do_initcalls need kthread_create() support.
kernel_init-->kernel_init_freeable:

static noinline void __init kernel_init_freeable(void)
{
	/*
	 * Wait until kthreadd is all set-up.
	 */
	wait_for_completion(&kthreadd_done);-------------------Wait on the kthreadd_done completion.
	...
	do_basic_setup();--------------------------------------Many of the initcalls run here depend on kthread_create().
	...
}
The kernel thread referenced by kthreadd_task is responsible for creating all other kernel threads; its thread function is kthreadd():
int kthreadd(void *unused)
{
	struct task_struct *tsk = current;

	/* Setup a clean context for our children to inherit. */
	set_task_comm(tsk, "kthreadd");
	ignore_signals(tsk);
	set_cpus_allowed_ptr(tsk, cpu_all_mask);
	set_mems_allowed(node_states[N_MEMORY]);

	current->flags |= PF_NOFREEZE;

	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (list_empty(&kthread_create_list))
			schedule();----------------------------------------------If kthread_create_list is empty, give up the CPU and sleep. kthread_create_on_node() adds a request node to kthread_create_list and then wakes this thread.
		__set_current_state(TASK_RUNNING);

		spin_lock(&kthread_create_lock);
		while (!list_empty(&kthread_create_list)) {------------------Walk kthread_create_list for as long as it is non-empty.
			struct kthread_create_info *create;

			create = list_entry(kthread_create_list.next,
					    struct kthread_create_info, list);
			list_del_init(&create->list);----------------------------Remove the current create node from kthread_create_list.
			spin_unlock(&kthread_create_lock);

			create_kthread(create);----------------------------------Create the thread.

			spin_lock(&kthread_create_lock);
		}
		spin_unlock(&kthread_create_lock);
	}

	return 0;
}

static void create_kthread(struct kthread_create_info *create)
{
	int pid;

#ifdef CONFIG_NUMA
	current->pref_node_fork = create->node;
#endif
	/* We want our own signal handler (we take no signals by default). */
	pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);----Create the thread via do_fork().
	if (pid < 0) {
		/* If user was SIGKILLed, I release the structure. */
		struct completion *done = xchg(&create->done, NULL);

		if (!done) {
			kfree(create);
			return;
		}
		create->result = ERR_PTR(pid);
		complete(done);--------------------------------------------------------Signal the completion.
	}
}

pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
	return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
		(unsigned long)arg, NULL, NULL);
}
2.2 Interfaces for creating kernel threads: kthread_create() and friends
kthread_create() is the most common interface for creating a kernel thread.
kthread_create_on_cpu() takes an additional cpu argument compared with kthread_create(), but both are built on kthread_create_on_node().
kthread_run() is in turn built on kthread_create(), so all of these interfaces ultimately funnel into kthread_create_on_node().
#define kthread_create(threadfn, data, namefmt, arg...) \
	kthread_create_on_node(threadfn, data, -1, namefmt, ##arg)

struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
					  void *data,
					  unsigned int cpu,
					  const char *namefmt);

/**
 * kthread_run - create and wake a thread.
 * @threadfn: the function to run until signal_pending(current).
 * @data: data ptr for @threadfn.
 * @namefmt: printf-style name for the thread.
 *
 * Description: Convenient wrapper for kthread_create() followed by
 * wake_up_process().  Returns the kthread or ERR_PTR(-ENOMEM).
 */
#define kthread_run(threadfn, data, namefmt, ...)			   \
({									   \
	struct task_struct *__k						   \
		= kthread_create(threadfn, data, namefmt, ## __VA_ARGS__); \
	if (!IS_ERR(__k))						   \--------------------------If kthread_create() successfully created the thread, wake it with wake_up_process().
		wake_up_process(__k);					   \
	__k;								   \
})
kthread_create_on_node() fills in a kthread_create_info structure and inserts it as a node at the tail of kthread_create_list.
It then wakes kthreadd_task, which processes the list and creates the thread.
struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
					   void *data, int node,
					   const char namefmt[],
					   ...)
{
	DECLARE_COMPLETION_ONSTACK(done);
	struct task_struct *task;
	struct kthread_create_info *create = kmalloc(sizeof(*create),
						     GFP_KERNEL);---------------------------------Allocate the node to insert into kthread_create_list.

	if (!create)
		return ERR_PTR(-ENOMEM);
	create->threadfn = threadfn;
	create->data = data;
	create->node = node;
	create->done = &done;

	spin_lock(&kthread_create_lock);
	list_add_tail(&create->list, &kthread_create_list);-------------------Insert the filled-in node at the tail of kthread_create_list.
	spin_unlock(&kthread_create_lock);

	wake_up_process(kthreadd_task);---------------------------------------Wake kthreadd_task to process kthread_create_list and create the thread.
	/*
	 * Wait for completion in killable state, for I might be chosen by
	 * the OOM killer while kthreadd is trying to allocate memory for
	 * new kernel thread.
	 */
	if (unlikely(wait_for_completion_killable(&done))) {------------------Wait for the completion, which is signalled in create_kthread().
		/*
		 * If I was SIGKILLed before kthreadd (or new kernel thread)
		 * calls complete(), leave the cleanup of this structure to
		 * that thread.
		 */
		if (xchg(&create->done, NULL))
			return ERR_PTR(-EINTR);
		/*
		 * kthreadd (or new kernel thread) will call complete()
		 * shortly.
		 */
		wait_for_completion(&done);---------------------------------------Wait for the completion.
	}
	task = create->result;------------------------------------------------The result is the new thread's task_struct.
	if (!IS_ERR(task)) {
		static const struct sched_param param = { .sched_priority = 0 };
		va_list args;

		va_start(args, namefmt);
		vsnprintf(task->comm, sizeof(task->comm), namefmt, args);---------Set the thread name.
		va_end(args);
		/*
		 * root may have changed our (kthreadd's) priority or CPU mask.
		 * The kernel thread should not inherit these properties.
		 */
		sched_setscheduler_nocheck(task, SCHED_NORMAL, &param);-----------Set scheduling policy to SCHED_NORMAL, priority 0.
		set_cpus_allowed_ptr(task, cpu_all_mask);
	}
	kfree(create);--------------------------------------------------------Free the kthread_create_info.
	return task;
}
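To make these interfaces concrete, here is a minimal sketch of a module that starts a thread with kthread_run() and stops it with kthread_stop(); the demo_* names are hypothetical, not kernel code:

#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/err.h>
#include <linux/delay.h>

static struct task_struct *demo_task;

static int demo_thread_fn(void *data)
{
	/* Loop until someone calls kthread_stop() on this thread. */
	while (!kthread_should_stop()) {
		pr_info("demo kthread tick\n");
		msleep(1000);
	}
	return 0;
}

static int __init demo_init(void)
{
	/* kthread_create() plus wake_up_process(); shows up as "demo/0" in ps. */
	demo_task = kthread_run(demo_thread_fn, NULL, "demo/%d", 0);
	return IS_ERR(demo_task) ? PTR_ERR(demo_task) : 0;
}

static void __exit demo_exit(void)
{
	kthread_stop(demo_task);	/* wakes the thread and waits for it to exit */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

Behind the scenes this takes exactly the path described above: kthread_create_on_node() queues a kthread_create_info node and kthreadd forks the new thread.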
3. Differences between kernel threads and ordinary threads
A kernel thread has no user address space, so its task_struct->mm pointer is NULL; it carries no user context.
A kernel thread runs only in kernel space and never switches to user space, yet it is scheduled and preempted like any other thread.
An ordinary thread can run in both kernel space and user space.
On a 32-bit system with the common 3G/1G split, kernel threads only touch addresses above 3GB, whereas an ordinary thread can access the full 4GB address space.
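As a small in-kernel illustration of the mm distinction (a sketch; the helper name is hypothetical, and PF_KTHREAD is the flag the kernel's own code normally tests):

#include <linux/sched.h>

/* A kernel thread owns no user address space, so tsk->mm is NULL;
 * checking the PF_KTHREAD flag is the canonical in-kernel test. */
static inline bool demo_is_kernel_thread(struct task_struct *tsk)
{
	return (tsk->flags & PF_KTHREAD) || tsk->mm == NULL;
}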
4. The irq, softirq, and worker kernel threads
irq, softirq, and worker handling can each spawn their own kernel threads, and every thread has a priority.
Their priorities give a sense of their relative importance.
As the table below shows, the threaded-interrupt kernel threads run at the high priority of 49 under the real-time SCHED_FIFO policy, while the softirq and worker threads are ordinary SCHED_NORMAL kernel threads.
              | prio | policy
irq           | 49   | SCHED_FIFO
softirq       | 120  | SCHED_NORMAL
worker        | 120  | SCHED_NORMAL
init          | 120  | SCHED_NORMAL
kthreadd      | 120  | SCHED_NORMAL
cfinteractive | 0    | SCHED_FIFO
Of the other entries, the init process and kthreadd both have the default priority 120.
cfinteractive has the highest priority of all (prio 0, SCHED_FIFO); it mainly handles CPU frequency load updates.
4.1 irq/xx-xx: threads for threaded interrupt handlers
In request_threaded_irq-->__setup_irq, if thread_fn is set and the interrupt does not allow nesting, a kernel thread named like "irq/<irq number>-<irq name>" is created.
Its thread function is irq_thread():
/*
 * Internal function to register an irqaction - typically used to
 * allocate special interrupts that are part of the architecture.
 */
static int
__setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
{
...
	if (new->thread_fn && !nested) {
		struct task_struct *t;
		static const struct sched_param param = {
			.sched_priority = MAX_USER_RT_PRIO/2,
		};

		t = kthread_create(irq_thread, new, "irq/%d-%s", irq,----------------irq_thread() calls irq_thread_fn(), which in turn calls action->thread_fn, the thread_fn argument passed to request_threaded_irq().
				   new->name);
...
	}
...
}
request_irq() is a thin wrapper around request_threaded_irq(); the actual creation of the interrupt thread is left to __setup_irq():
static inline int __must_check
request_irq(unsigned int irq, irq_handler_t handler, unsigned long flags,
	    const char *name, void *dev)
{
	return request_threaded_irq(irq, handler, NULL, flags, name, dev);
}
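As a usage sketch (the IRQ number and all demo_* names are hypothetical): a driver that passes a thread_fn to request_threaded_irq() causes __setup_irq() to spawn the corresponding "irq/<nr>-demo" thread:

#include <linux/interrupt.h>

#define DEMO_IRQ 42	/* hypothetical interrupt number */

static irqreturn_t demo_hardirq(int irq, void *dev_id)
{
	/* Hard-IRQ context: acknowledge the device, defer the heavy work. */
	return IRQ_WAKE_THREAD;
}

static irqreturn_t demo_thread_fn(int irq, void *dev_id)
{
	/* Runs in the "irq/42-demo" kernel thread and may sleep. */
	return IRQ_HANDLED;
}

static int demo_request(void *dev_id)
{
	return request_threaded_irq(DEMO_IRQ, demo_hardirq, demo_thread_fn,
				    0, "demo", dev_id);
}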
For more detail, see the request_irq() discussion in 《Linux中斷管理 (1)Linux中斷管理機制》.
4.2 ksoftirqd/xx: the softirq handling threads
The softirq threads are created by registering softirq_threads through smpboot_register_percpu_thread():
static struct smp_hotplug_thread softirq_threads = {
	.store			= &ksoftirqd,
	.thread_should_run	= ksoftirqd_should_run,
	.thread_fn		= run_ksoftirqd,
	.thread_comm		= "ksoftirqd/%u",
};

static __init int spawn_ksoftirqd(void)
{
	register_cpu_notifier(&cpu_nfb);

	BUG_ON(smpboot_register_percpu_thread(&softirq_threads));

	return 0;
}
smpboot_register_percpu_thread-->__smpboot_create_thread ultimately calls kthread_create_on_cpu(), creating kernel threads named "ksoftirqd/xx", where xx is the CPU id.
The result shows up in ps -a as one ksoftirqd kernel thread per CPU:
    3 0        0:03   [ksoftirqd/0]
   11 0        0:03   [ksoftirqd/1]
   15 0        0:00   [ksoftirqd/2]
   19 0        0:00   [ksoftirqd/3]
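To see ksoftirqd exercised, here is a tasklet sketch for the kernel generation covered in this article (demo_* names hypothetical): tasklet_schedule() raises TASKLET_SOFTIRQ, and under load its handling is deferred to the current CPU's ksoftirqd/N thread:

#include <linux/interrupt.h>

static void demo_tasklet_fn(unsigned long data)
{
	/* Softirq context; under heavy load this runs inside ksoftirqd/N. */
}

static DECLARE_TASKLET(demo_tasklet, demo_tasklet_fn, 0);

static void demo_bottom_half(void)
{
	tasklet_schedule(&demo_tasklet);	/* raises TASKLET_SOFTIRQ */
}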
For more detail, see 《Linux中斷管理 (2)軟中斷和tasklet》.
4.3 kworker: worker threads for work items
The kworker threads are the worker threads that execute work items; see 《Linux中斷管理 (3)workqueue工作隊列》 for details.
Each CPU gets its own worker pools, which collectively service the kernel's work items.
A workqueue defers tasks (work items) to one kernel thread or a group of them; such a kernel thread is called a worker thread.
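A minimal sketch of that deferral (demo_* names hypothetical): schedule_work() queues a work item on system_wq, and one of the per-CPU kworker threads later executes it in process context:

#include <linux/module.h>
#include <linux/smp.h>
#include <linux/workqueue.h>

static void demo_work_fn(struct work_struct *work)
{
	/* Executed later in process context by a kworker/N:M thread. */
	pr_info("demo work on CPU %d\n", raw_smp_processor_id());
}

static DECLARE_WORK(demo_work, demo_work_fn);

static int __init demo_init(void)
{
	schedule_work(&demo_work);	/* queue onto system_wq */
	return 0;
}

static void __exit demo_exit(void)
{
	cancel_work_sync(&demo_work);	/* make sure it is no longer pending */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");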
First look at the creation results. init_workqueues creates two kworkers bound to CPU0, one with nice=0 and one with nice=-20.
apply_workqueue_attrs creates the unbound worker, i.e. kworker/u8:0.
Each subsequent CPU_UP_PREPARE callback then creates two kworkers with different nice values for the newly onlined CPU, so four CPUs yield nine kernel threads in total:
PID   USER     TIME   COMMAND
    1 0        0:01   {linuxrc} init
    2 0        0:00   [kthreadd]
    3 0        0:00   [ksoftirqd/0]
    4 0        0:00   [kworker/0:0]
    5 0        0:00   [kworker/0:0H]---------------init_workqueues-->create_worker
    6 0        0:00   [kworker/u8:0]---------------apply_workqueue_attrs-->alloc_unbound_pwq-->create_worker
    7 0        0:00   [rcu_sched]
    8 0        0:00   [rcu_bh]
    9 0        0:00   [migration/0]
   10 0        0:00   [migration/1]
   11 0        0:00   [ksoftirqd/1]
   12 0        0:00   [kworker/1:0]---------------workqueue_cpu_up_callback-->create_worker
   13 0        0:00   [kworker/1:0H]
   14 0        0:00   [migration/2]
   15 0        0:00   [ksoftirqd/2]
   16 0        0:00   [kworker/2:0]
   17 0        0:00   [kworker/2:0H]--------------workqueue_cpu_up_callback-->create_worker
   18 0        0:00   [migration/3]
   19 0        0:00   [ksoftirqd/3]
   20 0        0:00   [kworker/3:0]
   21 0        0:00   [kworker/3:0H]--------------workqueue_cpu_up_callback-->create_worker
   22 0        0:00   [khelper]
   23 0        0:00   [kdevtmpfs]
   24 0        0:00   [perf]
   25 0        0:00   [kworker/u8:1]--------------worker_thread-->create_worker
  279 0        0:00   [khungtaskd]
  280 0        0:00   [writeback]
  281 0        0:00   [kintegrityd]
  282 0        0:00   [kworker/0:1]---------------worker_thread-->create_worker
  284 0        0:00   [bioset]
  286 0        0:00   [kblockd]
  294 0        0:00   [ata_sff]
  408 0        0:00   [rpciod]
  409 0        0:00   [kworker/2:1]---------------worker_thread-->create_worker
  410 0        0:00   [kworker/1:1]---------------worker_thread-->create_worker
  412 0        0:00   [kswapd0]
  416 0        0:00   [fsnotify_mark]
  429 0        0:00   [nfsiod]
  449 0        0:00   [kworker/3:1]---------------worker_thread-->create_worker
  527 0        0:00   [kpsmoused]
  537 0        0:00   [kworker/1:2]---------------worker_thread-->create_worker
  613 0        0:00   [deferwq]
init_workqueues-->create_worker-->kthread_create_on_node creates the "kworker/x:y" (and high-priority "kworker/x:yH") kernel threads:
static int __init init_workqueues(void)
{
	int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
	int i, cpu;
...
	/* create the initial worker */
	for_each_online_cpu(cpu) {---------------------------------Iterate over CPUs 0~3.
		struct worker_pool *pool;

		for_each_cpu_worker_pool(pool, cpu) {------------------NR_STD_WORKER_POOLS=2, so each CPU has two pools.
			pool->flags &= ~POOL_DISASSOCIATED;
			BUG_ON(!create_worker(pool));
		}
	}
...
	system_wq = alloc_workqueue("events", 0, 0);
	system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
	system_long_wq = alloc_workqueue("events_long", 0, 0);
	system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
					    WQ_UNBOUND_MAX_ACTIVE);
	system_freezable_wq = alloc_workqueue("events_freezable",
					      WQ_FREEZABLE, 0);
	system_power_efficient_wq = alloc_workqueue("events_power_efficient",
					      WQ_POWER_EFFICIENT, 0);
	system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient",
					      WQ_FREEZABLE | WQ_POWER_EFFICIENT,
					      0);
	BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
	       !system_unbound_wq || !system_freezable_wq ||
	       !system_power_efficient_wq ||
	       !system_freezable_power_efficient_wq);
	return 0;
}
The create_worker() function creates a worker thread:
static struct worker *create_worker(struct worker_pool *pool)
{
...
	if (pool->cpu >= 0)
		snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id,-------------CPU id plus worker id, distinguishing the CPU and the kworker within it.
			 pool->attrs->nice < 0  ? "H" : "");
	else
		snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id);--------------The "u" prefix marks workers not bound to any CPU.

	worker->task = kthread_create_on_node(worker_thread, worker, pool->node,
					      "kworker/%s", id_buf);
...
}
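Tying this back to the thread names: work queued on an unbound workqueue is served by the kworker/uX:Y pool threads, while system_wq uses the per-CPU kworker/N:M threads. A sketch, with hypothetical demo_* names:

#include <linux/workqueue.h>

static struct workqueue_struct *demo_wq;

static int demo_setup(void)
{
	/* WQ_UNBOUND work is not pinned to the submitting CPU; it is
	 * picked up by the kworker/uX:Y pool threads instead. */
	demo_wq = alloc_workqueue("demo_unbound", WQ_UNBOUND, 0);
	return demo_wq ? 0 : -ENOMEM;
}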
For more detail, see 《Linux中斷管理 (3)workqueue工作隊列》, 《Linux workqueue工作原理》, and 《Concurrency Managed Workqueue之(一):workqueue的基本概念》.
5. Other kernel threads
rcu_sched, rcu_bh: RCU grace-period kthreads for the sched and bh RCU flavors.
migration: per-CPU stop-class threads used to migrate tasks between CPUs.
khelper: runs usermode helpers (call_usermodehelper).
kdevtmpfs: creates device nodes in devtmpfs.
perf: perf events support.
writeback: flushes dirty pages back to storage.
kintegrityd: block-layer data-integrity processing.
bioset: rescuer threads for bio mempool allocation.
kblockd: the block layer's workqueue.
ata_sff: libata SFF (legacy PIO/DMA) processing.
rpciod: SUNRPC I/O daemon.
kswapd: per-node page reclaim daemon.
nfsiod: NFS asynchronous I/O daemon.
kpsmoused: psmouse recovery/resync workqueue.
deferwq: deferred device-probe workqueue.