Based on Arm64 linux-5.10
I. Main implementation files
1. Location of preempt_count
//arch/arm64/include/asm/preempt.h
static inline int preempt_count(void)
{
	return READ_ONCE(current_thread_info()->preempt.count);
	//i.e. ((struct thread_info *)current)->preempt.count
}
Note: this is a per-task field (arm64 keeps preempt_count in thread_info, which is embedded in task_struct; x86, by contrast, keeps it in a per-CPU variable).
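For reference, this is (abridged) how the field is laid out in arch/arm64/include/asm/thread_info.h in 5.10: a 64-bit union whose low half is the count discussed below and whose high half carries the need_resched flag.

//arch/arm64/include/asm/thread_info.h (abridged)
struct thread_info {
	unsigned long		flags;		/* low level flags */
	...
	union {
		u64		preempt_count;	/* 0 => preemptible */
		struct {
#ifdef CONFIG_CPU_BIG_ENDIAN
			u32	need_resched;
			u32	count;
#else
			u32	count;		/* what preempt_count() reads */
			u32	need_resched;
#endif
		} preempt;
	};
	...
};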
2. Related implementation
include/linux/preempt.h, keeping only the parts relevant when preemption is enabled:
//include/linux/preempt.h (trimmed to the parts relevant on a preemptible kernel)
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_PREEMPT_H
#define __LINUX_PREEMPT_H

/*
 * include/linux/preempt.h - macros for accessing and manipulating
 * preempt_count (used for kernel preemption, interrupt count, etc.)
 */

#include <linux/linkage.h>
#include <linux/list.h>

/*
 * We put the hardirq and softirq counter into the preemption
 * counter. The bitmask has the following meaning:
 *
 * - bits 0-7 are the preemption count (max preemption depth: 256)
 * - bits 8-15 are the softirq count (max # of softirqs: 256)
 *
 * The hardirq count could in theory be the same as the number of
 * interrupts in the system, but we run all interrupt handlers with
 * interrupts disabled, so we cannot have nesting interrupts. Though
 * there are a few palaeontologic drivers which reenable interrupts in
 * the handler, so we need more than one bit here.
 *
 *         PREEMPT_MASK:	0x000000ff
 *         SOFTIRQ_MASK:	0x0000ff00
 *         HARDIRQ_MASK:	0x000f0000
 *             NMI_MASK:	0x00f00000
 * PREEMPT_NEED_RESCHED:	0x80000000 //bit31, the top bit of an int; but arm64's <asm/preempt.h> defines it as BIT(32)!!
 */
#define PREEMPT_BITS	8
#define SOFTIRQ_BITS	8
#define HARDIRQ_BITS	4
#define NMI_BITS	4

#define PREEMPT_SHIFT	0
#define SOFTIRQ_SHIFT	(PREEMPT_SHIFT + PREEMPT_BITS)	//8
#define HARDIRQ_SHIFT	(SOFTIRQ_SHIFT + SOFTIRQ_BITS)	//16
#define NMI_SHIFT	(HARDIRQ_SHIFT + HARDIRQ_BITS)	//20

#define __IRQ_MASK(x)	((1UL << (x))-1)

#define PREEMPT_MASK	(__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT)	//bit0-bit7   0x000000ff
#define SOFTIRQ_MASK	(__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)	//bit8-bit15  0x0000ff00
#define HARDIRQ_MASK	(__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)	//bit16-bit19 0x000f0000
#define NMI_MASK	(__IRQ_MASK(NMI_BITS) << NMI_SHIFT)		//bit20-bit23 0x00f00000

#define PREEMPT_OFFSET	(1UL << PREEMPT_SHIFT)	//1<<0
#define SOFTIRQ_OFFSET	(1UL << SOFTIRQ_SHIFT)	//1<<8
#define HARDIRQ_OFFSET	(1UL << HARDIRQ_SHIFT)	//1<<16
#define NMI_OFFSET	(1UL << NMI_SHIFT)	//1<<20

#define SOFTIRQ_DISABLE_OFFSET	(2 * SOFTIRQ_OFFSET)	//2*(1<<8) == 1<<9

#define PREEMPT_DISABLED	(PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)	//on arm64: 1 + (1<<32)

/*
 * Disable preemption until the scheduler is running -- use an unconditional
 * value so that it also works on !PREEMPT_COUNT kernels.
 *
 * Reset by start_kernel()->sched_init()->init_idle()->init_idle_preempt_count().
 */
#define INIT_PREEMPT_COUNT	PREEMPT_OFFSET	//1

/*
 * Initial preempt_count value; reflects the preempt_count schedule invariant
 * which states that during context switches:
 *
 *    preempt_count() == 2*PREEMPT_DISABLE_OFFSET
 *
 * Note: PREEMPT_DISABLE_OFFSET is 0 for !PREEMPT_COUNT kernels.
 * Note: See finish_task_switch().
 */
#define FORK_PREEMPT_COUNT	(2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)	//on arm64: 2 + (1<<32)

/* preempt_count() and related functions, depends on PREEMPT_NEED_RESCHED */
#include <asm/preempt.h>

#define hardirq_count()	(preempt_count() & HARDIRQ_MASK)
#define softirq_count()	(preempt_count() & SOFTIRQ_MASK)
#define irq_count()	(preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK | NMI_MASK))

/*
 * Are we doing bottom half or hardware interrupt processing?
 *
 * in_irq()             - We're in (hard) IRQ context
 * in_softirq()         - We have BH disabled, or are processing softirqs
 * in_interrupt()       - We're in NMI,IRQ,SoftIRQ context or have BH disabled
 * in_serving_softirq() - We're in softirq context
 * in_nmi()             - We're in NMI context
 * in_task()            - We're in task context
 *
 * Note: due to the BH disabled confusion: in_softirq(),in_interrupt() really
 *       should not be used in new code.
 */
//the various contexts
#define in_irq()		(hardirq_count())
#define in_softirq()		(softirq_count())
#define in_interrupt()		(irq_count())
#define in_serving_softirq()	(softirq_count() & SOFTIRQ_OFFSET)
#define in_nmi()		(preempt_count() & NMI_MASK)
#define in_task()		(!(preempt_count() & \
				   (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))

/*
 * The preempt_count offset after preempt_disable();
 */
#define PREEMPT_DISABLE_OFFSET	PREEMPT_OFFSET	//1

/*
 * The preempt_count offset after spin_lock()
 */
#define PREEMPT_LOCK_OFFSET	PREEMPT_DISABLE_OFFSET

/*
 * The preempt_count offset needed for things like:
 *
 *  spin_lock_bh()
 *
 * Which need to disable both preemption (CONFIG_PREEMPT_COUNT) and
 * softirqs, such that unlock sequences of:
 *
 *  spin_unlock();
 *  local_bh_enable();
 *
 * Work as expected.
 */
#define SOFTIRQ_LOCK_OFFSET	(SOFTIRQ_DISABLE_OFFSET + PREEMPT_LOCK_OFFSET)

/*
 * Are we running in atomic context? WARNING: this macro cannot
 * always detect atomic context; in particular, it cannot know about
 * held spinlocks in non-preemptible kernels. Thus it should not be
 * used in the general case to determine whether sleeping is possible.
 * Do not use in_atomic() in driver code.
 */
//test for atomic context
#define in_atomic()	(preempt_count() != 0)

/*
 * Check whether we were atomic before we did preempt_disable():
 * (used by the scheduler)
 */
#define in_atomic_preempt_off() (preempt_count() != PREEMPT_DISABLE_OFFSET)

#define preempt_count_add(val)		__preempt_count_add(val)
#define preempt_count_sub(val)		__preempt_count_sub(val)
#define preempt_count_dec_and_test()	__preempt_count_dec_and_test()

#define __preempt_count_inc()	__preempt_count_add(1)
#define __preempt_count_dec()	__preempt_count_sub(1)

#define preempt_count_inc()	preempt_count_add(1)
#define preempt_count_dec()	preempt_count_sub(1)

#define preempt_disable() \
do { \
	preempt_count_inc(); \
	barrier(); \
} while (0)

#define sched_preempt_enable_no_resched() \
do { \
	barrier(); \
	preempt_count_dec(); \
} while (0)

#define preempt_enable_no_resched() sched_preempt_enable_no_resched()

#define preemptible()	(preempt_count() == 0 && !irqs_disabled())

#define preempt_enable() \
do { \
	barrier(); \
	if (unlikely(preempt_count_dec_and_test())) \
		__preempt_schedule(); \
} while (0)

#define preempt_enable_notrace() \
do { \
	barrier(); \
	if (unlikely(__preempt_count_dec_and_test())) \
		__preempt_schedule_notrace(); \
} while (0)

#define preempt_check_resched() \
do { \
	if (should_resched(0)) \
		__preempt_schedule(); \
} while (0)

#define preempt_disable_notrace() \
do { \
	__preempt_count_inc(); \
	barrier(); \
} while (0)

#define preempt_enable_no_resched_notrace() \
do { \
	barrier(); \
	__preempt_count_dec(); \
} while (0)

#define preempt_set_need_resched() \
do { \
	set_preempt_need_resched(); \
} while (0)

#define preempt_fold_need_resched() \
do { \
	if (tif_need_resched()) \
		set_preempt_need_resched(); \
} while (0)

struct preempt_notifier;

/**
 * preempt_ops - notifiers called when a task is preempted and rescheduled
 * @sched_in: we're about to be rescheduled:
 *    notifier: struct preempt_notifier for the task being scheduled
 *    cpu:  cpu we're scheduled on
 * @sched_out: we've just been preempted
 *    notifier: struct preempt_notifier for the task being preempted
 *    next: the task that's kicking us out
 *
 * Please note that sched_in and out are called under different
 * contexts.  sched_out is called with rq lock held and irq disabled
 * while sched_in is called without rq lock and irq enabled.  This
 * difference is intentional and depended upon by its users.
 */
struct preempt_ops {
	void (*sched_in)(struct preempt_notifier *notifier, int cpu);
	void (*sched_out)(struct preempt_notifier *notifier,
			  struct task_struct *next);
};

/**
 * preempt_notifier - key for installing preemption notifiers
 * @link: internal use
 * @ops: defines the notifier functions to be called
 *
 * Usually used in conjunction with container_of().
 */
struct preempt_notifier {
	struct hlist_node link;
	struct preempt_ops *ops;
};

void preempt_notifier_inc(void);
void preempt_notifier_dec(void);

/*
 * tell me when current is being preempted & rescheduled.
 * Implemented in sched/core.c; registered from kvm/kvm_main.c.
 */
void preempt_notifier_register(struct preempt_notifier *notifier);

/*
 * no longer interested in preemption notifications.
 * Implemented in sched/core.c; unregistered from kvm/kvm_main.c.
 */
void preempt_notifier_unregister(struct preempt_notifier *notifier);

static inline void preempt_notifier_init(struct preempt_notifier *notifier,
				     struct preempt_ops *ops)
{
	INIT_HLIST_NODE(&notifier->link);
	notifier->ops = ops;
}

/**
 * migrate_disable - Prevent migration of the current task
 *
 * Maps to preempt_disable() which also disables preemption. Use
 * migrate_disable() to annotate that the intent is to prevent migration,
 * but not necessarily preemption.
 *
 * Can be invoked nested like preempt_disable() and needs the corresponding
 * number of migrate_enable() invocations.
 */
static __always_inline void migrate_disable(void)
{
	preempt_disable();
}

/**
 * migrate_enable - Allow migration of the current task
 *
 * Counterpart to migrate_disable().
 *
 * As migrate_disable() can be invoked nested, only the outermost invocation
 * reenables migration.
 *
 * Currently mapped to preempt_enable().
 */
static __always_inline void migrate_enable(void)
{
	preempt_enable();
}

#endif /* __LINUX_PREEMPT_H */
II. The bit fields of preempt_count
1. This per-task counter indicates the current thread's state: whether it can be preempted, and whether it is allowed to sleep.
The preempt_count member is used to judge whether the current process may be preempted. If preempt_count is non-zero (perhaps code has explicitly disabled preemption via preempt_disable(), or we are in interrupt context, etc.), preemption is not possible right now. If preempt_count is zero, the preconditions for preemption are met; whether the current process actually gets preempted then also depends on whether the _TIF_NEED_RESCHED flag is set in the flags member of its thread_info (set, for example, because the process exhausted its time slice, or because an interrupt woke a higher-priority process).
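As a minimal sketch (not actual kernel code; the real check is folded into __preempt_count_dec_and_test() and the reschedule paths), the two conditions combine roughly like this; the function name is made up:

#include <linux/preempt.h>
#include <linux/thread_info.h>

/* Illustrative only: preemption needs both a zero count and a pending request. */
static bool want_to_preempt_current(void)
{
	return preempt_count() == 0 &&			/* preemption is legal */
	       test_thread_flag(TIF_NEED_RESCHED);	/* and was requested   */
}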
2. The preemption-disable count field
This field occupies 8 bits and records how many times the current process has explicitly disabled preemption, i.e. the nesting depth: each call to preempt_disable() increments it by 1 and each call to preempt_enable() decrements it by 1. preempt_disable() and preempt_enable() must come in pairs; they may nest, up to a maximum depth of 255, since only 8 bits are available.
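A minimal sketch of such nesting (illustrative; the function is hypothetical):

#include <linux/preempt.h>

static void nesting_example(void)
{
	preempt_disable();	/* preemption count: 0 -> 1 */
	preempt_disable();	/* 1 -> 2, nested section */
	/* ... work that must not be preempted or migrated ... */
	preempt_enable();	/* 2 -> 1, still non-preemptible */
	preempt_enable();	/* 1 -> 0, a preemption point: may reschedule */
}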
3. Software interrupt count
This field occupies 8 bits and records the softirq nesting state of the currently running process. It is touched in two scenarios:
(1) The field is incremented by 1 before entering a softirq handler and decremented by 1 after leaving it. Softirq handlers never run concurrently on a given CPU; they always execute serially, so this scenario needs just one bit, namely bit 8. That bit tells us whether the current task is in softirq context.
(2) For kernel synchronization, process context sometimes needs softirqs disabled. For this the kernel provides the interface functions local_bh_disable() and local_bh_enable(). The concept is similar to preempt_disable()/preempt_enable(); this use occupies bits 9-15 and supports up to 127 levels of nesting.
Note: local_bh_disable() adds SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET, i.e. 512) to preempt_count, and local_bh_enable() subtracts the same amount. Using an offset of 2 in the softirq field leaves bit 8 meaning "currently serving a softirq", so "BH disabled" (bits 9-15) and "inside a softirq handler" (bit 8) can be told apart.
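The relevant 5.10 code, abridged from include/linux/bottom_half.h (the !CONFIG_TRACE_IRQFLAGS variant):

//include/linux/bottom_half.h (abridged)
static __always_inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
{
	preempt_count_add(cnt);
	barrier();
}

static inline void local_bh_disable(void)
{
	/* SOFTIRQ_DISABLE_OFFSET == 2 * SOFTIRQ_OFFSET == 512 (bit 9) */
	__local_bh_disable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
}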
4. The hardware interrupt count field
This field occupies 4 bits and records how many levels of hard interrupts have nested on top of the currently running process, i.e. the nesting depth of interrupt handlers. For the ARM64 kernel-5.10, the interrupt entry code looks like this:
//kernel/irq/irqdesc.c
int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq,
			bool lookup, struct pt_regs *regs)
{
	...
	struct irq_desc *desc;

	irq_enter();
	...
	generic_handle_irq_desc(desc);	//desc->handle_irq(desc)
	...
	irq_exit();
	...
}
The generic IRQ handler is bracketed by irq_enter() and irq_exit(): irq_enter() marks entry into IRQ context, and irq_exit() marks leaving it. irq_enter() calls preempt_count_add(HARDIRQ_OFFSET), adding 1 to the "hardware interrupt count" bit field, and irq_exit() calls preempt_count_sub(HARDIRQ_OFFSET) to subtract 1 again. With only 4 bits, hardware interrupt handlers can nest at most 15 deep.
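Following the call chain down, irq_enter() eventually reaches the __irq_enter() macro, abridged below from include/linux/hardirq.h; the irq_exit() path symmetrically ends up in preempt_count_sub(HARDIRQ_OFFSET) and may then run pending softirqs:

//include/linux/hardirq.h (abridged)
#define __irq_enter()					\
	do {						\
		account_irq_enter_time(current);	\
		preempt_count_add(HARDIRQ_OFFSET);	\
		lockdep_hardirq_enter();		\
	} while (0)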
In older kernels this field was 12 bits wide, supporting 4096 levels of nesting. Those kernels also distinguished fast and slow interrupt handlers, and in theory the maximum handler nesting depth equals the number of IRQs in the system. In practice the depth can never get that large (the kernel stack could not take it), so even on systems with very many interrupts they will not all nest to the theoretical limit. With that in mind, the field was later reduced to 10 bits (in the generic arch code; each arch may redefine its own hardirq count width). Once the kernel developers decided to drop slow interrupt handlers, interrupt nesting effectively stopped happening, so in theory hardirq count is now either 0 or 1. Theory aside, if some quirky or ancient driver re-enables interrupts in its handler, nesting can still occur, but not deeply; the current 4 bits are plenty to cope with 15 such drivers.
5. The reschedule-needed field
The top bit, bit 31 in the generic layout (on arm64 it actually sits at BIT(32); see the abridged code below), is the "reschedule needed" bit. It tells the kernel that a higher-priority process should be given the CPU at the first opportunity. This bit is only set while preempt_count is non-zero; otherwise the kernel could simply preempt the task right away, with no need to set the bit and wait.
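On arm64 the flag lives in the upper half of the 64-bit field shown earlier and is stored inverted: need_resched == 0 means a reschedule is wanted. That way the whole 64-bit word reads as zero exactly when the count is zero and a reschedule is pending, letting the preempt_enable() path test both conditions with one comparison. Abridged from arch/arm64/include/asm/preempt.h:

//arch/arm64/include/asm/preempt.h (abridged)
#define PREEMPT_NEED_RESCHED	BIT(32)
#define PREEMPT_ENABLED		(PREEMPT_NEED_RESCHED)

static inline void set_preempt_need_resched(void)
{
	/* inverted: 0 in the upper word means "resched needed" */
	current_thread_info()->preempt.need_resched = 0;
}

static inline void clear_preempt_need_resched(void)
{
	current_thread_info()->preempt.need_resched = 1;
}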
III. The various contexts of a task
1. The various contexts, as defined in include/linux/preempt.h:
/*
 * Are we doing bottom half or hardware interrupt processing?
 *
 * in_irq()             - We're in (hard) IRQ context
 * in_softirq()         - We have BH disabled, or are processing softirqs
 * in_interrupt()       - We're in NMI,IRQ,SoftIRQ context or have BH disabled
 * in_serving_softirq() - We're in softirq context
 * in_nmi()             - We're in NMI context
 * in_task()            - We're in task context
 *
 * Note: due to the BH disabled confusion: in_softirq(),in_interrupt() really
 *       should not be used in new code.
 */
#define in_irq()		(hardirq_count())	//preempt_count() & HARDIRQ_MASK
#define in_softirq()		(softirq_count())	//preempt_count() & SOFTIRQ_MASK
#define in_interrupt()		(irq_count())		//preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK | NMI_MASK)
#define in_serving_softirq()	(softirq_count() & SOFTIRQ_OFFSET)	//uses only the lowest softirq bit (bit 8)
#define in_nmi()		(preempt_count() & NMI_MASK)
#define in_task()		(!(preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
2. irq context really means hard-irq context, i.e. an interrupt handler (top half) is currently executing. Whenever the hardirq count inside preempt_count is greater than 0 we are in IRQ context (a value of 1 means a single handler with no nesting; greater than 1 means nested interrupts).
3. softirq context is less direct. One would naturally say we are in softirq context when a softirq handler is executing, and that is certainly true: a running softirq handler has incremented the "software interrupt count", so that is softirq context. However, in other contexts, e.g. process context, code may call local_bh_disable() for synchronization; the region protected by local_bh_disable()/local_bh_enable() then also executes in softirq context, even though no softirq handler is actually running. If you really want to know whether a softirq handler is currently executing, use in_serving_softirq(), which tests bit 8 of preempt_count.
4. "Interrupt context" in the broad sense, as reported by in_interrupt(), is IRQ context + softirq context + NMI context.
5. Process context is reported by in_task(). A region protected by local_bh_disable()/local_bh_enable() still counts as process context, which seems to conflict with in_softirq(): such a region is in softirq context and process context at the same time. A small usage sketch follows.
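A hedged example of how such checks are commonly used: picking allocation flags that are safe for the current context. The helper is hypothetical, and in_task() is used rather than in_interrupt(), since the header comment above discourages the latter in new code:

#include <linux/preempt.h>
#include <linux/slab.h>

/* Hypothetical helper: GFP_KERNEL may sleep, so only use it in task context. */
static void *ctx_alloc(size_t size)
{
	gfp_t gfp = in_task() ? GFP_KERNEL : GFP_ATOMIC;

	return kmalloc(size, gfp);
}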
IV. Summary
1. A glance at preempt_count() tells the kernel the current situation: a non-zero value means the current thread cannot be preempted by the scheduler, because either preemption has been explicitly disabled or the CPU is currently servicing some kind of interrupt. By the same reasoning, a non-zero value also means the current thread must not sleep (to be confirmed!).
2. preempt_disable() only applies while a thread is running in the kernel; user-space code can always be preempted.
3. One question: if the kernel is configured as non-preemptible, kernel code can never be preempted, so there is no need to track preempt_disable() at all; preemption is permanently off, and maintaining that information would be wasted effort. The preempt-disable bits of preempt_count then always stay 0, and preemptible() always returns false. On such a kernel there are situations, e.g. while a spin_lock is held (which genuinely is atomic context), where in_atomic() nevertheless returns false because preempt_count() == 0, as illustrated below.
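A small illustration of that blind spot (hypothetical code; on a !CONFIG_PREEMPT_COUNT kernel spin_lock() leaves preempt_count untouched):

#include <linux/spinlock.h>
#include <linux/printk.h>

static DEFINE_SPINLOCK(demo_lock);

static void demo_in_atomic_blind_spot(void)
{
	spin_lock(&demo_lock);		/* genuinely atomic context */
	if (!in_atomic())		/* but preempt_count() may still be 0 */
		pr_info("in_atomic() missed the held spinlock\n");
	spin_unlock(&demo_lock);
}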
4. might_sleep(): declares that the current function may sleep. If the function is executing in atomic context, it prints a stack backtrace. It is mainly a debugging aid: add it wherever you are not sure that a path which must not sleep really never sleeps. In release kernels, CONFIG_DEBUG_ATOMIC_SLEEP is usually not enabled and might_sleep() becomes an empty function.
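Typical usage (the function and completion are hypothetical):

#include <linux/kernel.h>
#include <linux/completion.h>

/* Hypothetical helper: documents that callers must be able to sleep. */
static int demo_wait_for_hw(struct completion *done)
{
	might_sleep();	/* with CONFIG_DEBUG_ATOMIC_SLEEP: splat if atomic */
	wait_for_completion(done);
	return 0;
}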
5. Why code protected by spin_lock()/spin_unlock() must not sleep
//expanded view of spin_lock(): preemption is disabled before taking the lock
static __always_inline void spin_lock(spinlock_t *lock)
{
	do {
		preempt_disable();
		___LOCK(&lock->rlock);
	} while (0);
}
When spin_lock() cannot obtain the lock, it spins and waits; and it disables preemption before even attempting to take the lock. Suppose task A goes to sleep after acquiring the lock; the scheduler then runs some other task B on this CPU. If B also tries to take the same lock, it fails and starts spinning, and since preemption is disabled at that point, the scheduler can never move this CPU off task B. A therefore never runs again to release the lock, and the CPU deadlocks inside task B.
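A hypothetical buggy fragment showing the pattern:

#include <linux/spinlock.h>
#include <linux/delay.h>

static DEFINE_SPINLOCK(dev_lock);

static void buggy_sleep_under_spinlock(void)
{
	spin_lock(&dev_lock);	/* preemption now disabled */
	msleep(10);		/* BUG: sleeps while holding the lock; a task
				 * that then contends dev_lock on this CPU
				 * spins with preemption off -> deadlock */
	spin_unlock(&dev_lock);
}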