Based on Arm64 linux-5.10
I. Main implementation files
1. Location of preempt_count
//arch/arm64/include/asm/preempt.h
static inline int preempt_count(void)
{
	return READ_ONCE(current_thread_info()->preempt.count);
	//i.e. ((struct thread_info *)current)->preempt.count
}
Note: this is a per-task field (arm64 keeps preempt_count in thread_info, which is embedded in task_struct; x86, by contrast, keeps it in a per-CPU variable).
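For reference, this is (abridged) how the field is laid out in arch/arm64/include/asm/thread_info.h in 5.10: a 64-bit union whose low half is the count discussed below and whose high half carries the need_resched flag.

//arch/arm64/include/asm/thread_info.h (abridged)
struct thread_info {
	unsigned long		flags;		/* low level flags */
	...
	union {
		u64		preempt_count;	/* 0 => preemptible */
		struct {
#ifdef CONFIG_CPU_BIG_ENDIAN
			u32	need_resched;
			u32	count;
#else
			u32	count;		/* what preempt_count() reads */
			u32	need_resched;
#endif
		} preempt;
	};
	...
};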
2. Related implementation
include/linux/preempt.h, keeping only the parts relevant when preemption is enabled:
//include/linux/preempt.h (trimmed to the parts relevant on a preemptible kernel)
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_PREEMPT_H
#define __LINUX_PREEMPT_H

/*
 * include/linux/preempt.h - macros for accessing and manipulating
 * preempt_count (used for kernel preemption, interrupt count, etc.)
 */

#include <linux/linkage.h>
#include <linux/list.h>

/*
 * We put the hardirq and softirq counter into the preemption
 * counter. The bitmask has the following meaning:
 *
 * - bits 0-7 are the preemption count (max preemption depth: 256)
 * - bits 8-15 are the softirq count (max # of softirqs: 256)
 *
 * The hardirq count could in theory be the same as the number of
 * interrupts in the system, but we run all interrupt handlers with
 * interrupts disabled, so we cannot have nesting interrupts. Though
 * there are a few palaeontologic drivers which reenable interrupts in
 * the handler, so we need more than one bit here.
 *
 *         PREEMPT_MASK:	0x000000ff
 *         SOFTIRQ_MASK:	0x0000ff00
 *         HARDIRQ_MASK:	0x000f0000
 *             NMI_MASK:	0x00f00000
 * PREEMPT_NEED_RESCHED:	0x80000000 //bit31, the top bit of an int; but arm64's <asm/preempt.h> defines it as BIT(32)!!
 */
#define PREEMPT_BITS	8
#define SOFTIRQ_BITS	8
#define HARDIRQ_BITS	4
#define NMI_BITS	4

#define PREEMPT_SHIFT	0
#define SOFTIRQ_SHIFT	(PREEMPT_SHIFT + PREEMPT_BITS)	//8
#define HARDIRQ_SHIFT	(SOFTIRQ_SHIFT + SOFTIRQ_BITS)	//16
#define NMI_SHIFT	(HARDIRQ_SHIFT + HARDIRQ_BITS)	//20

#define __IRQ_MASK(x)	((1UL << (x))-1)

#define PREEMPT_MASK	(__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT)	//bit0-bit7   0x000000ff
#define SOFTIRQ_MASK	(__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)	//bit8-bit15  0x0000ff00
#define HARDIRQ_MASK	(__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)	//bit16-bit19 0x000f0000
#define NMI_MASK	(__IRQ_MASK(NMI_BITS) << NMI_SHIFT)		//bit20-bit23 0x00f00000

#define PREEMPT_OFFSET	(1UL << PREEMPT_SHIFT)	//1<<0
#define SOFTIRQ_OFFSET	(1UL << SOFTIRQ_SHIFT)	//1<<8
#define HARDIRQ_OFFSET	(1UL << HARDIRQ_SHIFT)	//1<<16
#define NMI_OFFSET	(1UL << NMI_SHIFT)	//1<<20

#define SOFTIRQ_DISABLE_OFFSET	(2 * SOFTIRQ_OFFSET)	//2*(1<<8) == 1<<9

#define PREEMPT_DISABLED	(PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)	//on arm64: 1 + (1<<32)

/*
 * Disable preemption until the scheduler is running -- use an unconditional
 * value so that it also works on !PREEMPT_COUNT kernels.
 *
 * Reset by start_kernel()->sched_init()->init_idle()->init_idle_preempt_count().
 */
#define INIT_PREEMPT_COUNT	PREEMPT_OFFSET	//1

/*
 * Initial preempt_count value; reflects the preempt_count schedule invariant
 * which states that during context switches:
 *
 *    preempt_count() == 2*PREEMPT_DISABLE_OFFSET
 *
 * Note: PREEMPT_DISABLE_OFFSET is 0 for !PREEMPT_COUNT kernels.
 * Note: See finish_task_switch().
 */
#define FORK_PREEMPT_COUNT	(2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)	//on arm64: 2 + (1<<32)

/* preempt_count() and related functions, depends on PREEMPT_NEED_RESCHED */
#include <asm/preempt.h>

#define hardirq_count()	(preempt_count() & HARDIRQ_MASK)
#define softirq_count()	(preempt_count() & SOFTIRQ_MASK)
#define irq_count()	(preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK | NMI_MASK))

/*
 * Are we doing bottom half or hardware interrupt processing?
 *
 * in_irq()             - We're in (hard) IRQ context
 * in_softirq()         - We have BH disabled, or are processing softirqs
 * in_interrupt()       - We're in NMI,IRQ,SoftIRQ context or have BH disabled
 * in_serving_softirq() - We're in softirq context
 * in_nmi()             - We're in NMI context
 * in_task()            - We're in task context
 *
 * Note: due to the BH disabled confusion: in_softirq(),in_interrupt() really
 *       should not be used in new code.
 */
//the various contexts
#define in_irq()		(hardirq_count())
#define in_softirq()		(softirq_count())
#define in_interrupt()		(irq_count())
#define in_serving_softirq()	(softirq_count() & SOFTIRQ_OFFSET)
#define in_nmi()		(preempt_count() & NMI_MASK)
#define in_task()		(!(preempt_count() & \
				   (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))

/*
 * The preempt_count offset after preempt_disable();
 */
#define PREEMPT_DISABLE_OFFSET	PREEMPT_OFFSET	//1

/*
 * The preempt_count offset after spin_lock()
 */
#define PREEMPT_LOCK_OFFSET	PREEMPT_DISABLE_OFFSET

/*
 * The preempt_count offset needed for things like:
 *
 *  spin_lock_bh()
 *
 * Which need to disable both preemption (CONFIG_PREEMPT_COUNT) and
 * softirqs, such that unlock sequences of:
 *
 *  spin_unlock();
 *  local_bh_enable();
 *
 * Work as expected.
 */
#define SOFTIRQ_LOCK_OFFSET	(SOFTIRQ_DISABLE_OFFSET + PREEMPT_LOCK_OFFSET)

/*
 * Are we running in atomic context? WARNING: this macro cannot
 * always detect atomic context; in particular, it cannot know about
 * held spinlocks in non-preemptible kernels. Thus it should not be
 * used in the general case to determine whether sleeping is possible.
 * Do not use in_atomic() in driver code.
 */
//test for atomic context
#define in_atomic()	(preempt_count() != 0)

/*
 * Check whether we were atomic before we did preempt_disable():
 * (used by the scheduler)
 */
#define in_atomic_preempt_off() (preempt_count() != PREEMPT_DISABLE_OFFSET)

#define preempt_count_add(val)		__preempt_count_add(val)
#define preempt_count_sub(val)		__preempt_count_sub(val)
#define preempt_count_dec_and_test()	__preempt_count_dec_and_test()

#define __preempt_count_inc()	__preempt_count_add(1)
#define __preempt_count_dec()	__preempt_count_sub(1)

#define preempt_count_inc()	preempt_count_add(1)
#define preempt_count_dec()	preempt_count_sub(1)

#define preempt_disable() \
do { \
	preempt_count_inc(); \
	barrier(); \
} while (0)

#define sched_preempt_enable_no_resched() \
do { \
	barrier(); \
	preempt_count_dec(); \
} while (0)

#define preempt_enable_no_resched() sched_preempt_enable_no_resched()

#define preemptible()	(preempt_count() == 0 && !irqs_disabled())

#define preempt_enable() \
do { \
	barrier(); \
	if (unlikely(preempt_count_dec_and_test())) \
		__preempt_schedule(); \
} while (0)

#define preempt_enable_notrace() \
do { \
	barrier(); \
	if (unlikely(__preempt_count_dec_and_test())) \
		__preempt_schedule_notrace(); \
} while (0)

#define preempt_check_resched() \
do { \
	if (should_resched(0)) \
		__preempt_schedule(); \
} while (0)

#define preempt_disable_notrace() \
do { \
	__preempt_count_inc(); \
	barrier(); \
} while (0)

#define preempt_enable_no_resched_notrace() \
do { \
	barrier(); \
	__preempt_count_dec(); \
} while (0)

#define preempt_set_need_resched() \
do { \
	set_preempt_need_resched(); \
} while (0)

#define preempt_fold_need_resched() \
do { \
	if (tif_need_resched()) \
		set_preempt_need_resched(); \
} while (0)

struct preempt_notifier;

/**
 * preempt_ops - notifiers called when a task is preempted and rescheduled
 * @sched_in: we're about to be rescheduled:
 *    notifier: struct preempt_notifier for the task being scheduled
 *    cpu:  cpu we're scheduled on
 * @sched_out: we've just been preempted
 *    notifier: struct preempt_notifier for the task being preempted
 *    next: the task that's kicking us out
 *
 * Please note that sched_in and out are called under different
 * contexts.  sched_out is called with rq lock held and irq disabled
 * while sched_in is called without rq lock and irq enabled.  This
 * difference is intentional and depended upon by its users.
 */
struct preempt_ops {
	void (*sched_in)(struct preempt_notifier *notifier, int cpu);
	void (*sched_out)(struct preempt_notifier *notifier,
			  struct task_struct *next);
};

/**
 * preempt_notifier - key for installing preemption notifiers
 * @link: internal use
 * @ops: defines the notifier functions to be called
 *
 * Usually used in conjunction with container_of().
 */
struct preempt_notifier {
	struct hlist_node link;
	struct preempt_ops *ops;
};

void preempt_notifier_inc(void);
void preempt_notifier_dec(void);

/*
 * tell me when current is being preempted & rescheduled.
 * Implemented in sched/core.c; registered from kvm/kvm_main.c.
 */
void preempt_notifier_register(struct preempt_notifier *notifier);

/*
 * no longer interested in preemption notifications.
 * Implemented in sched/core.c; unregistered from kvm/kvm_main.c.
 */
void preempt_notifier_unregister(struct preempt_notifier *notifier);

static inline void preempt_notifier_init(struct preempt_notifier *notifier,
				     struct preempt_ops *ops)
{
	INIT_HLIST_NODE(&notifier->link);
	notifier->ops = ops;
}

/**
 * migrate_disable - Prevent migration of the current task
 *
 * Maps to preempt_disable() which also disables preemption. Use
 * migrate_disable() to annotate that the intent is to prevent migration,
 * but not necessarily preemption.
 *
 * Can be invoked nested like preempt_disable() and needs the corresponding
 * number of migrate_enable() invocations.
 */
static __always_inline void migrate_disable(void)
{
	preempt_disable();
}

/**
 * migrate_enable - Allow migration of the current task
 *
 * Counterpart to migrate_disable().
 *
 * As migrate_disable() can be invoked nested, only the outermost invocation
 * reenables migration.
 *
 * Currently mapped to preempt_enable().
 */
static __always_inline void migrate_enable(void)
{
	preempt_enable();
}

#endif /* __LINUX_PREEMPT_H */
II. The bit fields of preempt_count
1. This per-task counter indicates the current thread's state: whether it can be preempted, and whether it is allowed to sleep.
The preempt_count member is used to judge whether the current process may be preempted. If preempt_count is non-zero (perhaps code has explicitly disabled preemption via preempt_disable(), or we are in interrupt context, etc.), preemption is not possible right now. If preempt_count is zero, the preconditions for preemption are met; whether the current process actually gets preempted then also depends on whether the _TIF_NEED_RESCHED flag is set in the flags member of its thread_info (set, for example, because the process exhausted its time slice, or because an interrupt woke a higher-priority process).
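As a minimal sketch (not actual kernel code; the real check is folded into __preempt_count_dec_and_test() and the reschedule paths), the two conditions combine roughly like this; the function name is made up:

#include <linux/preempt.h>
#include <linux/thread_info.h>

/* Illustrative only: preemption needs both a zero count and a pending request. */
static bool want_to_preempt_current(void)
{
	return preempt_count() == 0 &&			/* preemption is legal */
	       test_thread_flag(TIF_NEED_RESCHED);	/* and was requested   */
}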
2. The preemption-disable count field
This field occupies 8 bits and records how many times the current process has explicitly disabled preemption, i.e. the nesting depth: each call to preempt_disable() increments it by 1 and each call to preempt_enable() decrements it by 1. preempt_disable() and preempt_enable() must come in pairs; they may nest, up to a maximum depth of 255, since only 8 bits are available.
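A minimal sketch of such nesting (illustrative; the function is hypothetical):

#include <linux/preempt.h>

static void nesting_example(void)
{
	preempt_disable();	/* preemption count: 0 -> 1 */
	preempt_disable();	/* 1 -> 2, nested section */
	/* ... work that must not be preempted or migrated ... */
	preempt_enable();	/* 2 -> 1, still non-preemptible */
	preempt_enable();	/* 1 -> 0, a preemption point: may reschedule */
}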
3. Software interrupt count
This field occupies 8 bits and records the softirq nesting state of the currently running process. It is touched in two scenarios:
(1) The field is incremented by 1 before entering a softirq handler and decremented by 1 after leaving it. Softirq handlers never run concurrently on a given CPU; they always execute serially, so this scenario needs just one bit, namely bit 8. That bit tells us whether the current task is in softirq context.
(2) For kernel synchronization, process context sometimes needs softirqs disabled. For this the kernel provides the interface functions local_bh_disable() and local_bh_enable(). The concept is similar to preempt_disable()/preempt_enable(); this use occupies bits 9-15 and supports up to 127 levels of nesting.
Note: local_bh_disable() adds SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET, i.e. 512) to preempt_count, and local_bh_enable() subtracts the same amount. Using an offset of 2 in the softirq field leaves bit 8 meaning "currently serving a softirq", so "BH disabled" (bits 9-15) and "inside a softirq handler" (bit 8) can be told apart.
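The relevant 5.10 code, abridged from include/linux/bottom_half.h (the !CONFIG_TRACE_IRQFLAGS variant):

//include/linux/bottom_half.h (abridged)
static __always_inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
{
	preempt_count_add(cnt);
	barrier();
}

static inline void local_bh_disable(void)
{
	/* SOFTIRQ_DISABLE_OFFSET == 2 * SOFTIRQ_OFFSET == 512 (bit 9) */
	__local_bh_disable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
}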
4. The hardware interrupt count field
This field occupies 4 bits and records how many levels of hard interrupts have nested on top of the currently running process, i.e. the nesting depth of interrupt handlers. For the ARM64 kernel-5.10, the interrupt entry code looks like this:
//kernel/irq/irqdesc.c
int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq,
			bool lookup, struct pt_regs *regs)
{
	...
	struct irq_desc *desc;

	irq_enter();
	...
	generic_handle_irq_desc(desc);	//desc->handle_irq(desc)
	...
	irq_exit();
	...
}
The generic IRQ handler is bracketed by irq_enter() and irq_exit(): irq_enter() marks entry into IRQ context, and irq_exit() marks leaving it. irq_enter() calls preempt_count_add(HARDIRQ_OFFSET), adding 1 to the "hardware interrupt count" bit field, and irq_exit() calls preempt_count_sub(HARDIRQ_OFFSET) to subtract 1 again. With only 4 bits, hardware interrupt handlers can nest at most 15 deep.
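Following the call chain down, irq_enter() eventually reaches the __irq_enter() macro, abridged below from include/linux/hardirq.h; the irq_exit() path symmetrically ends up in preempt_count_sub(HARDIRQ_OFFSET) and may then run pending softirqs:

//include/linux/hardirq.h (abridged)
#define __irq_enter()					\
	do {						\
		account_irq_enter_time(current);	\
		preempt_count_add(HARDIRQ_OFFSET);	\
		lockdep_hardirq_enter();		\
	} while (0)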
In older kernels this field was 12 bits wide, supporting 4096 levels of nesting. Those kernels also distinguished fast and slow interrupt handlers, and in theory the maximum handler nesting depth equals the number of IRQs in the system. In practice the depth can never get that large (the kernel stack could not take it), so even on systems with very many interrupts they will not all nest to the theoretical limit. With that in mind, the field was later reduced to 10 bits (in the generic arch code; each arch may redefine its own hardirq count width). Once the kernel developers decided to drop slow interrupt handlers, interrupt nesting effectively stopped happening, so in theory hardirq count is now either 0 or 1. Theory aside, if some quirky or ancient driver re-enables interrupts in its handler, nesting can still occur, but not deeply; the current 4 bits are plenty to cope with 15 such drivers.
5. The reschedule-needed field
The top bit, bit 31 in the generic layout (on arm64 it actually sits at BIT(32); see the abridged code below), is the "reschedule needed" bit. It tells the kernel that a higher-priority process should be given the CPU at the first opportunity. This bit is only set while preempt_count is non-zero; otherwise the kernel could simply preempt the task right away, with no need to set the bit and wait.
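On arm64 the flag lives in the upper half of the 64-bit field shown earlier and is stored inverted: need_resched == 0 means a reschedule is wanted. That way the whole 64-bit word reads as zero exactly when the count is zero and a reschedule is pending, letting the preempt_enable() path test both conditions with one comparison. Abridged from arch/arm64/include/asm/preempt.h:

//arch/arm64/include/asm/preempt.h (abridged)
#define PREEMPT_NEED_RESCHED	BIT(32)
#define PREEMPT_ENABLED		(PREEMPT_NEED_RESCHED)

static inline void set_preempt_need_resched(void)
{
	/* inverted: 0 in the upper word means "resched needed" */
	current_thread_info()->preempt.need_resched = 0;
}

static inline void clear_preempt_need_resched(void)
{
	current_thread_info()->preempt.need_resched = 1;
}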
III. The various contexts of a task
1. The various contexts, as defined in include/linux/preempt.h:
/*
 * Are we doing bottom half or hardware interrupt processing?
 *
 * in_irq()             - We're in (hard) IRQ context
 * in_softirq()         - We have BH disabled, or are processing softirqs
 * in_interrupt()       - We're in NMI,IRQ,SoftIRQ context or have BH disabled
 * in_serving_softirq() - We're in softirq context
 * in_nmi()             - We're in NMI context
 * in_task()            - We're in task context
 *
 * Note: due to the BH disabled confusion: in_softirq(),in_interrupt() really
 *       should not be used in new code.
 */
#define in_irq()		(hardirq_count())	//preempt_count() & HARDIRQ_MASK
#define in_softirq()		(softirq_count())	//preempt_count() & SOFTIRQ_MASK
#define in_interrupt()		(irq_count())		//preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK | NMI_MASK)
#define in_serving_softirq()	(softirq_count() & SOFTIRQ_OFFSET)	//uses only the lowest softirq bit (bit 8)
#define in_nmi()		(preempt_count() & NMI_MASK)
#define in_task()		(!(preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
2. irq context really means hard-irq context, i.e. an interrupt handler (top half) is currently executing. Whenever the hardirq count inside preempt_count is greater than 0 we are in IRQ context (a value of 1 means a single handler with no nesting; greater than 1 means nested interrupts).
3. softirq context is less direct. One would naturally say we are in softirq context when a softirq handler is executing, and that is certainly true: a running softirq handler has incremented the "software interrupt count", so that is softirq context. However, in other contexts, e.g. process context, code may call local_bh_disable() for synchronization; the region protected by local_bh_disable()/local_bh_enable() then also executes in softirq context, even though no softirq handler is actually running. If you really want to know whether a softirq handler is currently executing, use in_serving_softirq(), which tests bit 8 of preempt_count.
4. "Interrupt context" in the broad sense, as reported by in_interrupt(), is IRQ context + softirq context + NMI context.
5. Process context is reported by in_task(). A region protected by local_bh_disable()/local_bh_enable() still counts as process context, which seems to conflict with in_softirq(): such a region is in softirq context and process context at the same time. A small usage sketch follows.
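A hedged example of how such checks are commonly used: picking allocation flags that are safe for the current context. The helper is hypothetical, and in_task() is used rather than in_interrupt(), since the header comment above discourages the latter in new code:

#include <linux/preempt.h>
#include <linux/slab.h>

/* Hypothetical helper: GFP_KERNEL may sleep, so only use it in task context. */
static void *ctx_alloc(size_t size)
{
	gfp_t gfp = in_task() ? GFP_KERNEL : GFP_ATOMIC;

	return kmalloc(size, gfp);
}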
IV. Summary
1. A glance at preempt_count() tells the kernel the current situation: a non-zero value means the current thread cannot be preempted by the scheduler, because either preemption has been explicitly disabled or the CPU is currently servicing some kind of interrupt. By the same reasoning, a non-zero value also means the current thread must not sleep (to be confirmed!).
2. preempt_disable() only applies while a thread is running in the kernel; user-space code can always be preempted.
3. One question: if the kernel is configured as non-preemptible, kernel code can never be preempted, so there is no need to track preempt_disable() at all; preemption is permanently off, and maintaining that information would be wasted effort. The preempt-disable bits of preempt_count then always stay 0, and preemptible() always returns false. On such a kernel there are situations, e.g. while a spin_lock is held (which genuinely is atomic context), where in_atomic() nevertheless returns false because preempt_count() == 0, as illustrated below.
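A small illustration of that blind spot (hypothetical code; on a !CONFIG_PREEMPT_COUNT kernel spin_lock() leaves preempt_count untouched):

#include <linux/spinlock.h>
#include <linux/printk.h>

static DEFINE_SPINLOCK(demo_lock);

static void demo_in_atomic_blind_spot(void)
{
	spin_lock(&demo_lock);		/* genuinely atomic context */
	if (!in_atomic())		/* but preempt_count() may still be 0 */
		pr_info("in_atomic() missed the held spinlock\n");
	spin_unlock(&demo_lock);
}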
4. might_sleep(): declares that the current function may sleep. If the function is executing in atomic context, it prints a stack backtrace. It is mainly a debugging aid: add it wherever you are not sure that a path which must not sleep really never sleeps. In release kernels, CONFIG_DEBUG_ATOMIC_SLEEP is usually not enabled and might_sleep() becomes an empty function.
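Typical usage (the function and completion are hypothetical):

#include <linux/kernel.h>
#include <linux/completion.h>

/* Hypothetical helper: documents that callers must be able to sleep. */
static int demo_wait_for_hw(struct completion *done)
{
	might_sleep();	/* with CONFIG_DEBUG_ATOMIC_SLEEP: splat if atomic */
	wait_for_completion(done);
	return 0;
}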
5. Why code protected by spin_lock()/spin_unlock() must not sleep
//expanded view of spin_lock(): preemption is disabled before taking the lock
static __always_inline void spin_lock(spinlock_t *lock)
{
	do {
		preempt_disable();
		___LOCK(&lock->rlock);
	} while (0);
}
When spin_lock() cannot obtain the lock, it spins and waits; and it disables preemption before even attempting to take the lock. Suppose task A goes to sleep after acquiring the lock; the scheduler then runs some other task B on this CPU. If B also tries to take the same lock, it fails and starts spinning, and since preemption is disabled at that point, the scheduler can never move this CPU off task B. A therefore never runs again to release the lock, and the CPU deadlocks inside task B.
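A hypothetical buggy fragment showing the pattern:

#include <linux/spinlock.h>
#include <linux/delay.h>

static DEFINE_SPINLOCK(dev_lock);

static void buggy_sleep_under_spinlock(void)
{
	spin_lock(&dev_lock);	/* preemption now disabled */
	msleep(10);		/* BUG: sleeps while holding the lock; a task
				 * that then contends dev_lock on this CPU
				 * spins with preemption off -> deadlock */
	spin_unlock(&dev_lock);
}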