調度器32—RT選核


基於Linux-5.10

一、RT選核流程

1. 主要調用路徑

rt_sched_class.select_task_rq //RT調度類回調
    select_task_rq_rt //rt.c 前面trace_android_rvh_select_task_rq_rt()若是選到cpu就直接退出了; 若test或cpu算力不滿足時調用
        find_lowest_rq //rt.c
            trace_android_rvh_find_lowest_rq(task, lowest_mask, ret, &cpu);

 

二、select_task_rq_rt 函數

1. 三種選核路徑傳參

try_to_wake_up //core.c
    select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); //喚醒選核路徑

wake_up_new_task //core.c
    select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0); //fork選核路徑

sched_exec //core.c
    select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); //exec選核路徑

注:傳參cpu p->wake_cpu 就是p上次運行的cpu.

static int select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) //rt.c
{
    struct task_struct *curr;
    struct rq *rq;
    struct rq *this_cpu_rq;
    bool test;
    int target_cpu = -1;
    bool may_not_preempt;
    bool sync = !!(flags & WF_SYNC);
    int this_cpu;

    /*
     * Android vendor hook (e.g. mtk_select_task_rq_rt): a vendor module may
     * select the target CPU here; if it did, return it and skip the generic
     * selection path below entirely.
     */
    trace_android_rvh_select_task_rq_rt(p, cpu, sd_flag, flags, &target_cpu); //mtk_select_task_rq_rt
    if (target_cpu >= 0)
        return target_cpu;

    /* For anything but wake ups, just return the task_cpu */
    /* Only wakeup and new-task-fork balancing search for a CPU; the
     * SD_BALANCE_EXEC path falls through and keeps @cpu (p's previous CPU). */
    if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
        goto out;

    rq = cpu_rq(cpu); /* rq of the CPU @p last ran on */

    rcu_read_lock();
    curr = READ_ONCE(rq->curr); /* unlocked access */ /* task now running on p's previous CPU */
    this_cpu = smp_processor_id(); /* CPU we are currently executing on */
    this_cpu_rq = cpu_rq(this_cpu); /* rq of the current CPU */

    /*
     * If the current task on @p's runqueue is a softirq task,
     * it may run without preemption for a time that is
     * ill-suited for a waiting RT task. Therefore, try to
     * wake this RT task on another runqueue.
     *
     * Also, if the current task on @p's runqueue is an RT task, then
     * try to see if we can wake this RT task up on another
     * runqueue. Otherwise simply start this RT task
     * on its current runqueue.
     *
     * We want to avoid overloading runqueues. If the woken
     * task is a higher priority, then it will stay on this CPU
     * and the lower prio task should be moved to another CPU.
     * Even though this will probably make the lower prio task
     * lose its cache, we do not want to bounce a higher task
     * around just because it gave up its CPU, perhaps for a
     * lock?
     *
     * For equal prio tasks, we just let the scheduler sort it out.
     *
     * Otherwise, just let it ride on the affined RQ and the
     * post-schedule router will push the preempted task away
     *
     * This test is optimistic, if we get it wrong the load-balancer
     * will have to sort it out.
     *
     * We take into account the capacity of the CPU to ensure it fits the
     * requirement of the task - which is only important on heterogeneous
     * systems like big.LITTLE.
     */
    /* Mainly checks a few softirq classes; returning false means @curr is
     * preemptible. @curr is the task running on p's previous CPU. */
    may_not_preempt = task_may_not_preempt(curr, cpu);
    /* test is true when the task running on p's previous CPU cannot be
     * preempted right now, or is a CPU-bound RT task, or an RT task whose
     * priority is at least as high as p's (lower prio value = higher prio). */
    test = (curr && (may_not_preempt || (unlikely(rt_task(curr)) && (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio))));

    /*
     * Respect the sync flag as long as the task can run on this CPU.
     */
    /* Sync wakeup by an RT task: if the RT task currently running on this
     * CPU has lower priority than p, and this CPU is in p's affinity mask,
     * pick the current CPU. */
    if (should_honor_rt_sync(this_cpu_rq, p, sync) && cpumask_test_cpu(this_cpu, p->cpus_ptr)) {
        cpu = this_cpu;
        goto out_unlock;
    }

    /*
     * Search for another CPU only if p cannot stay on its previous CPU
     * (test) or the previous CPU's capacity no longer fits p.
     *
     * This condition is expected to be false most of the time, i.e. p can
     * usually keep running on its previous CPU when that CPU's capacity
     * fits. In other words an RT task has a de-facto affinity for the
     * last capacity-fitting CPU it ran on, and with some probability the
     * search below is skipped entirely.
     */
    if (test || !rt_task_fits_capacity(p, cpu)) {
        /* The main CPU-selection logic lives here. */
        int target = find_lowest_rq(p);

        /*
         * Bail out if we were forcing a migration to find a better
         * fitting CPU but our search failed.
         */
        /*
         * If p could have stayed on its previous CPU (!test) and the CPU
         * found above does not fit p's capacity needs either, keep the
         * previous CPU even though its capacity is also insufficient —
         * again favouring the last CPU p ran on.
         */
        if (!test && target != -1 && !rt_task_fits_capacity(p, target))
            goto out_unlock;

        /*
         * If cpu is non-preemptible, prefer remote cpu
         * even if it's running a higher-prio task.
         * Otherwise: Don't bother moving it if the destination CPU is
         * not running a lower priority task.
         */
        /*
         * Use the found CPU only when a target exists and either the
         * previous CPU is non-preemptible, or p's priority beats the
         * highest RT priority queued on the target. Otherwise leave @cpu
         * as the previous CPU.
         */
        if (target != -1 && (may_not_preempt || p->prio < cpu_rq(target)->rt.highest_prio.curr))
            cpu = target;
    }

out_unlock:
    rcu_read_unlock();

out:
    return cpu;
}

2. 函數總結:
(1) 若是沒有選到目標cpu,就返回任務p上次運行的cpu。
(2) trace_android_rvh_select_task_rq_rt 這個hook中傳遞了上層的所有參數,vendor可以在這里定制選核邏輯。
(3) 只有喚醒場景和fork新任務場景才走選核流程,exec執行場景的選核直接返回之前運行的cpu作為目標cpu。
(4) 若是被RT任務sync喚醒且當前cpu上正在運行RT任務的優先級比p低,且當前cpu在任務p的親和性中,就選當前cpu作為目標cpu。
(5) 若p不能運行在之前運行的cpu上,或p之前運行的cpu算力不滿足p的需求了,才會繼續選核,否則選p之前運行的cpu。說明RT任務對之前運行的cpu有一定的“親和性”。
(6) 主要的選核邏輯在 find_lowest_rq() 中。

 

三、find_lowest_rq 函數

1. select_task_rq_rt 傳參為待選核的任務

static int find_lowest_rq(struct task_struct *task)
{
    struct sched_domain *sd;
    /* Per-CPU static mask; still empty before this function fills it. */
    struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
    int this_cpu = smp_processor_id(); /* CPU we are currently executing on */
    int cpu      = -1;
    int ret;

    /* Make sure the mask is initialized first */
    if (unlikely(!lowest_mask))
        return -1;

    /* An RT task pinned to a single CPU has nowhere else to go. */
    if (task->nr_cpus_allowed == 1)
        return -1; /* No other targets possible */

    /*
     * If we're on asym system ensure we consider the different capacities
     * of the CPUs when searching for the lowest_mask.
     */
    if (static_branch_unlikely(&sched_asym_cpucapacity)) {
        /* Fitness filtering runs first, so lowest_mask ends up either all
         * capacity-fitting CPUs, or (after the fallback retry) all
         * non-fitting ones — in the latter case the previous CPU is the
         * likely final choice. */
        ret = cpupri_find_fitness(&task_rq(task)->rd->cpupri, task, lowest_mask, rt_task_fits_capacity);
    } else {
        ret = cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask);
    }

    /* At this point lowest_mask may or may not hold capacity-fitting CPUs.
     * Vendor hook: a vendor module can modify the candidates or directly
     * pick the target CPU. */
    trace_android_rvh_find_lowest_rq(task, lowest_mask, ret, &cpu); //HOOK
    if (cpu >= 0)
        return cpu;

    if (!ret)
        return -1; /* No targets found */

    cpu = task_cpu(task); /* the CPU this RT task last ran on */

    /*
     * At this point we have built a mask of CPUs representing the
     * lowest priority tasks in the system.  Now we want to elect
     * the best one based on our affinity and topology.
     *
     * We prioritize the last CPU that the task executed on since
     * it is most likely cache-hot in that location.
     */
    /* If the previous CPU is among the candidates, pick it to exploit
     * cache hotness. */
    if (cpumask_test_cpu(cpu, lowest_mask))
        return cpu;

    /*
     * Otherwise, we consult the sched_domains span maps to figure
     * out which CPU is logically closest to our hot cache data.
     */
    /* Disable the this_cpu shortcut if the current CPU is not a candidate. */
    if (!cpumask_test_cpu(this_cpu, lowest_mask))
        this_cpu = -1; /* Skip this_cpu opt if not among lowest */

    rcu_read_lock();
    for_each_domain(cpu, sd) { /* walks MC --> DIE */
        if (sd->flags & SD_WAKE_AFFINE) { /* both MC and DIE carry this flag */
            int best_cpu;

            /* "this_cpu" is cheaper to preempt than a remote processor.*/
            /*
             * If the current CPU is a candidate and shares this domain's
             * span with the task's previous CPU (MC spans one cluster,
             * DIE spans all CPUs), return the current CPU.
             */
            if (this_cpu != -1 && cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
                rcu_read_unlock();
                return this_cpu;
            }

            /* Otherwise take the first CPU in the intersection of the
             * candidates and this domain's span. */
            best_cpu = cpumask_first_and(lowest_mask, sched_domain_span(sd));
            if (best_cpu < nr_cpu_ids) {
                rcu_read_unlock();
                return best_cpu;
            }
        }
    }
    /* On phone SoCs one of the returns above is always taken, so the code
     * below is effectively unreachable there. */

    rcu_read_unlock();

    /*
     * And finally, if there were no matches within the domains
     * just give the caller *something* to work with from the compatible
     * locations.
     */
    /* Still nothing: fall back to the current CPU if it is a candidate. */
    if (this_cpu != -1)
        return this_cpu;

    /* Last resort: any candidate CPU at all. */
    cpu = cpumask_any(lowest_mask);
    if (cpu < nr_cpu_ids)
        return cpu;

    return -1;
}

2. 函數總結:
(1) 先調用 cpupri_find_fitness() 將候選cpu放到 lowest_mask 中。由於此函數在選不到候選cpu時會舍去 fitness_fn 回調重新選擇一次,因此 lowest_mask 中的候選cpu可能都是算力滿足待選核任務p需求的,也可能都是不滿足p需求的。
(2) trace_android_rvh_find_lowest_rq 允許vendor廠商插入hook來更改候選cpu或指定目標cpu
(3) 確定候選cpu的 lowest_mask 后,此時就有再次擇優選擇的資本了,選擇優先級為:
a. 若p之前運行的cpu在候選cpu中,那么就選之前運行的cpu,以便利用cache-hot特性(考慮一級cache)。
b. 若當前cpu在候選cpu中,且當前cpu和p之前運行的cpu位於同一cluster,則選當前cpu(考慮二級cache)。
c. 選候選cpu和sd->span交集的第一個cpu做為目標cpu,即選任務p之前運行的cluster的一個cpu(考慮二級cache)。
d. 若當前cpu在候選cpu中,則選當前cpu。
e. 選候選cpu中的第一個cpu。

 

四、cpupri_find_fitness 函數

1. find_lowest_rq調用傳參(&task_rq(task)->rd->cpupri, task, lowest_mask, rt_task_fits_capacity)

cp 是全局唯一的,p 是待選核任務,lowest_mask 是剛初始化還沒使用的,fitness_fn 是回調 rt_task_fits_capacity

int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p,
    struct cpumask *lowest_mask, bool (*fitness_fn)(struct task_struct *p, int cpu)) //cpupri.c
{
    int task_pri = convert_prio(p->prio);
    int idx, cpu;
    /* MAX_RT_PRIO == 100; per convert_prio() only a prio==0 (highest
     * priority) RT task maps above it, so drop_nopreempts is false only
     * for prio==0 tasks — TODO confirm against convert_prio(). */
    bool drop_nopreempts = task_pri <= MAX_RT_PRIO;

    /* CPUPRI_NR_PRIORITIES == 102; convert_prio() yields at most 101. */
    BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES);

#ifdef CONFIG_RT_SOFTINT_OPTIMIZATION
retry:
#endif
    /* In cpupri higher value = higher priority; idx walks cpupri levels
     * upward, corresponding to 101 - p->prio. */
    for (idx = 0; idx < task_pri; idx++) {
        /* __cpupri_find() returns 1 when it found candidate CPUs. */
        if (!__cpupri_find(cp, p, lowest_mask, idx, drop_nopreempts))
            continue;

        /* If either pointer is NULL there is nothing to filter — done. */
        if (!lowest_mask || !fitness_fn)
            return 1;

        /* Ensure the capacity of the CPUs fit the task */
        /* Strip CPUs whose capacity does not fit p from lowest_mask. */
        for_each_cpu(cpu, lowest_mask) {
            if (!fitness_fn(p, cpu))
                cpumask_clear_cpu(cpu, lowest_mask);
        }

        /*
         * If no CPU at the current priority can fit the task
         * continue looking
         */
        if (cpumask_empty(lowest_mask))
            continue;

        /* Common case: candidates found, return here. */
        return 1;
    }

    /*
     * If we can't find any non-preemptible cpu's, retry so we can
     * find the lowest priority target and avoid priority inversion.
     */
#ifdef CONFIG_RT_SOFTINT_OPTIMIZATION
    /* Unlikely to execute in practice. */
    if (drop_nopreempts) {
        drop_nopreempts = false;
        goto retry;
    }
#endif

    /*
     * If we failed to find a fitting lowest_mask, kick off a new search
     * but without taking into account any fitness criteria this time.
     *
     * This rule favours honouring priority over fitting the task in the
     * correct CPU (Capacity Awareness being the only user now).
     * The idea is that if a higher priority task can run, then it should
     * run even if this ends up being on unfitting CPU.
     *
     * The cost of this trade-off is not entirely clear and will probably
     * be good for some workloads and bad for others.
     *
     * The main idea here is that if some CPUs were overcommitted, we try
     * to spread which is what the scheduler traditionally did. Sys admins
     * must do proper RT planning to avoid overloading the system if they
     * really care.
     */
    /*
     * Still no candidates: retry once via cpupri_find(), i.e. without the
     * fitness callback, ignoring the RT task's capacity requirement —
     * __cpupri_find() will then return as soon as it finds any CPUs.
     * TODO: in this case preferring mid/big cores could be worthwhile.
     */
    if (fitness_fn)
        return cpupri_find(cp, p, lowest_mask);

    return 0;
}
EXPORT_SYMBOL_GPL(cpupri_find_fitness);


// cpupri_find_fitness傳參:(cp, p, lowest_mask)
/*
 * cpupri_find - convenience wrapper around cpupri_find_fitness() with no
 * fitness callback, i.e. candidate CPUs are accepted without any capacity
 * filtering.  Returns whatever cpupri_find_fitness() returns.
 */
int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask)
{
    /* NULL fitness_fn: skip the per-CPU capacity check entirely. */
    int found = cpupri_find_fitness(cp, p, lowest_mask, NULL);

    return found;
}


/*
 * cpupri_find_fitness 傳參:(cp, p, lowest_mask, idx, drop_nopreempts)
 * drop_nopreempts 只有 p->prio=0 的最高RT優先級才會為真.
 * 選到了cpu返回真。
 */
/*
 * __cpupri_find - collect the CPUs at cpupri level @idx that @p may run on.
 * Called by cpupri_find_fitness() as (cp, p, lowest_mask, idx, drop_nopreempts).
 * @drop_nopreempts is only true for the highest RT priority (p->prio == 0).
 * Returns 1 when candidate CPUs were found, 0 otherwise.
 */
static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p,
    struct cpumask *lowest_mask, int idx, bool drop_nopreempts)
{
    struct cpupri_vec *vec  = &cp->pri_to_cpu[idx];
    int skip = 0;

    /* Nothing queued at this priority level — skip it. */
    if (!atomic_read(&(vec)->count))
        skip = 1;

    smp_rmb();

    /* Need to do the rmb for every iteration */
    if (skip)
        return 0;

    /* No overlap between p's affinity and this level's CPUs. */
    if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids)
        return 0;

    if (lowest_mask) {
        /* Intersect twice: p's affinity with the level's CPUs, then with
         * the active CPUs. */
        cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);
        cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);

#ifdef CONFIG_RT_SOFTINT_OPTIMIZATION
        if (drop_nopreempts)
            drop_nopreempt_cpus(lowest_mask);
#endif

        /*
         * We have to ensure that we have at least one bit
         * still set in the array, since the map could have
         * been concurrently emptied between the first and
         * second reads of vec->mask.  If we hit this
         * condition, simply act as though we never hit this
         * priority level and continue on.
         */
        if (cpumask_empty(lowest_mask))
            return 0;
    }

    return 1;
}

2. 函數總結:
(1) 會先帶着過濾回調函數 fitness_fn 選一次候選cpu,若是沒有選到,就取消過濾函數回調重新選擇一次。

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM