調度器16—core_ctl

本文轉載自查看原文 2021-12-06 16:59 97 進程管理

基於MTK Linux-5.10

一、相關文件接口

1. parameters文件接口

/sys/module/mtk_core_ctl/parameters # ls -l
-rw------- 1 root   root   debug_enable //控制 core_ctl.c 中 core_ctl_debug() 的打印，TAG為"core_ctl"
-rw-rw---- 1 system system policy_enable

(1) debug_enable

默認為false, 控制 core_ctl.c 中 core_ctl_debug() 的打印，TAG為"core_ctl"

(2) policy_enable

默認為false, 從 demand_eval() 來看，若 policy_enable 文件沒有使能的話，那么 need_cpus 直接取 cluster->max_cpus，此時cpu核的isolate/unisolate只受用戶空間通過core_ctl下面的文件節點和是否boost進行設置了。

2. core_ctl文件接口

/sys/devices/system/cpu/cpu0/core_ctl # ls -l
-rw------- 1 root root core_ctl_boost
-rw------- 1 root root enable
-r-------- 1 root root global_state
-rw-rw-r-- 1 root root max_cpus
-rw-rw-r-- 1 root root min_cpus
-rw------- 1 root root not_preferred
-rw------- 1 root root offline_throttle_ms
-r-------- 1 root root ppm_state //顯示一個表
-r-------- 1 root root thermal_up_thres
-rw------- 1 root root up_thres

(1) core_ctl_boost

對應 cluster_data::boost 成員，默認是false，設置為1是對所有cluster進行boost，在 demand_eval() 中判斷，若是boost狀態的話，need_cpus 直接取 cluster->max_cpus，也就是不再執行實際的isolate動作了，若是有isolate的cpu，需要unisolate。

(2) enable

對應 cluster_data::enable 成員，默認是true，在 demand_eval() 中判斷，若是沒有enable的話，need_cpus 直接取 cluster->max_cpus，也就是不再執行實際的isolate動作了，若是有isolate的cpu，需要unisolate。

(3) global_state

打印cluster中cpu的Active cpu個數，Need cpu個數，Paused cpu個數，以及cluster內各個cpu的 online、pause、busy、prefer 狀態，見下面cat的內容。

(4) max_cpus

對應 cluster_data::max_cpus 成員，線程do_core_ctl()中在執行實際isolate/unisolate之前會先執行 apply_limits(cluster, cluster->need_cpus) 將 need_cpus 鉗制在 cluster->min_cpus 和 cluster->max_cpus 之間，也就是說默認邏輯會尊重用戶空間對cpu核數的限制，用戶空間的限制優先級最高，高於 core_ctl_tick() 執行邏輯中預估的核數。但是通過 eas_ioctl 文件設置下來的就不再尊重用戶空間對cpu的限制了。

(5) min_cpus

對應 cluster_data::min_cpus 成員，各個cluster默認取值default_min_cpus[MAX_CLUSTERS] = {4, 2, 0}。設置后立即喚醒"core_ctl_v2/X"線程執行isolate/unisolate操作。

(6) not_preferred

對應 cluster_data::not_preferred 成員，如果標記了某些CPU是not_preferred，那么在 try_to_pause() 中 isolate CPU的時候就會優先isolate這些not_preferred的CPU，若是not_preferred的CPU都已經isolated了還沒達到 active_cpus == need 這個條件，那么就繼續isolate沒有被標記為not_preferred的CPU。

二、core_ctl設置路徑

1. scheduler_tick 周期更新cpu核數需求，觸發isolate/unisolate

(1) 調用路徑

scheduler_tick //core.c
    core_ctl_tick //core_ctl.c trace_android_vh_scheduler_tick(rq) 將per_cpu的4ms的窗口轉化為全局4ms的窗口，每4ms實際調用一次
        if (enable_policy)
            core_ctl_main_algo(); //通過一定算法更新 cluster->new_need_cpus
        apply_demand //core_ctl.c 對每一個cluster都調用
            for_each_cluster(cluster, index)
                apply_demand(cluster) //core_ctl.c
                    if (demand_eval(cluster))
                        wake_up_core_ctl_thread(cluster); //喚醒per-cluster的內核線程"core_ctl_v2/X"
                            try_core_ctl //core_ctl.c per-cluster的內核線程"core_ctl_v2/X"，內核優先級為0的RT線程，平時休眠，有core control需求時喚醒它
                                do_core_ctl

(2) 相關函數

// Worker step run by the per-cluster "core_ctl_v2/X" thread: clamp the
// requested CPU count to the user-space limits, then pause or resume CPUs
// so that the number of active CPUs moves toward it. (Abridged excerpt.)
static void __ref do_core_ctl(struct cluster_data *cluster) //core_ctl.c
{
    ...
    // apply_limits() returns cluster->need_cpus clamped to the range
    // [cluster->min_cpus, cluster->max_cpus].
    need = apply_limits(cluster, cluster->need_cpus);
    // True when need < cluster->active_cpus, or when need > cluster->active_cpus
    // and cluster->nr_paused_cpus is non-zero.
    if (adjustment_possible(cluster, need)) {
        if (cluster->active_cpus > need)
            try_to_pause(cluster, need);
        else if (cluster->active_cpus < need)
            try_to_resume(cluster, need);
    }
    ...
}            
                        
try_to_pause //core_ctl.c 一直去pause，直到 cluster->active_cpus 等於參數 need，過程中實時更新 cluster->active_cpus 和 cluster->nr_paused_cpus
    sched_pause_cpu //core_pause.c pause一個cpu
        pause_cpus //kernel/cpu.c

try_to_resume //core_ctl.c 一直去resume，直到 cluster->active_cpus 等於參數 need，過程中實時更新 cluster->active_cpus 和 cluster->nr_paused_cpus
    sched_resume_cpu //core_pause.c resume一個cpu
        resume_cpus //kernel/cpu.c


// Pause (isolate) CPUs of @cluster until cluster->active_cpus equals @need.
//
// The loop runs twice: the first pass skips busy CPUs and, when the cluster
// has not_preferred CPUs, only considers those; the second pass drops both
// filters. CPUs paused through the force interface are never touched.
//
// NOTE(review): this is an abridged excerpt; the declarations of cpu, success
// and nr_paused, and any use of flags, are not shown here.
static void try_to_pause(struct cluster_data *cluster, int need)
{
    unsigned long flags;
    unsigned int num_cpus = cluster->num_cpus;
    // Non-zero when this cluster has CPUs marked not_preferred.
    bool check_not_prefer = cluster->nr_not_preferred_cpus;
    bool check_busy = true;

again:
    // Walk CPU ids from highest to lowest, looking only at this cluster's CPUs.
    for (cpu = nr_cpu_ids-1; cpu >= 0; cpu--) {
        struct cpu_data *c;

        success = false;
        if (!cpumask_test_cpu(cpu, &cluster->cpu_mask))
            continue;

        // Stop once cluster->num_cpus members of the cluster have been visited.
        if (!num_cpus--)
            break;

        c = &per_cpu(cpu_state, cpu);
        if (!is_active(c))
            continue;

        // Per the original author's note, a CPU is considered busy when its
        // utilization percentage c->cpu_util_pct is not below
        // cluster->cpu_busy_up_thres. Busy CPUs are skipped on the first pass.
        if (check_busy && c->is_busy)
            continue;

        // Skip CPUs that were force-paused through the perf_ioctl interface.
        if (c->force_paused)
            continue;

        // Done as soon as active_cpus reaches need; otherwise keep pausing.
        if (cluster->active_cpus == need)
            break;

        // Only pause not_preferred CPUs; if no CPU is marked not_preferred,
        // every CPU is eligible.
        if (check_not_prefer && !c->not_preferred)
            continue;

        // Perform the actual pause; a zero return is treated as success.
        if (!sched_pause_cpu(c->cpu)) {
            if (cpu_online(c->cpu))
                // Record that this CPU was paused by core_ctl.
                c->paused_by_cc = true;
        }
        cluster->active_cpus = get_active_cpu_count(cluster);
    }

    cluster->nr_paused_cpus += nr_paused;

    // check_busy starts out true, so a second pass always happens once.
    if (check_busy || (check_not_prefer && cluster->active_cpus != need)) {
        num_cpus = cluster->num_cpus;
        check_not_prefer = false; // relax the filters and try once more
        check_busy = false;
        goto again;
    }
}

sched_pause_cpu --> pause_cpus

// Pause the CPUs in @cpus (abridged excerpt).
// @cpus: mask of the CPUs to pause; it is narrowed in place to the CPUs that
//        are currently active.
// Returns -EBUSY when one of the checks below refuses the request, otherwise
// the result of __pause_drain_rq().
int pause_cpus(struct cpumask *cpus) //kernel/cpu.c
{
    ...
    if (cpu_hotplug_disabled) { // pausing requires that CPU hotplug is not disabled
        err = -EBUSY;
        goto err_cpu_maps_update;
    }

    // Only active CPUs can be paused.
    cpumask_and(cpus, cpus, cpu_active_mask);

    for_each_cpu(cpu, cpus) {
        // A CPU cannot be paused when it is offline, when deadline (dl) task
        // bandwidth is insufficient, or when its device has offline_disabled set.
        if (!cpu_online(cpu) || dl_cpu_busy(cpu) || get_cpu_device(cpu)->offline_disabled == true) {
            err = -EBUSY;
            goto err_cpu_maps_update;
        }
    }

    // Refuse to pause every active CPU.
    if (cpumask_weight(cpus) >= num_active_cpus()) {
        err = -EBUSY;
        goto err_cpu_maps_update;
    }

    // Mark the CPUs being paused as inactive, i.e. clear them from cpu_active_mask.
    for_each_cpu(cpu, cpus)
        set_cpu_active(cpu, false); // a paused CPU no longer appears in cpu_active_mask
    
    // Carry out the pause.
    err = __pause_drain_rq(cpus);

    trace_cpuhp_pause(cpus, start_time, 1);

    return err;
}

2. perf_ioctl 中強制core_ctl接口

/proc/perfmgr/eas_ioctl 這里會強制進行core_ctl

// ioctl handler behind /proc/perfmgr/eas_ioctl (abridged excerpt). Only the
// command that force-pauses or force-resumes a CPU is shown.
static long eas_ioctl_impl(struct file *filp, unsigned int cmd, unsigned long arg, void *pKM) //perf_ioctl.c
{
    struct _CORE_CTL_PACKAGE msgKM = {0};
    ...
    switch (cmd) {
    case CORE_CTL_FORCE_PAUSE_CPU: // forced core isolation
        // Copy the request from user space; a failed copy returns -1.
        if (perfctl_copy_from_user(&msgKM, ubuf, sizeof(struct _CORE_CTL_PACKAGE)))
            return -1;

        // Normalize is_pause to 0/1 before passing it on.
        bval = !!msgKM.is_pause;
        ret = core_ctl_force_pause_cpu(msgKM.cpu, bval);
        break;
    ...
    }
    ...
}

// Force-pause or force-resume a single CPU, bypassing the "core_ctl_v2/X"
// thread (abridged excerpt).
// @cpu:      CPU to act on; must be online, otherwise -EBUSY is returned.
// @is_pause: 1 pause, 0 resume
// Returns the result of sched_pause_cpu() / sched_resume_cpu().
int core_ctl_force_pause_cpu(unsigned int cpu, bool is_pause)
{
    int ret;
    struct cpu_data *c;
    struct cluster_data *cluster;
    ...

    if (!cpu_online(cpu))
        return -EBUSY;

    c = &per_cpu(cpu_state, cpu);
    cluster = c->cluster;

    // Perform the actual pause or resume.
    if (is_pause)
        ret = sched_pause_cpu(cpu);
    else
        ret = sched_resume_cpu(cpu);

    // Record whether the CPU is paused through the force interface.
    // NOTE(review): as excerpted, this is set regardless of ret.
    c->force_paused = is_pause;
    // If core_ctl itself had paused this CPU, clear that mark and drop the CPU
    // from the cluster's count of core_ctl-paused CPUs.
    if (c->paused_by_cc) {
        c->paused_by_cc = false;
        cluster->nr_paused_cpus--;
    }
    cluster->active_cpus = get_active_cpu_count(cluster);

    return ret;
}

若是通過 perf_ioctl 接口強制isolate的CPU，其 cpu_data::force_paused 會設置為1，是直接調用 sched_pause_cpu/sched_resume_cpu進行隔離和取消隔離的。在原路徑"core_ctl_v2/X"線程中isolate/unisolate執行流程中會跳過設置了 c->force_paused 標志位的CPU，也就是說force isolate的CPU必須要通過force接口unisolate!

3. 通過 max_cpus/min_cpus 文件接口設置

通過設置 /sys/devices/system/cpu/cpuX/core_ctl 下的 max_cpus、min_cpus 文件接口進行設置，

// Store a new lower bound on the cluster's CPU count, capped at the current
// max_cpus, and wake the worker thread so it is applied (abridged excerpt).
static void set_min_cpus(struct cluster_data *cluster, unsigned int val)
{
    ...
    cluster->min_cpus = min(val, cluster->max_cpus);
    ...
    // Wake the "core_ctl_v2/X" thread.
    wake_up_core_ctl_thread(cluster);
}

// Store a new upper bound on the cluster's CPU count, capped at the cluster's
// num_cpus, lower min_cpus to match if needed, and wake the worker thread
// (abridged excerpt).
static void set_max_cpus(struct cluster_data *cluster, unsigned int val) //core_ctl.c
{
    ...
    val = min(val, cluster->num_cpus);
    cluster->max_cpus = val;
    // Because min_cpus is pulled down to max_cpus here, limiting the core count
    // only requires echoing one value into the max_cpus file.
    cluster->min_cpus = min(cluster->min_cpus, cluster->max_cpus);
    ...
    // Wake the "core_ctl_v2/X" thread.
    wake_up_core_ctl_thread(cluster);
}

/sys/devices/system/cpu/cpu0/core_ctl # cat min_cpus
4
/sys/devices/system/cpu/cpu0/core_ctl # echo 1 > max_cpus
/sys/devices/system/cpu/cpu0/core_ctl # cat max_cpus
1
/sys/devices/system/cpu/cpu0/core_ctl # cat min_cpus
1

總結：core_ctl_tick 和 max_cpus/min_cpus 設置路徑都是通過喚醒優先級為0的RT線程"core_ctl_v2/X"來執行核隔離和取消隔離的，只不過前者更新核需求 new_need_cpus 參數，后者是增加核數限制。force路徑是直接調用pause接口進行隔離和取消隔離，而且其操作過的cpu不受"core_ctl_v2/X"線程的影響。resume_cpus 是相反操作。

三、調試log

1. 相關trace

(1) trace_core_ctl_demand_eval

//調用傳參：
demand_eval
    trace_core_ctl_demand_eval(cluster->cluster_id, old_need, new_need, cluster->active_cpus,
        cluster->min_cpus, cluster->max_cpus, cluster->boost, cluster->enable, ret && need_flag);

//trace打印：
        <idle>-0       [006] d.h3  2007.792026: core_ctl_demand_eval: cid=0, old=2, new=4, act=2 min=0 max=4 bst=0 enbl=1 update=1
core_ctl_v2/0-463      [006] d.h3  2007.796037: core_ctl_demand_eval: cid=0, old=4, new=4, act=3 min=0 max=4 bst=0 enbl=1 update=1

打印依次為傳入的參數，只有 update=1 才會喚醒core_ctl線程，執行進一步isolate/unisolate操作。

(2) trace_core_ctl_algo_info

//調用傳參：
core_ctl_main_algo
    trace_core_ctl_algo_info(big_cpu_ts, heaviest_thres, max_util, cpumask_bits(cpu_active_mask)[0], orig_need_cpu);

//trace打印：
sh-18178   [004] d.h2 18903.565478: core_ctl_algo_info: big_cpu_ts=67692 heaviest_thres=770 max_util=786 active_cpus=f1 orig_need_cpus=4|9|6

big_cpu_ts: 是大核cpu7的溫度，67.692度
heaviest_thres: 作為判斷是否需要開啟大核的util門限，當溫度低於65度時是中核 up_thres/100 * max_capacity, 高於65度時是 thermal_up_thres/100 * max_capacity
max_util：記錄的是所有cpu上最大task的util，在每8ms執行一次的 sched_max_util_task_tracking() 中更新。
active_cpus：打印的是 cpu_active_mask，通過它可以看哪些cpu被隔離了或被設置為offline了，實測被isolate或offline都會體現到 cpu_active_mask 上。
orig_need_cpus：是個數組，依次打印各個cluster的 cluster->new_need_cpus 成員，就是評估出來的各個cluster需要的cpu核心的個數。

注：可以看到MTK的 new_need_cpus 的評估算法明顯不行，飛行熄屏場景下竟然評估出各個cluster需要 4|9|6 個核。

(3) trace_core_ctl_update_nr_over_thres

//調用傳參：
scheduler_tick //core.c
    core_ctl_tick //core_ctl.c
        core_ctl_main_algo
            get_nr_running_big_task
                trace_core_ctl_update_nr_over_thres(nr_up, nr_down, max_nr)

//trace打印：
sh-18174   [006] dNh2 23927.901480: core_ctl_update_nr_over_thres: nr_up=1|0|0 nr_down=0|5|0 max_nr=2|4|4

分別打印的是每個cluster的 cluster_data 中的 nr_up, nr_down, max_nr，在 core_ctl_main_algo() 中評估 cluster->new_need_cpus 時使用。

由 "dNh2" 可知，此函數是在硬中斷上下文中關中斷執行的，此時搶占計數為2。

2. 開啟 debug log

若是出問題了可以 echo 1 > /sys/module/mtk_core_ctl/parameters/debug_enable 打開調試log，看代碼執行流程。

3. 總結：缺失是否 force_paused 的debug log。

四、CPU online/offline流程

執行：echo 0/1 > /sys/devices/system/cpu/cpuX/online

相關函數

// Bus type for CPU devices. Its online/offline callbacks are what
// device_online()/device_offline() reach through dev->bus, and they are only
// wired up when CONFIG_HOTPLUG_CPU is enabled.
struct bus_type cpu_subsys = { //drivers/base/cpu.c
    .name = "cpu",
    .dev_name = "cpu",
    .match = cpu_subsys_match,
#ifdef CONFIG_HOTPLUG_CPU
    .online = cpu_subsys_online,
    .offline = cpu_subsys_offline,
#endif
};

// sysfs store handler for a device's "online" attribute (abridged excerpt):
// parses the written text as a boolean and brings the device online or offline.
static ssize_t online_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) //drivers/base/core.c
{
    ...
    // NOTE(review): as excerpted, the strtobool() result is overwritten on the
    // next line; presumably an error check between the two lines was elided.
    ret = strtobool(buf, &val);
    // Written 1 -> device_online(), written 0 -> device_offline().
    ret = val ? device_online(dev) : device_offline(dev); 
    ...
}

調用路徑：

device_online
    dev->bus->online(dev) //也就是 cpu_subsys.online
        cpu_device_up(dev)
            cpu_up(dev->id, CPUHP_ONLINE) 
    kobject_uevent(&dev->kobj, KOBJ_ONLINE);
    dev->offline = false;

device_offline
    dev->bus->offline(dev); //也就是 cpu_subsys.offline
        cpu_device_down(dev);
            cpu_down(dev->id, CPUHP_OFFLINE)
    kobject_uevent(&dev->kobj, KOBJ_OFFLINE);
    dev->offline = true;

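struct device 結構中只有 offline 成員，沒有 online 成員。offline 調用路徑中會做判斷，不會 offline 唯一 active 的 cpu。實測 offline cpu 會更新 cpu_active_mask，但是追蹤代碼，暫時還沒有看到哪里設置的。（補充：在 Linux-5.10 中，cpu_active_mask 是在 hotplug 狀態機 CPUHP_AP_ACTIVE 狀態的 teardown 回調 sched_cpu_deactivate() 中通過 set_cpu_active(cpu, false) 清除的，對應的 startup 回調 sched_cpu_activate() 會重新設置。）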

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 pg_ctl - 啟動，停止和重啟 PostgreSQL 服務器 CPU調度——EAS調度器 16、SGE作業調度系統的簡介 Asp.Net Core 輕松學-基於微服務的后台任務調度管理器 .net core 任務調度 Linux調度器 - deadline調度器調度器15—uclamp jmeter之調度器設置 MapReduce調度器 Kubernetes之調度器和調度過程