WALT (Window Assisted Load Tracking) Notes


Qualcomm (QCOM) platforms use WALT (Window Assisted Load Tracking) as their CPU load-tracking method; by contrast, mainline ARM kernels use PELT (Per-Entity Load Tracking).

The core idea of WALT: the CPU load within a short, fixed period of time is accounted into a window, and several such windows are then aggregated to derive a task demand. The resulting demand is used for CPU frequency scaling and for load balancing (task migration).
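As a rough illustration of the windowed accounting before diving into the code (a minimal sketch, not the kernel implementation; the 5-entry history and the max(average, most recent) policy mirror the defaults discussed later in this article for 20 ms windows):

#define HIST_SIZE 5   /* number of windows kept as history (sched_ravg_hist_size) */

struct demand_hist {
    unsigned long long hist[HIST_SIZE]; /* per-window busy time, most recent first */
};

/* Push one finished window's busy time and derive a demand the way the default
 * WINDOW_STATS_MAX_RECENT_AVG policy does: max(average, most recent window). */
static unsigned long long demand_update(struct demand_hist *d,
                                        unsigned long long window_busy)
{
    unsigned long long sum = 0, avg;
    int i;

    for (i = HIST_SIZE - 1; i > 0; i--)   /* shift history, dropping the oldest */
        d->hist[i] = d->hist[i - 1];
    d->hist[0] = window_busy;

    for (i = 0; i < HIST_SIZE; i++)
        sum += d->hist[i];
    avg = sum / HIST_SIZE;

    return avg > window_busy ? avg : window_busy;
}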

 

The main code lives in walt.c. The code below is based on kernel 4.19.

  • WALT core data structures

struct rq {
...
#ifdef CONFIG_SCHED_WALT
    struct sched_cluster    *cluster;
    struct cpumask        freq_domain_cpumask;
    struct walt_sched_stats walt_stats;
    
    u64            window_start;
    s64            cum_window_start;
    unsigned long        walt_flags;
    
    u64            cur_irqload;
    u64            avg_irqload;
    u64            irqload_ts;
    struct task_struct    *ed_task;
    struct cpu_cycle    cc;
    u64            old_busy_time, old_busy_time_group;
    u64            old_estimated_time;
    u64            curr_runnable_sum;
    u64            prev_runnable_sum;
    u64            nt_curr_runnable_sum;
    u64            nt_prev_runnable_sum;
    u64            cum_window_demand_scaled;
    struct group_cpu_time    grp_time;
    struct load_subtractions load_subs[NUM_TRACKED_WINDOWS];
    DECLARE_BITMAP_ARRAY(top_tasks_bitmap,
            NUM_TRACKED_WINDOWS, NUM_LOAD_INDICES);
    u8            *top_tasks[NUM_TRACKED_WINDOWS];
    u8            curr_table;
    int            prev_top;
    int            curr_top;
    bool            notif_pending;
    u64            last_cc_update;
    u64            cycles;
#endif /* CONFIG_SCHED_WALT */
...
}


struct task_struct {
...
#ifdef CONFIG_SCHED_WALT
    struct ravg ravg;
    /*
     * 'init_load_pct' represents the initial task load assigned to children
     * of this task
     */
    u32 init_load_pct;
    u64 last_wake_ts;
    u64 last_enqueued_ts;
    struct related_thread_group *grp;
    struct list_head grp_list;
    u64 cpu_cycles;
    bool misfit;
    u8 unfilter;
#endif
...
}


#ifdef CONFIG_SCHED_WALT
/* ravg represents frequency scaled cpu-demand of tasks */
struct ravg {
    /*
     * 'mark_start' marks the beginning of an event (task waking up, task
     * starting to execute, task being preempted) within a window
     *
     * 'sum' represents how runnable a task has been within current
     * window. It incorporates both running time and wait time and is
     * frequency scaled.
     *
     * 'sum_history' keeps track of history of 'sum' seen over previous
     * RAVG_HIST_SIZE windows. Windows where task was entirely sleeping are
     * ignored.
     *
     * 'demand' represents maximum sum seen over previous
     * sysctl_sched_ravg_hist_size windows. 'demand' could drive frequency
     * demand for tasks.
     *
     * 'curr_window_cpu' represents task's contribution to cpu busy time on
     * various CPUs in the current window
     *
     * 'prev_window_cpu' represents task's contribution to cpu busy time on
     * various CPUs in the previous window
     *
     * 'curr_window' represents the sum of all entries in curr_window_cpu
     *
     * 'prev_window' represents the sum of all entries in prev_window_cpu
     *
     * 'pred_demand' represents task's current predicted cpu busy time
     *
     * 'busy_buckets' groups historical busy time into different buckets
     * used for prediction
     *
     * 'demand_scaled' represents task's demand scaled to 1024
     */
    u64 mark_start;
    u32 sum, demand;
    u32 coloc_demand;
    u32 sum_history[RAVG_HIST_SIZE_MAX];
    u32 *curr_window_cpu, *prev_window_cpu;
    u32 curr_window, prev_window;
    u16 active_windows;
    u32 pred_demand;
    u8 busy_buckets[NUM_BUSY_BUCKETS];
    u16 demand_scaled;
    u16 pred_demand_scaled;
};
#endif 
  • Load recording

In WALT, a task's load is recorded as its demand:

static inline unsigned long task_util(struct task_struct *p)
{
#ifdef CONFIG_SCHED_WALT
    return p->ravg.demand_scaled;    /* task load */
#endif
    return READ_ONCE(p->se.avg.util_avg);
}

A CPU's load is recorded in cumulative_runnable_avg_scaled:

static inline unsigned long cpu_util(int cpu)
{
    struct cfs_rq *cfs_rq;
    unsigned int util;

#ifdef CONFIG_SCHED_WALT
    u64 walt_cpu_util =
        cpu_rq(cpu)->walt_stats.cumulative_runnable_avg_scaled;  /* CPU load */

    return min_t(unsigned long, walt_cpu_util, capacity_orig_of(cpu));
#endif

    cfs_rq = &cpu_rq(cpu)->cfs;
    util = READ_ONCE(cfs_rq->avg.util_avg);

    if (sched_feat(UTIL_EST))
        util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));

    return min_t(unsigned long, util, capacity_orig_of(cpu));
}

static inline unsigned long cpu_util_cum(int cpu, int delta)
{
    u64 util = cpu_rq(cpu)->cfs.avg.util_avg;
    unsigned long capacity = capacity_orig_of(cpu);

#ifdef CONFIG_SCHED_WALT
    util = cpu_rq(cpu)->cum_window_demand_scaled;  /* cumulative window demand; presumably the util of the currently running tasks (the author notes this is not fully confirmed) */
#endif
    delta += util;
    if (delta < 0)
        return 0;

    return (delta >= capacity) ? capacity : delta;
}

 

  • When the WALT machinery is triggered
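update_task_ravg() is driven from the scheduler's hot paths: context switches, wakeups, migrations, the periodic tick / window-rollover irq work, and IRQ-time accounting on an idle CPU. A sketch of the event identifiers it receives, based on the call sites that appear in the code below (TASK_WAKE is assumed from the usual walt.h definition; the exact enum layout in walt.h may differ):

/* Event identifiers passed to update_task_ravg(). */
enum task_event {
    PUT_PREV_TASK,   /* context switch: the previous task is switched out */
    PICK_NEXT_TASK,  /* context switch: the next task is picked to run */
    TASK_WAKE,       /* a task wakes up */
    TASK_MIGRATE,    /* a task is migrated to another CPU */
    TASK_UPDATE,     /* periodic update, e.g. from scheduler_tick()/walt_irq_work() */
    IRQ_UPDATE,      /* IRQ time accounted while the CPU was idle */
};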

 

 

 

  • Main WALT mechanisms

 

  1. Accounting of task load and CPU load

/* Reflect task activity on its demand and cpu's busy time statistics */
void update_task_ravg(struct task_struct *p, struct rq *rq, int event,
                        u64 wallclock, u64 irqtime)
{
    u64 old_window_start;

    if (!rq->window_start || sched_disable_window_stats ||
        p->ravg.mark_start == wallclock)    /* three early-return conditions: WALT has not started yet; window stats are temporarily disabled; wallclock has not advanced, so there is nothing new to account */
        return;

    lockdep_assert_held(&rq->lock);

    old_window_start = update_window_start(rq, wallclock, event);  /* advance rq->window_start if a new window has begun, and keep the old window start */

    if (!p->ravg.mark_start) {                  /* mark_start has never been set (first pass for this task): skip the accounting and go straight to done */
        update_task_cpu_cycles(p, cpu_of(rq), wallclock);  /* still refresh the CPU cycle count (later used by scale_exec_time() to derive the CPU frequency) */
        goto done;
    }

    update_task_rq_cpu_cycles(p, rq, event, wallclock, irqtime);  /* as above, refresh the CPU cycle count (used by scale_exec_time()), with extra handling for the idle task */
    update_task_demand(p, rq, event, wallclock);         /* (1.) WALT: update the task demand */
    update_cpu_busy_time(p, rq, event, wallclock, irqtime);  /* (2.) WALT: update the CPU busy time */
    update_task_pred_demand(rq, p, event);            /* (3.) WALT: update the predicted task demand */

    if (exiting_task(p))      /* do not emit trace logs for an exiting task */
        goto done;

    /* trace logs */
    trace_sched_update_task_ravg(p, rq, event, wallclock, irqtime,
                     rq->cc.cycles, rq->cc.time, &rq->grp_time);
    trace_sched_update_task_ravg_mini(p, rq, event, wallclock, irqtime,
                     rq->cc.cycles, rq->cc.time, &rq->grp_time);

done:
    p->ravg.mark_start = wallclock;           /* update mark_start: the starting point of the next WALT accounting pass */
    run_walt_irq_work(old_window_start, rq);     /* (4.) WALT: irq_work handling after a window rollover */
}

 1. Account the task's CPU demand and/or update the task's CPU demand history.

The comment block below describes how ravg.sum is accounted in three possible cases (a, b, c). In all of them the accounted time is essentially wallclock - mark_start, normalized (frequency-scaled); the irqtime path differs slightly (see the code), but the principle is the same.
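A concrete instance of case c, with numbers chosen purely for illustration: with a 20 ms window, suppose mark_start = 13 ms, window_start = 80 ms and wallclock = 87 ms. Then delta = 80 - 13 = 67 ms, nr_full_windows = 67 / 20 = 3, and window_start is temporarily rewound to ws_tmp = 80 - 3 * 20 = 20 ms. First 20 - 13 = 7 ms is added to p->ravg.sum and one history sample is recorded; then three full-window samples (of scale_exec_time(20 ms)) are recorded; finally the accounting restarts in the current window with 87 - 80 = 7 ms.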

/*
 * Account cpu demand of task and/or update task's cpu demand history
 *
 * ms = p->ravg.mark_start;
 * wc = wallclock
 * ws = rq->window_start
 *
 * Three possibilities:
 *
 *    a) Task event is contained within one window.
 *        window_start < mark_start < wallclock
 *
 *        ws   ms  wc
 *        |    |   |
 *        V    V   V
 *        |---------------|
 *
 *    In this case, p->ravg.sum is updated *iff* event is appropriate
 *    (ex: event == PUT_PREV_TASK)
 *
 *    b) Task event spans two windows.
 *        mark_start < window_start < wallclock
 *
 *        ms   ws   wc
 *        |    |    |
 *        V    V    V
 *        -----|-------------------
 *
 *    In this case, p->ravg.sum is updated with (ws - ms) *iff* event
 *    is appropriate, then a new window sample is recorded followed
 *    by p->ravg.sum being set to (wc - ws) *iff* event is appropriate.
 *
 *    c) Task event spans more than two windows.
 *
 *        ms ws_tmp               ws  wc
 *        |  |                   |   |
 *        V  V                   V   V
 *        ---|-------|-------|-------|-------|------
 *           |                   |
 *           |<------ nr_full_windows ------>|
 *
 *    In this case, p->ravg.sum is updated with (ws_tmp - ms) first *iff*
 *    event is appropriate, window sample of p->ravg.sum is recorded,
 *    'nr_full_window' samples of window_size is also recorded *iff*
 *    event is appropriate and finally p->ravg.sum is set to (wc - ws)
 *    *iff* event is appropriate.
 *
 * IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time()
 * depends on it!
 */
static u64 update_task_demand(struct task_struct *p, struct rq *rq,
                   int event, u64 wallclock)
{
    u64 mark_start = p->ravg.mark_start;
    u64 delta, window_start = rq->window_start;
    int new_window, nr_full_windows;
    u32 window_size = sched_ravg_window;
    u64 runtime;

    new_window = mark_start < window_start;      /* new_window == 1 corresponds to cases b and c */
    if (!account_busy_for_task_demand(rq, p, event)) {  /* does this event count as busy time for task demand? */
        if (new_window)
            /*
             * If the time accounted isn't being accounted as
             * busy time, and a new window started, only the
             * previous window need be closed out with the
             * pre-existing demand. Multiple windows may have
             * elapsed, but since empty windows are dropped,
             * it is not necessary to account those.      (if the time is not accounted as busy and a new window has started, many windows may have elapsed; the empty ones are dropped, so only the previous window needs to be closed out with one update_history() call)
             */
            update_history(rq, p, p->ravg.sum, 1, event);  /* (1.1) push the previous window's result, length p->ravg.sum */
        return 0;
    }

    if (!new_window) {
        /*
         * The simple case - busy time contained within the existing
         * window.
         */
        return add_to_task_demand(rq, p, wallclock - mark_start);  /* (1.2) case a: the simplest case, just account wallclock - mark_start */
    }

    /*
     * Busy time spans at least two windows. Temporarily rewind
     * window_start to first window boundary after mark_start.
     */
    delta = window_start - mark_start;                   /* busy time spans at least two windows: temporarily rewind window_start to the first window boundary after mark_start (call it ws_tmp) */
    nr_full_windows = div64_u64(delta, window_size);          
    window_start -= (u64)nr_full_windows * (u64)window_size;    

    /* Process (window_start - mark_start) first */
    runtime = add_to_task_demand(rq, p, window_start - mark_start);  /* first account ws_tmp - mark_start */

    /* Push new sample(s) into task's demand history */
    update_history(rq, p, p->ravg.sum, 1, event);             /* record a history sample of length p->ravg.sum */
    if (nr_full_windows) {                          /* any full windows in between are recorded as well: nr_full_windows samples of scaled_window, where scaled_window is window_size scaled by the CPU frequency etc. */
        u64 scaled_window = scale_exec_time(window_size, rq);

        update_history(rq, p, scaled_window, nr_full_windows, event);
        runtime += nr_full_windows * scaled_window;
    }

    /*
     * Roll window_start back to current to process any remainder
     * in current window.
     */
    window_start += (u64)nr_full_windows * (u64)window_size;      /* roll window_start forward from ws_tmp back to its real position */

    /* Process (wallclock - window_start) next */
    mark_start = window_start;
    runtime += add_to_task_demand(rq, p, wallclock - mark_start);  /* finally account wallclock - mark_start (mark_start now equals window_start) */

    return runtime;              /* runtime is the total time accounted to the task */
}

 1.1 update_history(): the window samples in the history are averaged, and depending on the policy the demand is taken as the most recent value, the maximum, the average, or the larger of the average and the most recent value (the default).

/*
 * Called when new window is starting for a task, to record cpu usage over
 * recently concluded window(s). Normally 'samples' should be 1. It can be > 1
 * when, say, a real-time task runs without preemption for several windows at a
 * stretch.
 */
static void update_history(struct rq *rq, struct task_struct *p,
             u32 runtime, int samples, int event)
{
    u32 *hist = &p->ravg.sum_history[0];
    int ridx, widx;
    u32 max = 0, avg, demand, pred_demand;
    u64 sum = 0;
    u16 demand_scaled, pred_demand_scaled;

    /* Ignore windows where task had no activity */
    if (!runtime || is_idle_task(p) || exiting_task(p) || !samples)  /* nothing to do for windows with no activity, or for idle/exiting tasks */
        goto done;

    /* Push new 'runtime' value onto stack */
    widx = sched_ravg_hist_size - 1;
    ridx = widx - samples;
    for (; ridx >= 0; --widx, --ridx) {            /* shift the hist[] array: drop the oldest samples, keep the newer ones and add the survivors to sum */
        hist[widx] = hist[ridx];
        sum += hist[widx];
        if (hist[widx] > max)
            max = hist[widx];
    }

    for (widx = 0; widx < samples && widx < sched_ravg_hist_size; widx++) {  /* fill the new sample(s) into hist[] and add them to sum */
        hist[widx] = runtime;
        sum += hist[widx];
        if (hist[widx] > max)
            max = hist[widx];
    }

    p->ravg.sum = 0;

    if (sysctl_sched_window_stats_policy == WINDOW_STATS_RECENT) {      /* pick the demand according to the policy; the default is WINDOW_STATS_MAX_RECENT_AVG (2) */
        demand = runtime;
    } else if (sysctl_sched_window_stats_policy == WINDOW_STATS_MAX) {
        demand = max;
    } else {
        avg = div64_u64(sum, sched_ravg_hist_size);          /* compute the average */
        if (sysctl_sched_window_stats_policy == WINDOW_STATS_AVG)
            demand = avg;
        else
            demand = max(avg, runtime);    /* the default policy: max(average, most recent) */
    }
    pred_demand = predict_and_update_buckets(rq, p, runtime);    /* (1.1.1) compute the predicted demand from the current data */
    demand_scaled = scale_demand(demand);                /* scale the demand to the 0..1024 range */
    pred_demand_scaled = scale_demand(pred_demand);          /* scale the predicted demand to the 0..1024 range */

    /*
     * A throttled deadline sched class task gets dequeued without
     * changing p->on_rq. Since the dequeue decrements walt stats
     * avoid decrementing it here again.
     *
     * When window is rolled over, the cumulative window demand
     * is reset to the cumulative runnable average (contribution from
     * the tasks on the runqueue). If the current task is dequeued
     * already, it's demand is not included in the cumulative runnable
     * average. So add the task demand separately to cumulative window
     * demand.
     */

    /* The comment above is about fixing up the accumulated statistics; there are
       two cases. Case 1: the task is queued on the rq. If its previous demand was
       x and the newly computed demand is y, the CPU load is adjusted:
       cumulative_runnable_avg_scaled += (y - x).
       Case 2: the task is not queued but is the task whose demand was just
       computed (rq->curr); then the window demand is adjusted directly:
       cum_window_demand_scaled += y.

       In short: a new task has its demand added to the cumulative demand, while a
       task whose demand changed only has the delta applied to the cumulative
       demand.

       Both cum_window_demand_scaled and cumulative_runnable_avg_scaled are
       expressions of CPU utilization.
    */

    if (!task_has_dl_policy(p) || !p->dl.dl_throttled) {    
        if (task_on_rq_queued(p) &&
                p->sched_class->fixup_walt_sched_stats)
            p->sched_class->fixup_walt_sched_stats(rq, p,
                    demand_scaled, pred_demand_scaled);
        else if (rq->curr == p)
            walt_fixup_cum_window_demand(rq, demand_scaled);
    }

    p->ravg.demand = demand;                        /* update the corresponding fields in the ravg struct */
    p->ravg.demand_scaled = demand_scaled;
    p->ravg.coloc_demand = div64_u64(sum, sched_ravg_hist_size);  /* the colocation demand is simply the average (the author notes it is unclear why the average is recomputed here; it is presumably kept as a more up-to-date value) */
    p->ravg.pred_demand = pred_demand;
    p->ravg.pred_demand_scaled = pred_demand_scaled;

    if (demand_scaled > sched_task_filter_util)            /* demand_scaled > 35 (0.68 ms, default for a 20 ms window scaled to 1024) */
        p->unfilter = sysctl_sched_task_unfilter_nr_windows;    /* if the demand exceeds the threshold, lift one of the conditions that restrict up-migration to a bigger CPU (it is only one of several conditions) and keep it lifted for 10 windows */
    else
        if (p->unfilter)
            p->unfilter = p->unfilter - 1;               /* otherwise decrement the counter; once it reaches 0 the task no longer satisfies this up-migration condition */

done:
    trace_sched_update_history(rq, p, runtime, samples, event);  /* trace log */
}
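For reference, the "scaling to 1024" mentioned above maps a demand measured in window time onto the 0..1024 utilization range. A minimal sketch, assuming scale_demand() divides by sched_ravg_window as in the msm walt code (the exact helper may differ):

#include <stdint.h>

/* Sketch: scale a demand measured in ns within a window onto 0..1024.
 * E.g. 10 ms of demand in a 20 ms window -> 512. */
static inline uint16_t scale_demand_sketch(uint64_t demand_ns, uint32_t window_ns)
{
    return (uint16_t)((demand_ns * 1024) / window_ns);
}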

1.1.1 Predict the demand and update the buckets

static inline u32 predict_and_update_buckets(struct rq *rq,
            struct task_struct *p, u32 runtime) {

    int bidx;
    u32 pred_demand;

    if (!sched_predl)
        return 0;

    bidx = busy_to_bucket(runtime);              /* map the runtime into a busy bucket index (1..9); the higher the index, the busier */
    pred_demand = get_pred_busy(rq, p, bidx, runtime);   /* compute the predicted demand, analysed in detail below */
    bucket_increase(p->ravg.busy_buckets, bidx);       /* update the buckets, see below */

    return pred_demand;
}
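busy_to_bucket() is not listed in this article; as a rough sketch of the bucketing just described, assuming NUM_BUSY_BUCKETS is 10 and max_task_load() returns the scaled window length as in the msm walt code (details may differ from the actual kernel source):

/* Sketch: map a normalized runtime onto one of num_buckets buckets.
 * The two lowest buckets are merged, so the returned index is 1..9. */
static inline int busy_to_bucket_sketch(unsigned int normalized_rt,
                                        unsigned int max_load,
                                        int num_buckets)
{
    int bidx = normalized_rt * num_buckets / max_load;

    if (bidx > num_buckets - 1)
        bidx = num_buckets - 1;
    if (bidx == 0)      /* combine the two lowest buckets */
        bidx = 1;
    return bidx;
}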

Computing the predicted demand (the predicted demand is mainly used by EAS):

/*
 * get_pred_busy - calculate predicted demand for a task on runqueue
 *
 * @rq: runqueue of task p
 * @p: task whose prediction is being updated
 * @start: starting bucket. returned prediction should not be lower than
 *         this bucket.
 * @runtime: runtime of the task. returned prediction should not be lower
 *           than this runtime.
 * Note: @start can be derived from @runtime. It's passed in only to
 * avoid duplicated calculation in some cases.
 *
 * A new predicted busy time is returned for task @p based on @runtime
 * passed in. The function searches through buckets that represent busy
 * time equal to or bigger than @runtime and attempts to find the bucket to
 * to use for prediction. Once found, it searches through historical busy
 * time and returns the latest that falls into the bucket. If no such busy
 * time exists, it returns the medium of that bucket.
 */
static u32 get_pred_busy(struct rq *rq, struct task_struct *p,
                int start, u32 runtime)
{
    int i;
    u8 *buckets = p->ravg.busy_buckets;
    u32 *hist = p->ravg.sum_history;
    u32 dmin, dmax;
    u64 cur_freq_runtime = 0;
    int first = NUM_BUSY_BUCKETS, final;
    u32 ret = runtime;

    /* skip prediction for new tasks due to lack of history */
    if (unlikely(is_new_task(p)))                  /* a new task has no history, so no prediction is made */
        goto out;

    /* find minimal bucket index to pick */
    for (i = start; i < NUM_BUSY_BUCKETS; i++) {        /* find the first non-zero bucket at or above the starting index */
        if (buckets[i]) {
            first = i;
            break;
        }
    }
    /* if no higher buckets are filled, predict runtime */
    if (first >= NUM_BUSY_BUCKETS)                /* no bucket at or above 'start' is filled: just predict the runtime itself */
        goto out;

    /* compute the bucket for prediction */
    final = first;

    /* determine demand range for the predicted bucket */
    if (final < 2) {                        /* the lowest two buckets are combined, so clamp the index to 1 */
        /* lowest two buckets are combined */
        dmin = 0;
        final = 1;
    } else {
        dmin = mult_frac(final, max_task_load(), NUM_BUSY_BUCKETS);  /* convert the bucket index back into the runtime at the lower edge of the bucket */
    }
    dmax = mult_frac(final + 1, max_task_load(), NUM_BUSY_BUCKETS);  /* likewise, the runtime at the upper edge (index final + 1) */

    /*
     * search through runtime history and return first runtime that falls
     * into the range of predicted bucket.
     */
    for (i = 0; i < sched_ravg_hist_size; i++) {          /* search the history for the most recent runtime that falls into the predicted bucket's range */
        if (hist[i] >= dmin && hist[i] < dmax) {
            ret = hist[i];
            break;
        }
    }
    /* no historical runtime within bucket found, use average of the bin */
    if (ret < dmin)                           /* nothing found: use the middle of the bucket's range */
        ret = (dmin + dmax) / 2;
    /*
     * when updating in middle of a window, runtime could be higher
     * than all recorded history. Always predict at least runtime.
     */
    ret = max(runtime, ret);                     /* never predict less than the current runtime */
out:
    trace_sched_update_pred_demand(rq, p, runtime,
        mult_frac((unsigned int)cur_freq_runtime, 100,
              sched_ravg_window), ret);
    return ret;
}

 bucket_increase() updates the buckets: the bucket matching the index is incremented by 8 or 16 (small step / big step) but saturates at 255 (U8_MAX).

Buckets whose index does not match decay by 2 each time, down to 0.

#define INC_STEP 8
#define DEC_STEP 2
#define CONSISTENT_THRES 16
#define INC_STEP_BIG 16
/*
 * bucket_increase - update the count of all buckets
 *
 * @buckets: array of buckets tracking busy time of a task
 * @idx: the index of bucket to be incremented
 *
 * Each time a complete window finishes, count of bucket that runtime
 * falls in (@idx) is incremented. Counts of all other buckets are
 * decayed. The rate of increase and decay could be different based
 * on current count in the bucket.
 */
static inline void bucket_increase(u8 *buckets, int idx)
{
    int i, step;

    for (i = 0; i < NUM_BUSY_BUCKETS; i++) {
        if (idx != i) {
            if (buckets[i] > DEC_STEP)
                buckets[i] -= DEC_STEP;
            else
                buckets[i] = 0;
        } else {
            step = buckets[i] >= CONSISTENT_THRES ?
                        INC_STEP_BIG : INC_STEP;
            if (buckets[i] > U8_MAX - step)
                buckets[i] = U8_MAX;
            else
                buckets[i] += step;
        }
    }
}
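For example (illustrative numbers): if a task's runtime lands in bucket 4 for three consecutive windows, buckets[4] grows 0 → 8 → 16 → 32 (the step becomes INC_STEP_BIG once the count reaches CONSISTENT_THRES), while a non-matching bucket that previously held 6 decays 6 → 4 → 2 → 0 over the same windows.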

 ——————————

1.2 add_to_task_demand() is straightforward: the time the task ran is frequency-scaled and accumulated into ravg.sum, capped at sched_ravg_window (20 ms).

static u64 add_to_task_demand(struct rq *rq, struct task_struct *p, u64 delta)
{
    delta = scale_exec_time(delta, rq);  /* this is where the CPU cycle count collected earlier comes in */
    p->ravg.sum += delta;
    if (unlikely(p->ravg.sum > sched_ravg_window))
        p->ravg.sum = sched_ravg_window;

    return delta;
}
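scale_exec_time() itself is not listed in this article. Conceptually it converts wall-clock run time into "equivalent time at the maximum frequency", using the frequency derived from the CPU cycle counter mentioned above. A minimal sketch of that idea (the real helper in walt.c also folds in the cluster's efficiency/capacity scale factor):

#include <stdint.h>

/* Sketch: time spent at a low frequency represents proportionally less work
 * than the same time at the maximum frequency, so scale it down accordingly.
 * cur_freq can be derived from the cycle counter: cycles_delta / time_delta. */
static uint64_t scale_exec_time_sketch(uint64_t delta_ns,
                                       uint64_t cur_freq_khz,
                                       uint64_t max_possible_freq_khz)
{
    return delta_ns * cur_freq_khz / max_possible_freq_khz;
}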

 =================

2. While the CPU is active, update the CPU busy time (rq->curr/prev_runnable_sum)

/*
 * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum)
 */
static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
                 int event, u64 wallclock, u64 irqtime)
{
    int new_window, full_window = 0;
    int p_is_curr_task = (p == rq->curr);
    u64 mark_start = p->ravg.mark_start;
    u64 window_start = rq->window_start;
    u32 window_size = sched_ravg_window;
    u64 delta;
    u64 *curr_runnable_sum = &rq->curr_runnable_sum;
    u64 *prev_runnable_sum = &rq->prev_runnable_sum;
    u64 *nt_curr_runnable_sum = &rq->nt_curr_runnable_sum;
    u64 *nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;
    bool new_task;
    struct related_thread_group *grp;
    int cpu = rq->cpu;
    u32 old_curr_window = p->ravg.curr_window;

    new_window = mark_start < window_start;
    if (new_window) {
        full_window = (window_start - mark_start) >= window_size;  /* full_window means at least one whole window (> 20 ms) has passed since the last update */
        if (p->ravg.active_windows < USHRT_MAX)
            p->ravg.active_windows++;
    }

    new_task = is_new_task(p);          /* based on ravg.active_windows < 5: the task has only been tracked for a few windows, i.e. it was created recently */

    /*
     * Handle per-task window rollover. We don't care about the idle
     * task or exiting tasks.
     */
    if (!is_idle_task(p) && !exiting_task(p)) {        /* the rollover of idle or exiting tasks does not need to be handled */
        if (new_window)
            rollover_task_window(p, full_window);      /* the task rollover simply stores the current window into the previous one (curr -> prev), including the per-CPU prev_window_cpu[] arrays */
    }

    if (p_is_curr_task && new_window) {
        rollover_cpu_window(rq, full_window);        /* roll over the rq and rq->grp_time curr_runnable_sum / nt_curr_runnable_sum into the prev counterparts */
        rollover_top_tasks(rq, full_window);         /* roll over the top-tasks table and curr_top */
    }

    if (!account_busy_for_cpu_time(rq, p, irqtime, event))  /* not busy (e.g. migrating, idle): just update the top tasks */
        goto done;

    grp = p->grp;
    if (grp) {                             /* if the task belongs to a related_thread_group, account into grp_time's curr/nt_curr_runnable_sum instead */
        struct group_cpu_time *cpu_time = &rq->grp_time;

        curr_runnable_sum = &cpu_time->curr_runnable_sum;
        prev_runnable_sum = &cpu_time->prev_runnable_sum;
        nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
        nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
    }

    if (!new_window) {
        /*
         * account_busy_for_cpu_time() = 1 so busy time needs
         * to be accounted to the current window. No rollover
         * since we didn't start a new window. An example of this is
         * when a task starts execution and then sleeps within the
         * same window.    (the task runs and then sleeps within the same window, so no rollover is needed)
         */
        if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq))
            delta = wallclock - mark_start;                /* not pure irqtime on an idle CPU: delta = wallclock - mark_start */
        else
            delta = irqtime;                         /* irqtime on an idle CPU: delta = irqtime */
        delta = scale_exec_time(delta, rq);
        *curr_runnable_sum += delta;            /* after scaling, account into curr_runnable_sum */
        if (new_task)
            *nt_curr_runnable_sum += delta;        /* for a new task, also accumulate into nt_curr_runnable_sum */

        if (!is_idle_task(p) && !exiting_task(p)) {
            p->ravg.curr_window += delta;         /* update curr_window and the per-CPU curr_window_cpu[cpu] */
            p->ravg.curr_window_cpu[cpu] += delta;
        }

        goto done;
    }

    if (!p_is_curr_task) {
        /*
         * account_busy_for_cpu_time() = 1 so busy time needs
         * to be accounted to the current window. A new window
         * has also started, but p is not the current task, so the
         * window is not rolled over - just split up and account
         * as necessary into curr and prev. The window is only
         * rolled over when a new window is processed for the current
         * task.
         *
         * Irqtime can't be accounted by a task that isn't the
         * currently running task.    (p is not the current task, so irqtime cannot be charged to p)
         */
        if (!full_window) {
            /*
             * A full window hasn't elapsed, account partial
             * contribution to previous completed window.   (no full window has elapsed: only the previous window is updated)
             */
            delta = scale_exec_time(window_start - mark_start, rq);  /* amount accounted: window_start - mark_start */
            if (!exiting_task(p)) {
                p->ravg.prev_window += delta;
                p->ravg.prev_window_cpu[cpu] += delta;
            }
        } else {
            /*
             * Since at least one full window has elapsed,
             * the contribution to the previous window is the
             * full window (window_size).
             */
            delta = scale_exec_time(window_size, rq);        /* a full window has elapsed: the amount accounted is window_size, 20 ms */
            if (!exiting_task(p)) {
                p->ravg.prev_window = delta;
                p->ravg.prev_window_cpu[cpu] = delta;
            }
        }

        *prev_runnable_sum += delta;              /* then update prev_runnable_sum and nt_prev_runnable_sum */
        if (new_task)
            *nt_prev_runnable_sum += delta;

        /* Account piece of busy time in the current window. */
        delta = scale_exec_time(wallclock - window_start, rq);  /* then account the current window into curr_runnable_sum and nt_curr_runnable_sum */
        *curr_runnable_sum += delta;
        if (new_task)
            *nt_curr_runnable_sum += delta;

        if (!exiting_task(p)) {                  /* and update curr_window and the per-CPU curr_window_cpu[cpu] */
            p->ravg.curr_window = delta;
            p->ravg.curr_window_cpu[cpu] = delta;
        }

        goto done;
    }

    if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) {
        /*
         * account_busy_for_cpu_time() = 1 so busy time needs
         * to be accounted to the current window. A new window
         * has started and p is the current task so rollover is
         * needed. If any of these three above conditions are true
         * then this busy time can't be accounted as irqtime.
         *
         * Busy time for the idle task or exiting tasks need not
         * be accounted.
         *
         * An example of this would be a task that starts execution
         * and then sleeps once a new window has begun.    (a task starts running and then sleeps once a new window has begun; curr_runnable_sum etc. are accounted in a similar way)
         */
        if (!full_window) {
            /*
             * A full window hasn't elapsed, account partial
             * contribution to previous completed window.
             */
            delta = scale_exec_time(window_start - mark_start, rq);
            if (!is_idle_task(p) && !exiting_task(p)) {
                p->ravg.prev_window += delta;
                p->ravg.prev_window_cpu[cpu] += delta;
            }
        } else {
            /*
             * Since at least one full window has elapsed,
             * the contribution to the previous window is the
             * full window (window_size).
             */
            delta = scale_exec_time(window_size, rq);
            if (!is_idle_task(p) && !exiting_task(p)) {
                p->ravg.prev_window = delta;
                p->ravg.prev_window_cpu[cpu] = delta;
            }
        }

        /*
         * Rollover is done here by overwriting the values in
         * prev_runnable_sum and curr_runnable_sum.
         */
        *prev_runnable_sum += delta;
        if (new_task)
            *nt_prev_runnable_sum += delta;

        /* Account piece of busy time in the current window. */
        delta = scale_exec_time(wallclock - window_start, rq);
        *curr_runnable_sum += delta;
        if (new_task)
            *nt_curr_runnable_sum += delta;

        if (!is_idle_task(p) && !exiting_task(p)) {
            p->ravg.curr_window = delta;
            p->ravg.curr_window_cpu[cpu] = delta;
        }

        goto done;
    }

    if (irqtime) {                              /* when called from scheduler_tick(), irqtime = 0 */
        /*
         * account_busy_for_cpu_time() = 1 so busy time needs
         * to be accounted to the current window. A new window
         * has started and p is the current task so rollover is
         * needed. The current task must be the idle task because
         * irqtime is not accounted for any other task.
         *
         * Irqtime will be accounted each time we process IRQ activity
         * after a period of idleness, so we know the IRQ busy time
         * started at wallclock - irqtime.    (when an IRQ occurs, its busy time is accounted into curr/prev_runnable_sum in a similar way)
         */
        BUG_ON(!is_idle_task(p));
        mark_start = wallclock - irqtime;

        /*
         * Roll window over. If IRQ busy time was just in the current
         * window then that is all that need be accounted.
         */
        if (mark_start > window_start) {
            *curr_runnable_sum = scale_exec_time(irqtime, rq);
            return;
        }

        /*
         * The IRQ busy time spanned multiple windows. Process the
         * busy time preceding the current window start first.
         */
        delta = window_start - mark_start;
        if (delta > window_size)
            delta = window_size;
        delta = scale_exec_time(delta, rq);
        *prev_runnable_sum += delta;

        /* Process the remaining IRQ busy time in the current window. */
        delta = wallclock - window_start;
        rq->curr_runnable_sum = scale_exec_time(delta, rq);

        return;
    }

done:
    if (!is_idle_task(p) && !exiting_task(p))
        update_top_tasks(p, rq, old_curr_window,
                    new_window, full_window);      /* (2.1) update the CPU's top tasks */
}

 2.1 Update the top tasks, maintaining curr_table/prev_table

static void update_top_tasks(struct task_struct *p, struct rq *rq,
        u32 old_curr_window, int new_window, bool full_window)
{
    u8 curr = rq->curr_table;
    u8 prev = 1 - curr;
    u8 *curr_table = rq->top_tasks[curr];
    u8 *prev_table = rq->top_tasks[prev];
    int old_index, new_index, update_index;
    u32 curr_window = p->ravg.curr_window;
    u32 prev_window = p->ravg.prev_window;
    bool zero_index_update;

    if (old_curr_window == curr_window && !new_window)
        return;

    old_index = load_to_index(old_curr_window);      /* convert a load into an index */
    new_index = load_to_index(curr_window);

    if (!new_window) {                              /* no new window: update the counts at the old and new indices in the current top table, */
        zero_index_update = !old_curr_window && curr_window;       /* and, based on those counts, update the corresponding bits in rq->top_tasks_bitmap[curr] */
        if (old_index != new_index || zero_index_update) {
            if (old_curr_window)
                curr_table[old_index] -= 1;
            if (curr_window)
                curr_table[new_index] += 1;
            if (new_index > rq->curr_top)
                rq->curr_top = new_index;
        }

        if (!curr_table[old_index])
            __clear_bit(NUM_LOAD_INDICES - old_index - 1,
                rq->top_tasks_bitmap[curr]);

        if (curr_table[new_index] == 1)
            __set_bit(NUM_LOAD_INDICES - new_index - 1,
                rq->top_tasks_bitmap[curr]);

        return;
    }

    /*
     * The window has rolled over for this task. By the time we get
     * here, curr/prev swaps would have already occurred. So we need
     * to use prev_window for the new index.            (with a new window, the accounting is done in two parts: one for prev_window, the other for curr_window)
     */
    update_index = load_to_index(prev_window);

    if (full_window) {
        /*
         * Two cases here. Either 'p' ran for the entire window or
         * it didn't run at all. In either case there is no entry
         * in the prev table. If 'p' ran the entire window, we just
         * need to create a new entry in the prev table. In this case
         * update_index will be correspond to sched_ravg_window
         * so we can unconditionally update the top index.
         */
        if (prev_window) {
            prev_table[update_index] += 1;
            rq->prev_top = update_index;
        }

        if (prev_table[update_index] == 1)
            __set_bit(NUM_LOAD_INDICES - update_index - 1,
                rq->top_tasks_bitmap[prev]);
    } else {
        zero_index_update = !old_curr_window && prev_window;
        if (old_index != update_index || zero_index_update) {
            if (old_curr_window)
                prev_table[old_index] -= 1;

            prev_table[update_index] += 1;

            if (update_index > rq->prev_top)
                rq->prev_top = update_index;

            if (!prev_table[old_index])
                __clear_bit(NUM_LOAD_INDICES - old_index - 1,
                        rq->top_tasks_bitmap[prev]);

            if (prev_table[update_index] == 1)
                __set_bit(NUM_LOAD_INDICES - update_index - 1,
                        rq->top_tasks_bitmap[prev]);
        }
    }

    if (curr_window) {
        curr_table[new_index] += 1;

        if (new_index > rq->curr_top)
            rq->curr_top = new_index;

        if (curr_table[new_index] == 1)
            __set_bit(NUM_LOAD_INDICES - new_index - 1,
                rq->top_tasks_bitmap[curr]);
    }
}

 =================

3. At window rollover, if the task's busy time in the current window exceeds the predicted demand, the prediction is updated.

/*
 * predictive demand of a task is calculated at the window roll-over.
 * if the task current window busy time exceeds the predicted
 * demand, update it here to reflect the task needs.
 */
void update_task_pred_demand(struct rq *rq, struct task_struct *p, int event)
{
    u32 new, old;
    u16 new_scaled;

    if (!sched_predl)
        return;

    if (is_idle_task(p) || exiting_task(p))
        return;

    if (event != PUT_PREV_TASK && event != TASK_UPDATE &&
            (!SCHED_FREQ_ACCOUNT_WAIT_TIME ||
             (event != TASK_MIGRATE &&
             event != PICK_NEXT_TASK)))
        return;

    /*
     * TASK_UPDATE can be called on sleeping task, when its moved between
     * related groups
     */
    if (event == TASK_UPDATE) {
        if (!p->on_rq && !SCHED_FREQ_ACCOUNT_WAIT_TIME)
            return;
    }

    new = calc_pred_demand(rq, p);  /* compute the new predicted demand, using the same method as get_pred_busy() in 1.1.1 */
    old = p->ravg.pred_demand;

    if (old >= new)          /* nothing to update, return */
        return;

    new_scaled = scale_demand(new);
    if (task_on_rq_queued(p) && (!task_has_dl_policy(p) ||          /* fix up the walt stats (cumulative demand), as in 1.1 */
                !p->dl.dl_throttled) &&
                p->sched_class->fixup_walt_sched_stats)
        p->sched_class->fixup_walt_sched_stats(rq, p,
                p->ravg.demand_scaled,
                new_scaled);

    p->ravg.pred_demand = new;              /* update pred_demand and pred_demand_scaled */
    p->ravg.pred_demand_scaled = new_scaled;
}

=================

4. Decide, based on the fresh window data, whether the CPU frequency needs to be adjusted; core control is also consulted about waking up a cluster/core.

static inline void run_walt_irq_work(u64 old_window_start, struct rq *rq)
{
    u64 result;

    if (old_window_start == rq->window_start)      /* filter: the window has not rolled over, avoid redundant calls */
        return;

    result = atomic64_cmpxchg(&walt_irq_work_lastq_ws, old_window_start,
                   rq->window_start);
    if (result == old_window_start)
        irq_work_queue(&walt_cpufreq_irq_work);  /* queues walt_irq_work() */
}
static void walt_init_once(void)
{
...
    init_irq_work(&walt_cpufreq_irq_work, walt_irq_work);
...
}
/*
 * Runs in hard-irq context. This should ideally run just after the latest
 * window roll-over.
 */
void walt_irq_work(struct irq_work *irq_work)
{
    struct sched_cluster *cluster;
    struct rq *rq;
    int cpu;
    u64 wc;
    bool is_migration = false, is_asym_migration = false;
    u64 total_grp_load = 0, min_cluster_grp_load = 0;
    int level = 0;

    /* Am I the window rollover work or the migration work? */
    if (irq_work == &walt_migration_irq_work)
        is_migration = true;

    for_each_cpu(cpu, cpu_possible_mask) {
        if (level == 0)
            raw_spin_lock(&cpu_rq(cpu)->lock);
        else
            raw_spin_lock_nested(&cpu_rq(cpu)->lock, level);
        level++;
    }

    wc = sched_ktime_clock();
    walt_load_reported_window = atomic64_read(&walt_irq_work_lastq_ws);
    for_each_sched_cluster(cluster) {                            /* iterate over every cluster and every CPU */
        u64 aggr_grp_load = 0;

        raw_spin_lock(&cluster->load_lock);

        for_each_cpu(cpu, &cluster->cpus) {
            rq = cpu_rq(cpu);
            if (rq->curr) {
                update_task_ravg(rq->curr, rq,                  /* call update_task_ravg() to refresh the stats */
                        TASK_UPDATE, wc, 0);
                account_load_subtractions(rq);                  /* subtract the load_subtractions from curr/prev_runnable_sum and nt_curr/prev_runnable_sum; presumably to keep the sums from growing without bound */
                aggr_grp_load += rq->grp_time.prev_runnable_sum;       /* accumulate the cluster's group load */
            }
            if (is_migration && rq->notif_pending &&
                cpumask_test_cpu(cpu, &asym_cap_sibling_cpus)) {
                is_asym_migration = true;
                rq->notif_pending = false;
            }
        }

        cluster->aggr_grp_load = aggr_grp_load;
        total_grp_load += aggr_grp_load;          /* accumulate the total group load */

        if (is_min_capacity_cluster(cluster))
            min_cluster_grp_load = aggr_grp_load;
        raw_spin_unlock(&cluster->load_lock);
    }

    if (total_grp_load) {
        if (cpumask_weight(&asym_cap_sibling_cpus)) {
            u64 big_grp_load =
                      total_grp_load - min_cluster_grp_load;

            for_each_cpu(cpu, &asym_cap_sibling_cpus)
                cpu_cluster(cpu)->aggr_grp_load = big_grp_load;
        }
        rtgb_active = is_rtgb_active();
    } else {
        rtgb_active = false;
    }

    if (!is_migration && sysctl_sched_user_hint && time_after(jiffies,
                    sched_user_hint_reset_time))
        sysctl_sched_user_hint = 0;

    for_each_sched_cluster(cluster) {
        cpumask_t cluster_online_cpus;
        unsigned int num_cpus, i = 1;

        cpumask_and(&cluster_online_cpus, &cluster->cpus,
                        cpu_online_mask);
        num_cpus = cpumask_weight(&cluster_online_cpus);
        for_each_cpu(cpu, &cluster_online_cpus) {
            int flag = SCHED_CPUFREQ_WALT;

            rq = cpu_rq(cpu);

            if (is_migration) {
                if (rq->notif_pending) {
                    flag |= SCHED_CPUFREQ_INTERCLUSTER_MIG;
                    rq->notif_pending = false;
                }
            }

            if (is_asym_migration && cpumask_test_cpu(cpu,
                            &asym_cap_sibling_cpus))
                flag |= SCHED_CPUFREQ_INTERCLUSTER_MIG;

            if (i == num_cpus)
                cpufreq_update_util(cpu_rq(cpu), flag);  /* adjust the CPU frequency */
            else
                cpufreq_update_util(cpu_rq(cpu), flag |  /* flag: do not change the CPU frequency yet */
                            SCHED_CPUFREQ_CONTINUE);
            i++;
        }
    }

    for_each_cpu(cpu, cpu_possible_mask)
        raw_spin_unlock(&cpu_rq(cpu)->lock);

    if (!is_migration)
        core_ctl_check(this_rq()->window_start);  /* on window rollover, let core control check whether a cluster/core needs to be woken up or isolated */
}

  2. IRQ load accounting

When an IRQ fires, the function irqtime_account_irq() is called:

/*
 * Called before incrementing preempt_count on {soft,}irq_enter
 * and before decrementing preempt_count on {soft,}irq_exit.
 */
void irqtime_account_irq(struct task_struct *curr)
{
...
#ifdef CONFIG_SCHED_WALT
    u64 wallclock;
    bool account = true;
#endif
...
#ifdef CONFIG_SCHED_WALT
    wallclock = sched_clock_cpu(cpu);
#endif
    delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
    irqtime->irq_start_time += delta;

    /*
     * We do not account for softirq time from ksoftirqd here.
     * We want to continue accounting softirq time to ksoftirqd thread
     * in that case, so as not to confuse scheduler with a special task
 * that do not consume any time, but still wants to run.    (softirq time executed from ksoftirqd is not accounted here; it keeps being charged to the ksoftirqd thread itself)
     */
    if (hardirq_count())
        irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
    else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
        irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
#ifdef CONFIG_SCHED_WALT
    else
        account = false;

    if (account)
        sched_account_irqtime(cpu, curr, delta, wallclock);  /* account the irqtime */
    else if (curr != this_cpu_ksoftirqd())
        sched_account_irqstart(cpu, curr, wallclock);
#endif
}

Here delta is the time the IRQ ran. Originally delta was the difference between the IRQ start time and the moment irqtime_account_irq() executed; by the time sched_account_irqtime() runs, additional instructions have executed in between, so delta is corrected once more: delta += sched_clock() - wallclock (the timestamp taken earlier).

void sched_account_irqtime(int cpu, struct task_struct *curr,
                 u64 delta, u64 wallclock)
{
    struct rq *rq = cpu_rq(cpu);
    unsigned long flags, nr_windows;
    u64 cur_jiffies_ts;

    raw_spin_lock_irqsave(&rq->lock, flags);

    /*
     * cputime (wallclock) uses sched_clock so use the same here for
     * consistency.
     */
    delta += sched_clock() - wallclock;      /* correct the irqtime for the code executed since wallclock was taken */
    cur_jiffies_ts = get_jiffies_64();

    if (is_idle_task(curr))                         /* if the current task is idle, update the task/CPU load with an IRQ_UPDATE event */
        update_task_ravg(curr, rq, IRQ_UPDATE, sched_ktime_clock(),
                 delta);

    nr_windows = cur_jiffies_ts - rq->irqload_ts;        /* how many jiffies have passed since the last IRQ-load update */

    if (nr_windows) {
        if (nr_windows < 10) {                    /* fewer than 10 windows since the last update: decay, avg_irqload = avg_irqload * 3/4 */
            /* Decay CPU's irqload by 3/4 for each window. */
            rq->avg_irqload *= (3 * nr_windows);
            rq->avg_irqload = div64_u64(rq->avg_irqload,
                            4 * nr_windows);
        } else {
            rq->avg_irqload = 0;        /* 10 or more windows have passed: avg_irqload = 0 (in short, if IRQs on this CPU are far apart in time, its avg_irqload becomes negligible) */
        }
        rq->avg_irqload += rq->cur_irqload;  /* fold cur_irqload into the average, then clear it so it can be refilled below */
        rq->cur_irqload = 0;
    }

    rq->cur_irqload += delta;          /* accumulate the current IRQ time */
    rq->irqload_ts = cur_jiffies_ts;      /* refresh the irqload timestamp */
    raw_spin_unlock_irqrestore(&rq->lock, flags);
}

 Deciding whether the irqload is high:

__read_mostly unsigned int sysctl_sched_cpu_high_irqload = (10 * NSEC_PER_MSEC);

static inline int sched_cpu_high_irqload(int cpu)
{
    return sched_irqload(cpu) >= sysctl_sched_cpu_high_irqload;  /* 10 ms */
}
#define SCHED_HIGH_IRQ_TIMEOUT 3

static inline u64 sched_irqload(int cpu)
{
    struct rq *rq = cpu_rq(cpu);
    s64 delta;

    delta = get_jiffies_64() - rq->irqload_ts;
    /*
     * Current context can be preempted by irq and rq->irqload_ts can be
     * updated by irq context so that delta can be negative.
     * But this is okay and we can safely return as this means there
     * was recent irq occurrence.
     */

    if (delta < SCHED_HIGH_IRQ_TIMEOUT)    /* if 3 or more jiffies (ticks) have passed since the last IRQ-load update, the IRQ load is treated as 0 */
        return rq->avg_irqload;
    else
        return 0;
}

  

A high irqload affects EAS:

Call path: find_energy_efficient_cpu() --> find_best_target() --> sched_cpu_high_irqload()

This path looks for the best CPU in the sched domain to move the task to; if a CPU's irqload is high, that CPU is considered unsuitable and the search moves on to the other CPUs. This is also part of load balancing (see the sketch below).
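A minimal, illustrative sketch of that selection idea (not the kernel code: cpu_irqload_ns() and cpu_spare_capacity() are hypothetical stand-ins for the kernel's sched_irqload() and capacity bookkeeping; the real find_best_target() in fair.c weighs many more conditions):

#include <stddef.h>

/* Hypothetical helpers standing in for the kernel's per-CPU bookkeeping. */
extern unsigned long long cpu_irqload_ns(int cpu);
extern unsigned long cpu_spare_capacity(int cpu);

/* 10 ms, mirroring sysctl_sched_cpu_high_irqload */
#define HIGH_IRQLOAD_NS (10ULL * 1000 * 1000)

/* Pick the candidate with the most spare capacity, skipping CPUs whose
 * recent IRQ load is above the threshold, as find_best_target() does. */
static int pick_best_cpu(const int *candidates, size_t n)
{
    int best = -1;
    unsigned long best_spare = 0;

    for (size_t i = 0; i < n; i++) {
        int cpu = candidates[i];

        if (cpu_irqload_ns(cpu) >= HIGH_IRQLOAD_NS)
            continue;   /* CPU is busy servicing IRQs: not suitable */

        if (cpu_spare_capacity(cpu) > best_spare) {
            best_spare = cpu_spare_capacity(cpu);
            best = cpu;
        }
    }
    return best;
}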

 

 

 

  • Uses of the WALT results

  1. Load balancing (task migration)

Take can_migrate_task() as an example:

task_util() returns the task's demand, i.e. the task-level load.

cpu_util_cum() returns the cumulative demand of the CPU's rq, i.e. the CPU-level load.

If dst_cpu's cumulative demand plus the task's demand would exceed src_cpu's cumulative demand minus the task's demand, the migration condition is not met.

/*
 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
 */
static
int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
    ...
        demand = task_util(p);  /* task-level load */
        util_cum_dst = cpu_util_cum(env->dst_cpu, 0) + demand;  /* cpu_util_cum() gives the CPU-level load */
        util_cum_src = cpu_util_cum(env->src_cpu, 0) - demand;
    
        if (util_cum_dst > util_cum_src)
            return 0;
    ...
}

The irqload and the predicted demand discussed earlier in this article also influence load balancing.

 

  2. CPU frequency scaling

There are three call paths through which WALT data drives CPU frequency changes:

  • walt_irq_work(): the WALT irq work
  • scheduler_tick(): the early-detection case during periodic scheduling (if the scheduler finds a task that has been runnable for longer than SCHED_EARLY_DETECTION_DURATION, it notifies the governor that the CPU frequency needs to be raised; the original documentation explains it as follows)
A further enhancement during boost is the scheduler' early detection feature.
While boost is in effect the scheduler checks for the precence of tasks that
have been runnable for over some period of time within the tick. For such
tasks the scheduler informs the governor of imminent need for high frequency.
If there exists a task on the runqueue at the tick that has been runnable
for greater than SCHED_EARLY_DETECTION_DURATION amount of time, it notifies
the governor with a fabricated load of the full window at the highest
frequency. The fabricated load is maintained until the task is no longer
runnable or until the next tick.

 

  • try_to_wake_up(): task wakeup
walt_irq_work()  
scheduler_tick()  --> flag = SCHED_CPUFREQ_WALT --> cpufreq_update_util(cpu_rq(cpu),flag)
try_to_wake_up()  

Frequency scaling goes through one of two kinds of governor: the original [CPU FREQ governors] or the newer [schedutil governor].

1. CPU FREQ governor
static void gov_set_update_util(struct policy_dbs_info *policy_dbs,
                unsigned int delay_us)
{
    ...
        cpufreq_add_update_util_hook(cpu, &cdbs->update_util,
                         dbs_update_util_handler);
    ...
}


2. [schedutil] cpufreq governor
static int sugov_start(struct cpufreq_policy *policy)
{
    ...
        cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
                         policy_is_shared(policy) ?
                            sugov_update_shared :
                            sugov_update_single);
    ...
}

Here we take schedutil as the example:

 

freq_policy_load() in walt.c returns the load (util) computed by WALT, which is then used in sugov_next_freq_shared() to compute the new CPU frequency.

static inline u64 freq_policy_load(struct rq *rq)
{
    unsigned int reporting_policy = sysctl_sched_freq_reporting_policy;
    struct sched_cluster *cluster = rq->cluster;
    u64 aggr_grp_load = cluster->aggr_grp_load;
    u64 load, tt_load = 0;
    struct task_struct *cpu_ksoftirqd = per_cpu(ksoftirqd, cpu_of(rq));

    if (rq->ed_task != NULL) {      /* an early-detection task exists: report a load of one full window (20 ms) */
        load = sched_ravg_window;
        goto done;
    }

    if (sched_freq_aggr_en)                    /* frequency aggregation changes the load calculation; it is enabled while sched_boost is full_throttle_boost or restrained_boost and disabled when those exit */
        load = rq->prev_runnable_sum + aggr_grp_load;
    else
        load = rq->prev_runnable_sum + rq->grp_time.prev_runnable_sum;

    if (cpu_ksoftirqd && cpu_ksoftirqd->state == TASK_RUNNING)
        load = max_t(u64, load, task_load(cpu_ksoftirqd));  /* if ksoftirqd is currently running, also consider its load */

    tt_load = top_task_load(rq);                /* load of the top task */
    switch (reporting_policy) {              /* choose which load to report according to the reporting policy */
    case FREQ_REPORT_MAX_CPU_LOAD_TOP_TASK:
        load = max_t(u64, load, tt_load);        /* take the larger of the two */
        break;
    case FREQ_REPORT_TOP_TASK:
        load = tt_load;
        break;
    case FREQ_REPORT_CPU_LOAD:
        break;
    default:
        break;
    }

    if (should_apply_suh_freq_boost(cluster)) {      /* should the user-hint frequency boost be applied? */
        if (is_suh_max())
            load = sched_ravg_window;
        else
            load = div64_u64(load * sysctl_sched_user_hint,
                     (u64)100);
    }

done:
    trace_sched_load_to_gov(rq, aggr_grp_load, tt_load, sched_freq_aggr_en,
                load, reporting_policy, walt_rotation_enabled,
                sysctl_sched_user_hint);
    return load;
}
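Once freq_policy_load() has been reported through cpufreq_update_util(), schedutil turns the utilization into a target frequency. A simplified sketch of schedutil's frequency formula (the real get_next_freq() also handles policy limits, cached results and scale-invariance details):

#include <stdint.h>

/* Simplified form of schedutil's formula:
 *   next_freq = 1.25 * max_freq * util / max_capacity
 * The 25% headroom keeps the CPU from sitting at exactly 100% utilization. */
static unsigned int next_freq_sketch(unsigned int max_freq_khz,
                                     unsigned long util,
                                     unsigned long max_cap)
{
    unsigned int freq = max_freq_khz + (max_freq_khz >> 2);

    return (unsigned int)((uint64_t)freq * util / max_cap);
}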

 

 

WALT vs PELT

 To sum up, WALT's advantages:

1. It identifies heavy tasks more quickly.

2. It computes CPU utilization faster, so CPU frequency can be ramped up and down more quickly.

 


