調度器26—Linux內核中的各種時間頻率


一、各種時間的打印

1. per-cpu的各種類型的使用時間

# ls -l /proc/stat
-r--r--r-- 1 root root 0 2021-01-01 19:46 /proc/stat
# cat /proc/stat
cpu  203632 46353 386930 31815547 3869 274339 68486 0 0 0
cpu0 26704 7709 39012 3916272 49 87626 23620 0 0 0
cpu1 14682 9898 25125 4055433 68 8755 3338 0 0 0
cpu2 5588 8202 7818 4098854 47 2215 901 0 0 0
cpu3 21765 10971 40654 4014299 341 19606 3900 0 0 0
cpu4 28157 1362 52559 3983416 725 25697 6661 0 0 0
cpu5 58390 2212 140189 3718682 1273 96146 17063 0 0 0
cpu6 42753 1587 70162 3930832 1008 32193 11836 0 0 0
cpu7 5588 4407 11408 4097755 355 2097 1164 0 0 0
intr 71408793 0 32194638 9259224 0 0 56084 91247 0 0 0 0 0 0 0 0 0 0 0 0 0 23940117 0 0 0 0 1022833 0 0 0 0 0 0 0 0 739 1176966 83 213 253 2243389 758 207033 6503 1916 0 0 9173 0 12210 0 0 0 0 0 140 0 0 10 2058 554 0 0 0 18070 0 0 5083 0 0 0 0 224 0 48 0 0 0 2984 0 0 0 29162 0 49591 0 9466 0 0 0 0 0 0 0 0 159 159 0 0 374 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 8365 0 0 0 0 25095 0 0 0 3686 0 0 7767 0 0 0 0 0 0 0 0 0 16034 0 0 0 0 0 231848 0 0 0 25090 0 0 0 3558 0 0 8736 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3144 0 3036 181465 0 0 1400 2 1403 1 504929 32592 637 0 0 12 15 0 0 3 0 3 30 0 0 2 0 6653 9 0 279 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 168 0 0 0 0 96 0 8 0 0 0 0 0 0 0 0 0 0 520 40 0 0 0 0 131 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 98 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 133 0 1 7 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 24 8 0 0 0 2 67 98 126 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
ctxt 61029826
btime 1609501574
processes 27212
procs_running 1
procs_blocked 0
softirq 8564148 1172 1338008 1 3 243852 0 1611 5229125 0 1750376

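對應的時間類型定義在內核頭文件 include/linux/kernel_stat.h。注意:上面輸出中 cpu[0...7] 后各列的順序為 user、nice、system、idle、iowait、irq、softirq、steal、guest、guest_nice,與下面枚舉的定義順序並不完全一致(例如第4列是 idle 而不是 softirq):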

/*
 * 'kernel_stat.h' contains the definitions needed for doing
 * some kernel statistics (CPU usage, context switches ...),
 * used by rstatd/perfmeter
 */
/* Categories of CPU time that are accounted; NR_STATS is the number of categories. */
enum cpu_usage_stat {
    CPUTIME_USER, // CPU time spent in user space
    CPUTIME_NICE, // user-space CPU time of tasks with a high nice value (low priority)
    CPUTIME_SYSTEM, // CPU time spent in kernel mode
    CPUTIME_SOFTIRQ, // CPU time spent in softirqs
    CPUTIME_IRQ, // CPU time spent in hard interrupts
    CPUTIME_IDLE, // CPU idle time
    CPUTIME_IOWAIT, // CPU time spent waiting for I/O
    CPUTIME_STEAL, // time the guest OS spent waiting for a real CPU
    CPUTIME_GUEST, // time consumed by the guest OS
    CPUTIME_GUEST_NICE, // time consumed by the guest OS for tasks with a high nice value (low priority)
    NR_STATS,
};

打印函數為 fs/proc/stat.c 中的 show_stat(),輸出數值的單位為 USER_HZ(即 clock_t,通常為 1/100 秒),而不是內核內部的 jiffies。在linux系統中,cputime模塊具有重要的意義。它記錄了設備中所有cpu在各個狀態下經過的時間。我們所熟悉的top工具就是用cputime換算出的cpu利用率。

 

2. per-cluster的在其各個頻點下駐留的時間

cpufreq_stats 模塊的開啟需要使能 CONFIG_CPU_FREQ_STAT 宏。當系統使能該特性后,cpufreq driver sysfs下生成 stats 目錄:

/sys/devices/system/cpu/cpufreq/policy0/stats # ls -l
total 0
--w-------    reset //可以對統計進行reset
-r--r--r--    time_in_state //本cluster在各頻點下駐留的時間,單位為10ms(USER_HZ,即clock_t)
-r--r--r--    total_trans //頻點之間總切換次數
-r--r--r--    trans_table //頻點轉換表

# cat /sys/devices/system/cpu/cpufreq/policy0/stats/time_in_state
1800000 5647
1700000 7
...
200000 4221664

表示的是該 cpufreq policy 內分別處於各個頻點的時間,單位為 10ms(內核內部以 jiffies 統計,輸出時轉換為 USER_HZ 即 clock_t)。有了這個功能,我們就能獲取每個 cluster 運行最多的頻點是哪些,進而針對性的對系統功耗性能進行優化。

 

3. per-線程在各個頻點下駐留的時間

# cat /proc/913/time_in_state
cpu0
1800000 0
...
1250000 2638
...
200000 0
cpu4
2850000 0
...
200000 0
cpu7
3050000 0
...
1300000 9

該節點記錄了該線程在各個 cpufreq policy 的各個頻點下駐留的時間, 單位為 clock_t。clock_t 由 USER_HZ 決定(注意不是內核的 HZ):Linux 上 USER_HZ 一般固定為100,此時1個 clock_t 代表10ms;若系統中 USER_HZ 為250,則1個 clock_t 代表4ms。

 

4. per-cpu的cpuidle time

# ls -l /sys/devices/system/cpu/cpu0/cpuidle
drwxr-xr-x    driver
drwxr-xr-x    state0
drwxr-xr-x    state1
drwxr-xr-x    state2
drwxr-xr-x    state3
drwxr-xr-x    state4
drwxr-xr-x    state5
drwxr-xr-x    state6

# ls -l /sys/devices/system/cpu/cpu0/cpuidle/state0
...
-r--r--r-- 1 root root 4096 2021-01-02 19:51 time

# cat /sys/devices/system/cpu/cpu0/cpuidle/state*/time
2675541339
13746613328
0
0
460
24621035515
0

cpuidle time 模塊的工作就是記錄每個cpu在各層深度中睡了多久,即每次開機以來,每個核在每個 C-state下的時長,單位為 us。


二、各種時間統計原理

1. per-cpu的各種類型的使用時間

cputime 模塊代碼位於 kernel/sched/cputime.c。由上圖可見,統計的時間精度是1個tick。當每次timer中斷來臨時,kernel經過由中斷處理函數調用到 irqtime_account_process_tick()(需要使能特性宏 CONFIG_IRQ_TIME_ACCOUNTING,將irq/softirq的統計囊括其中)。通過判斷當前task是否為 softirq/user tick/idle進程/guest系統進程/內核進程,將經歷的cpu時間(通常為1個tick)統計到各個類型中去。

/*
 * Charge timer ticks to a task and to the cpustat categories.
 * @p: the task the CPU time is charged to
 * @user_tick: non-zero when the tick came from userspace
 * @ticks: number of ticks to account (each worth TICK_NSEC nanoseconds)
 *
 * The tick is demultiplexed in this order:
 * - pending hardirq update
 * - pending softirq update
 * - user time
 * - idle time
 * - system time
 *   - guest time when the task is a vCPU
 *   - plain system time otherwise
 *
 * Hardirq time is checked for both system and user ticks: no timer goes off
 * while a hardirq is being handled, so there may never be an opportunity to
 * update it from system time alone.
 * p->stime and friends are updated only on system time, not on irq/softirq
 * time, because those no longer count in the task's exec_runtime.
 */
static void irqtime_account_process_tick(struct task_struct *p, int user_tick, int ticks)
{
    u64 remaining = TICK_NSEC * ticks;
    u64 consumed;

    /*
     * Coming back from idle, many ticks may be accounted in one go and
     * some of them belong to steal, irq or softirq time. Take those out
     * of what will be charged to idle (or to user/system time). Because
     * of rounding, the "other" time can occasionally exceed the ticks,
     * in which case nothing is left to charge.
     */
    consumed = account_other_time(ULONG_MAX);
    if (consumed >= remaining)
        return;

    remaining -= consumed;

    /*
     * Time spent in ksoftirqd is not part of cpu_softirq_time, so it is
     * charged here as softirq time; this also updates p->stime for
     * ksoftirqd.
     */
    if (this_cpu_ksoftirqd() == p) {
        account_system_index_time(p, remaining, CPUTIME_SOFTIRQ);
        return;
    }

    if (user_tick) {
        account_user_time(p, remaining);
        return;
    }

    if (p == this_rq()->idle) {
        account_idle_time(remaining);
        return;
    }

    /* What is left is either guest time or system time. */
    if (p->flags & PF_VCPU) {
        account_guest_time(p, remaining);
        return;
    }

    account_system_index_time(p, remaining, CPUTIME_SYSTEM);
}

 

2. per-線程在各個頻點下駐留的時間

cpufreq_times 模塊代碼位於 drivers/cpufreq/cpufreq_times.c,它的更新涉及到 cpufreq driver 與 cputime 兩個模塊。當 cpufreq policy 頻率改變時,cpufreq driver 通過 cpufreq_notify_transition(普通調頻模式)或者 cpufreq_driver_fast_switch(快速調頻模式)調用 cpufreq_times_record_transition 函數,通知 cpufreq_times 模塊當前該 policy 處於哪一個頻點。當 cputime 模塊接收到 timer 中斷后,會調用 cpufreq_acct_update_power(),將該 tick 添加到 cpufreq_times 模塊當前任務及當前頻點的統計上。

 

3. per-cluster的在其各個頻點下駐留的時間

cpufreq_stats 模塊代碼位於 drivers/cpufreq/cpufreq_stats.c。它的更新有些類似於 cpufreq_times, 但與其不同的是只涉及 cpufreq driver 一個外部模塊。當 cpufreq policy 頻率改變時,cpufreq driver 通過 cpufreq_notify_transition(普通調頻模式)或者 cpufreq_driver_fast_switch(快速調頻模式)調用 cpufreq_stats_record_transition 函數,通知 cpufreq_stats 模塊此刻發生調頻以及要切換到哪一個目標頻點。cpufreq_stats 模塊則調用 cpufreq_stats_update 獲取當前 jiffies, 並與上一次更新時的 jiffies 相減,最后將差值添加到上個頻點的時間統計中:

//drivers\cpufreq\cpufreq_stats.c
/*
 * Add the jiffies elapsed since @time to the time_in_state slot selected by
 * stats->last_index, then store the current jiffies in stats->last_time.
 * @stats: statistics object to update
 * @time: 64-bit jiffies value the elapsed interval is measured from
 */
static void cpufreq_stats_update(struct cpufreq_stats *stats, unsigned long long time)
{
    const unsigned long long now = get_jiffies_64();
    const unsigned long long elapsed = now - time;

    stats->time_in_state[stats->last_index] += elapsed;
    stats->last_time = now;
}

 

4. per-cpu的cpuidle time

cpuidle time 模塊代碼在 drivers/cpuidle/cpuidle.c。當某個 cpu runqueue 上沒有 runnable 狀態的任務時,該cpu調度到idle進程,經過層層調用,最后執行到 cpuidle_enter_state()函數。

/**
 * cpuidle_enter_state - enter the state and update stats
 * @dev: cpuidle device for this cpu
 * @drv: cpuidle driver for this cpu
 * @index: index into the states table in @drv of the state to enter
 *
 * Abridged excerpt: each "..." line stands for code left out of the quote,
 * so declarations such as those of diff and target_state are not shown.
 */
int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) //drivers/cpuidle/cpuidle.c
{
    int entered_state;
    ktime_t time_start, time_end;
    
    ...
    /* Timestamp taken before the idle state is entered. */
    time_start = ns_to_ktime(local_clock());
    ...
    /* Enter the idle state through the target state's ->enter() callback. */
    entered_state = target_state->enter(dev, drv, index);
    ...
    /* Timestamp taken after the ->enter() callback has returned. */
    time_end = ns_to_ktime(local_clock());
    ...
    /* Interval between the two timestamps. */
    diff = ktime_sub(time_end, time_start);
    ...
    /*
     * Record the interval as the most recent residency and add it to the
     * accumulated time of the state reported by the ->enter() callback.
     */
    dev->last_residency_ns = diff;
    dev->states_usage[entered_state].time_ns += diff;
    ...
}

 

三、內核中打印時間

1. 內核中打印UTC時間:

/*
 * Log the current time of day as a UTC date/time string, tagged with
 * @annotation. Written against kernel-4.19.
 * @annotation: text inserted after "PM: wakeup_count" in the log line;
 *              only read, never modified
 *
 * NOTE(review): getnstimeofday()/rtc_time_to_tm() operate on struct timespec;
 * newer kernels presumably need ktime_get_real_ts64()/rtc_time64_to_tm()
 * with struct timespec64 instead — confirm against the target kernel.
 */
static void kernel_printk_utc_time(const char *annotation)
{
    struct timespec ts;
    struct rtc_time tm;

    getnstimeofday(&ts);
    rtc_time_to_tm(ts.tv_sec, &tm);

    /* tv_nsec is a signed long, so it is printed with %ld rather than %lu. */
    pr_info("PM: wakeup_count %s %d-%02d-%02d %02d:%02d:%02d.%09ld UTC\n",
        annotation, tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
        tm.tm_hour, tm.tm_min, tm.tm_sec, ts.tv_nsec);
}

 

2. kernel log中打印內核時間

/*
 * Format a timestamp as "[seconds.microseconds]" into @buf.
 * @ts: timestamp in nanoseconds (printk.c passes ts = local_clock())
 * @buf: destination buffer for the formatted text
 *
 * Returns the value returned by sprintf().
 */
static size_t print_time(u64 ts, char *buf)
{
    unsigned long nsec_rem;
    unsigned long usec;

    /* do_div() leaves the whole seconds in ts and yields the leftover nanoseconds. */
    nsec_rem = do_div(ts, 1000000000);
    usec = nsec_rem / 1000;

    return sprintf(buf, "[%5lu.%06lu]", (unsigned long)ts, usec);
}

printk()的打印路徑:

printk //printk.c
    vprintk_func
        vprintk_default
            vprintk_emit
                vprintk_store //printk.c
                    log_output //printk.c
                        log_store //ts_nsec參數傳0了,若傳個案發的時間將更精確一些

 

3. 獲取一段代碼的執行時間:

/* Sample local_clock() before and after the code being measured; the difference is its run time. */
t1 = local_clock();
/* ... code being measured ... */
t2 = local_clock();
delta_t = t2 - t1;

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM