Linux ftrace框架介紹及運用

本文轉載自查看原文 2017-12-30 22:39 7171 Linux相關學習總結

1. ftrace背景

2. 框架介紹

3. 主要代碼分析

4. ftrace的配置和使用

5. ftrace相關工具

在日常工作中，經常會需要對內核進行Debug、或者進行優化工作。一些簡單的問題，可以通過dmesg/printk查看，優化借助一些工具進行。

但是當問題邏輯復雜，優化面寬泛的時候，往往無從下手。

需要從上到下、模塊到模塊之間分析，這時候就不得不借助於Linux提供的靜態(Trace Event)動態(各種Tracer)進行分析。

同時還不得不借助工具、或者編寫腳本進行分析，以縮小問題范圍、發現問題。

簡單的使用Tracepoint已經不能滿足需求，因此就花點精力進行梳理。

1. ftrace背景

ftrace是Function Trace的意思，最開始主要用於記錄內核函數運行軌跡；隨着功能的逐漸增加，演變成一個跟蹤框架。

包含了靜態tracepoint，針對不同subsystem提供一個目錄進行開關；還包括不同的動態跟蹤器，function、function_graph、wakeup等等。

ftrace的幫助文檔在Documentation/trace，ftrace代碼主要在kernel/trace，ftrace相關頭文件在include/trace中。

WiKiPedia有關於ftrace的簡單介紹。

ftrace的作者在LinuxCon 2010有一篇關於Ftrace Linux Kernel Tracing的slides值得一讀。

2. ftrace框架介紹

整個ftrace框架可以分為幾部分：ftrace核心框架，RingBuffer，debugfs，Tracepoint，各種Tracer。

ftrace框架是整個ftrace功能的紐帶，包括對內和的修改，Tracer的注冊，RingBuffer的控制等等。

RingBuffer是靜態動態ftrace的載體。

debugfs則提供了用戶空間對ftrace設置接口。

Tracepoint是靜態trace，他需要提前編譯進內核;可以定制打印內容，自由添加;並且內核對主要subsystem提供了Tracepoint。

Tracer有很多種，主要幾大類：

函數類：function， function_graph， stack

延時類：irqsoff， preemptoff， preemptirqsoff， wakeup， wakeup_rt， waktup_dl

其他類：nop， mmiotrace， blk

ftrace 組成

2.1 ftrace核心初始化

trace.c是ftrace的核心，包括三個initcall：tracer_alloc_buffers、trace_init_debugfs、clear_boot_tracer。

start_kernel-->
    trace_init-->
        tracer_alloc_buffers--------------------分配ftrace需要的RingBuffer
            register_tracer(&nop_trace)---------默認nop跟蹤器
        trace_event_init------------------------創建靜態Trace Event，進行初始化。


fs_initcall(tracer_init_debugfs);-->
    tracing_init_dentry--------------------------在sys/kernel/debug下，創建tracing目錄
    init_tracer_debugfs--------------------------
        ftrace_create_function_files-------------創建主要的tracing目錄下節點
            allocate_ftrace_ops------------------function_trace_call
            ftrace_create_filter_files-----------創建function tracer相關的節點set_ftrace_filter/set_ftrace_notrace
    trace_create_file----------------------------創建saved_cmdlines等
    create_trace_instances-----------------------創建tracing/instances/目錄
    create_trace_options_dir---------------------創建tracing/optoins/目錄
    tracing_init_debugfs_percpu------------------創建per_cpu目錄
late_initcall(clear_boot_tracer);

tracer_alloc_buffers主要申請一個最小1KB的RingBuffer，然后注冊一些Notifier和初始化一些列表。

__init static int tracer_alloc_buffers(void)
{
...
    /* To save memory, keep the ring buffer size to its minimum */
    if (ring_buffer_expanded)----------------------------------------------在初始化的時候，分配一個最小量
        ring_buf_size = trace_buf_size;
    else
        ring_buf_size = 1;
...
    /* TODO: make the number of buffers hot pluggable with CPUS */
    if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) {--------分配RingBuffer內存
        printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
        WARN_ON(1);
        goto out_free_savedcmd;
    }
...
    /*
     * register_tracer() might reference current_trace, so it
     * needs to be set before we register anything. This is
     * just a bootstrap of current_trace anyway.
     */
    global_trace.current_trace = &nop_trace;----------------------------默認nop跟蹤器

    global_trace.max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;

    ftrace_init_global_array_ops(&global_trace);

    register_tracer(&nop_trace);----------------------------------------注冊nop跟蹤器 /* All seems OK, enable tracing */
    tracing_disabled = 0;

    atomic_notifier_chain_register(&panic_notifier_list,
                       &trace_panic_notifier);

    register_die_notifier(&trace_die_notifier);

    global_trace.flags = TRACE_ARRAY_FL_GLOBAL;

    INIT_LIST_HEAD(&global_trace.systems);
    INIT_LIST_HEAD(&global_trace.events);
    list_add(&global_trace.list, &ftrace_trace_arrays);

    while (trace_boot_options) {
        char *option;

        option = strsep(&trace_boot_options, ",");
        trace_set_options(&global_trace, option);
    }

    register_snapshot_cmd();------------------------------------------創建snapshot節點 ...
}

2.2 trace event

在trace_init-->trace_event_init-->event_trace_enable中，已經創建了tracing/events下的節點，並且做好了准備工作。

event_trace_enable被初次調用的地方很靠前，甚至在pid 1之前。

void __init trace_event_init(void)
{
    event_trace_memsetup();------------------------------------------創建field_cachep、file_cachep高速緩存
    init_ftrace_syscalls();------------------------------------------對所有系統調用如后保存在syscalls_metadata
    event_trace_enable();--------------------------------------------在系統啟動階段初始化Trace Event，在debug創建后在附着到上面
}

static __init int event_trace_enable(void)
{
    struct trace_array *tr = top_trace_array();
    struct ftrace_event_call **iter, *call;
    int ret;

    if (!tr)
        return -ENODEV;

    for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) {----------遍歷所有的Events，放入ftrace_events鏈表

        call = *iter;
        ret = event_init(call);
        if (!ret)
            list_add(&call->list, &ftrace_events);
    }

    /*
     * We need the top trace array to have a working set of trace
     * points at early init, before the debug files and directories
     * are created. Create the file entries now, and attach them
     * to the actual file dentries later.
     */
    __trace_early_add_events(tr);

    early_enable_events(tr, false);

    trace_printk_start_comm();

    register_event_cmds();--------------------------------------------------------注冊enable_event、disable_event兩個命令

    register_trigger_cmds();------------------------------------------------------注冊traceon、traceoff、snapshot等命令 return 0;
}

其他Events相關初始化如下：

early_initcall(event_trace_enable_again);----------在trace_event_init已經調用過early_enable_events，這里在early_initcall再次使能。

fs_initcall(event_trace_init);---------------------創建available_events節點

2.3 function跟蹤器

function tracer的初始化通過init_function_trace建立。

core_initcall(init_function_trace);

init_function_trace在init_func_cmd_traceon中，創建了一系列命令：traceon/traceoff/stacktrace/dump/cpudump。然后是注冊function_trace這個tracer。

Tracer結構體如下:

static struct tracer function_trace __tracer_data =
{
    .name        = "function",
    .init        = function_trace_init,--------------通過echo function > current_tracer觸發
    .reset        = function_trace_reset,------------通過echo 1 >tracing_on觸發
    .start        = function_trace_start,------------通過echo 0 >tracing_off觸發
    .flags        = &func_flags,
    .set_flag    = func_set_flag,
    .allow_instances = true,
#ifdef CONFIG_FTRACE_SELFTEST
    .selftest    = trace_selftest_startup_function,
#endif
}

function_trace_init進行function tracer相關設置。

2.4 function_graph跟蹤器

注冊function_graph跟蹤器，創建max_graph_depth節點。

static struct tracer graph_trace __tracer_data = {
    .name        = "function_graph",
    .update_thresh    = graph_trace_update_thresh,
    .open        = graph_trace_open,
    .pipe_open    = graph_trace_open,
    .close        = graph_trace_close,
    .pipe_close    = graph_trace_close,
    .init        = graph_trace_init,
    .reset        = graph_trace_reset,
    .print_line    = print_graph_function,
    .print_header    = print_graph_headers,
    .flags        = &tracer_flags,
    .set_flag    = func_graph_set_flag,
#ifdef CONFIG_FTRACE_SELFTEST
    .selftest    = trace_selftest_startup_function_graph,
#endif
};

static const struct file_operations graph_depth_fops = {
    .open        = tracing_open_generic,
    .write        = graph_depth_write,
    .read        = graph_depth_read,
    .llseek        = generic_file_llseek,
};

static __init int init_graph_debugfs(void)
{
    struct dentry *d_tracer;

    d_tracer = tracing_init_dentry();
    if (IS_ERR(d_tracer))
        return 0;

    trace_create_file("max_graph_depth", 0644, d_tracer,
              NULL, &graph_depth_fops);

    return 0;
}
fs_initcall(init_graph_debugfs);

static __init int init_graph_trace(void)
{
    max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);

    if (!register_ftrace_event(&graph_trace_entry_event)) {
        pr_warning("Warning: could not register graph trace events\n");
        return 1;
    }

    if (!register_ftrace_event(&graph_trace_ret_event)) {
        pr_warning("Warning: could not register graph trace events\n");
        return 1;
    }

    return register_tracer(&graph_trace);--------------------------------------注冊function_graph跟蹤器
}

core_initcall(init_graph_trace);

2.5 irqoff/preemptoff/preemptirqoff跟蹤器

irqoff/preemptoff/preemptirqoff：注冊irqsoff、preemptoff、preemptirqsoff三個跟蹤器。

core_initcall(init_irqsoff_tracer);

2.6 wakeup跟蹤器

注冊wakeup、wakeup_rt、wakeup_dl三個跟蹤器。

core_initcall(init_wakeup_tracer);

2.7 mmiotrace跟蹤器

mmiotrace：注冊mmiotrace跟蹤器

device_initcall(init_mmio_trace);

2.8 branch跟蹤器

branch：注冊branch跟蹤器，和branch_annotated、branch_all兩個統計信息跟蹤器。

core_initcall(init_branch_tracer);
fs_initcall(init_annotated_branch_stats);
fs_initcall(all_annotated_branch_stats);

2.9 blk跟蹤器

blk：注冊blk跟蹤器。

device_initcall(init_blk_tracer);

trace_printk：

fs_initcall(init_trace_printk_function_export);
early_initcall(init_trace_printk);

stack_trace：

device_initcall(stack_trace_init);

kprobe/upbobe：

fs_initcall(init_kprobe_trace);
fs_initcall(init_uprobe_trace);

3. 主要代碼分析

3.1 內核模塊初始化順序

內核中不同功能需要有序初始化，但是相同等級的順序是沒有保證的。

Linux ftrace相關的模塊眾多，使用了不同等級的initcall。

從下面的定義可以看出他們在內核啟動時的調用順序，模塊等級在include/linux/init.h中定義：

#define __define_initcall(level,fn,id) \
    static initcall_t __initcall_##fn##id __used \
    __attribute__((__section__(".initcall" level ".init"))) = fn

/*
 * Early initcalls run before initializing SMP.
 *
 * Only for built-in code, not modules.
 */
#define early_initcall(fn)        __define_initcall("early",fn,early)--------------------所謂的early就是在初始化SMP之前調用

/*
 * A "pure" initcall has no dependencies on anything else, and purely
 * initializes variables that couldn't be statically initialized.
 *
 * This only exists for built-in code, not for modules.
 */
#define pure_initcall(fn)        __define_initcall("0",fn,0)

#define core_initcall(fn)        __define_initcall("1",fn,1)
#define core_initcall_sync(fn)        __define_initcall("1s",fn,1s)
#define postcore_initcall(fn)        __define_initcall("2",fn,2)
#define postcore_initcall_sync(fn)    __define_initcall("2s",fn,2s)
#define arch_initcall(fn)        __define_initcall("3",fn,3)
#define arch_initcall_sync(fn)        __define_initcall("3s",fn,3s)
#define subsys_initcall(fn)        __define_initcall("4",fn,4)
#define subsys_initcall_sync(fn)    __define_initcall("4s",fn,4s)
#define fs_initcall(fn)            __define_initcall("5",fn,5)
#define fs_initcall_sync(fn)        __define_initcall("5s",fn,5s)
#define rootfs_initcall(fn)        __define_initcall("rootfs",fn,rootfs)
#define device_initcall(fn)        __define_initcall("6",fn,6)----------------------------對應module_init
#define device_initcall_sync(fn)    __define_initcall("6s",fn,6s)
#define late_initcall(fn)        __define_initcall("7",fn,7)
#define late_initcall_sync(fn)        __define_initcall("7s",fn,7s)

3.2 ftrace初始化

3.3 RingBuffer

4. ftrace的配置和使用

/sys/kernel/debug/tracing目錄下提供了ftrace的設置和屬性接口，對ftrace的配置可以通過echo。了解每個文件的作用和如何設置對於理解整個ftrace框架很有作用。

下面是Ubuntu 16.04+Kernel 4.10.0-42-generic的/sys/kernel/debug/tracing目錄：

README------------------------一個簡單的關於Tracepoing的HOWTO，cat讀取，echo設置。

通用配置： available_tracers-------------當前編譯及內核的跟蹤器列表，current_tracer必須是這里面支持的跟蹤器。

current_tracer----------------用於設置或者顯示當前使用的跟蹤器列表。系統啟動缺省值為nop，使用echo將跟蹤器名字寫入即可打開。可以通過寫入nop重置跟蹤器。


buffer_size_kb----------------用於設置單個CPU所使用的跟蹤緩存的大小。跟蹤緩存為RingBuffer形式，如果跟蹤太多，舊的信息會被新的跟蹤信息覆蓋掉。需要先將current_trace設置為nop才可以。
buffer_total_size_kb----------顯示所有的跟蹤緩存大小，不同之處在於buffer_size_kb是單個CPU的，buffer_total_size_kb是所有CPU的和。

free_buffer-------------------此文件用於在一個進程被關閉后，同時釋放RingBuffer內存，並將調整大小到最小值。
hwlat_detector/
instances/--------------------創建不同的trace buffer實例，可以在不同的trace buffers中分開記錄。

tracing_cpumask---------------可以通過此文件設置允許跟蹤特定CPU，二進制格式。
per_cpu-----------------------CPU相關的trace信息，包括stats、trace、trace_pipe和trace_pipe_raw。
　　　　　　　　　　　　　　　　　　stats：當前CPU的trace統計信息
　　　　　　　　　　　　　　　　　　trace：當前CPU的trace文件。
　　　　　　　　　　　　　　　　　　trace_pipe：當前CPU的trace_pipe文件。
printk_formats----------------提供給工具讀取原始格式trace的文件。
saved_cmdlines----------------存放pid對應的comm名稱作為ftrace的cache，這樣ftrace中不光能顯示pid還能顯示comm。
saved_cmdlines_size-----------saved_cmdlines的數目
snapshot----------------------是對trace的snapshot。
                              echo 0清空緩存，並釋放對應內存。
                              echo 1進行對當前trace進行snapshot，如沒有內存則分配。
                              echo 2清空緩存，不釋放也不分配內存。


trace-------------------------查看獲取到的跟蹤信息的接口，echo > trace可以清空當前RingBuffer。

trace_pipe--------------------輸出和trace一樣的內容，但是此文件輸出Trace同時將RingBuffer中的內容刪除，這樣就避免了RingBuffer的溢出。可以通過cat trace_pipe > trace.txt &保存文件。


trace_clock-------------------顯示當前Trace的timestamp所基於的時鍾，默認使用local時鍾。local：默認時鍾；可能無法在不同CPU間同步；global：不同CUP間同步，但是可能比local慢；counter：這是一個跨CPU計數器，需要分析不同CPU間event順序比較有效。
trace_marker------------------從用戶空間寫入標記到trace中，用於用戶空間行為和內核時間同步。
trace_marker_raw--------------以二進制格式寫入到trace中。


trace_options-----------------控制Trace打印內容或者操作跟蹤器，可以通過trace_options添加很多附加信息。
options-----------------------trace選項的一系列文件，和trace_options對應。


trace_stat/-------------------每個CPU的Trace統計信息
tracing_max_latency-----------記錄Tracer的最大延時，
tracing_on--------------------用於控制跟蹤打開或停止，0停止跟蹤，1繼續跟蹤。
tracing_thresh----------------延時記錄Trace的閾值，當延時超過此值時才開始記錄Trace。單位是ms，只有非0才起作用。

Events配置： available_events--------------列出系統中所有可用的Trace events，分兩個層級，用冒號隔開。
events/-----------------------系統Trace events目錄，在每個events下面都有enable、filter和fotmat。enable是開關；format是events的格式，然后根據格式設置 filter。
set_event---------------------將Trace events名稱直接寫入set_event就可以打開。
set_event_pid-----------------指定追蹤特定進程的events。

Function配置： available_filter_functions----記錄了當前可以跟蹤的內核函數，不在該文件中列出的函數，無法跟蹤其活動。
dyn_ftrace_total_info---------顯示available_filter_functins中跟中函數的數目，兩者一致。
enabled_functions-------------顯示有回調附着的函數名稱。
function_profile_enabled------打開此選項，在trace_stat中就會顯示function的統計信息。
set_ftrace_filter-------------用於顯示指定要跟蹤的函數
set_ftrace_notrace------------用於指定不跟蹤的函數，缺省為空。
set_ftrace_pid----------------用於指定要追蹤特定進程的函數。

Function graph配置： max_graph_depth---------------函數嵌套的最大深度。
set_graph_function------------設置要清晰顯示調用關系的函數，在使用function_graph跟蹤器是使用，缺省對所有函數都生成調用關系。
set_graph_notrace-------------不跟蹤特定的函數嵌套調用。

Stack trace設置： stack_max_size----------------當使用stack跟蹤器時，記錄產生過的最大stack size
stack_trace-------------------顯示stack的back trace
stack_trace_filter------------設置stack tracer不檢查的函數名稱

Kernel dynamic events:
kprobe_events
kprobe_profile

Userspace dynamic events:
uprobe_events
uprobe_profile

4.1 通用配置

使能和配置大小

常用的配置有對Trace的開關(tracing_on)

echo 0/1 > /sys/kernel/debug/tracing/tracing_on

設置RingBuffer大小(buffer_size_kb),同時buffer_total_size_kb就變成NR_CPUS的倍數。

trace、trace_pipe和snapshot的區別？

trace是從RingBuffer中取出內容，trace_pipe會一直讀取Buffer流。

snapshot是trace的一個瞬間快照：

echo 0 > snapshot : Clears and frees snapshot buffer
echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.
                      Takes a snapshot of the main buffer.
echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)
                      (Doesn't have to be '2' works with any number that
                       is not a '0' or '1')

Tracer

從available_tracers可以獲取系統支持的Tracer,current_tracer是當前使用中的Tracer。

Events只有在nop tracer下才會起作用，同時多個tracer不能共享。同一時候只能一個Tracer在生效。

cat available_tracers

hwlat blk mmiotrace function_graph wakeup_dl wakeup_rt wakeup function nop

echo function > current_tracer

instances

在instances目錄下，可以通過mkdir創建instance，rmdir刪除instance。

新目錄下，有很多類似tracing下的文件，可以對其進行配置。然后讀取專有的trace/trace_pipe。

mkdir foo

ls foo

available_tracers     options             set_ftrace_pid    trace_options
buffer_size_kb        per_cpu             snapshot          trace_pipe
buffer_total_size_kb set_event           trace             tracing_cpumask
current_tracer        set_event_pid       trace_clock       tracing_max_latency
events                set_ftrace_filter   trace_marker      tracing_on
free_buffer           set_ftrace_notrace trace_marker_raw

rmdir foo

特定CPU信息

抓取特定CPU信息0~3：

echo f > tracing_cpumask

查看特定CPU信息：

cat per_cpu/cpu3/trace

用戶空間插入Trace標記

有時候需要往Trace中插入標記，trace_marker/trace_marker_raw提供了這樣功能。

echo CAPTURE_START > trace_marker

echo CAPTURE_START > trace_marker_raw

Trace選項設置

通過options內容設置，對Trace的輸出進行定制，控制輸出大小。

trace_option是options設置的結果，可以看出開了哪些選項，關閉了哪些選項。

echo 0/1 > options/irq-info

4.2 Tracepoint

可以對系統特定事件進行跟蹤，在available_events可以找到所有事件。然后將需要的時間通過echo xxx >> set_event寫入。也可以通過events目錄來打開。

4.2.1 Trace Events生效條件

在current_tracer為nop，然后設置tracing/events下面的enable，即可通過tracing/trace或者tracing/trace_pipe查看內容。

4.2.2 Trace Events的過濾功能？

對特定Events的過濾，只打開需要監控的Events。

如果要禁用某個時間在事件前加上！，如echo "!sys_enter_nic" >> set_event。

echo net >set_event--------------------------------打開所有net目錄下的事件

echo skb >>set_event------------------------------附加設置skb到目錄下

過濾的表達式是：field-name relational-operator value，多表達式可以通過邏輯運算符&&或者||來組合。

數字可以通過==、！=、>、<、&&、||等等來組合filter，來過濾掉很多不需要信息。

文字可以通過==、!=、~l來組合filter。

如針對timer_start事件的Trace：

# tracer: nop
#
# entries-in-buffer/entries-written: 327/327   #P:4
#
#                              _-----=> irqs-off
#                             / _----=> need-resched
#                            | / _---=> hardirq/softirq
#                            || / _--=> preempt-depth
#                            ||| /     delay
#           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
#              | |       |   ||||       |         |
            Xorg-1684  [000] d... 1140567.102754: timer_start: timer=ffff8800b20300f0 function=intel_uncore_fw_release_timer [i915] expires=4579967822 [timeout=1] flags=0x00000001
      nxnode.bin-11900 [000] d.s. 1140567.105061: timer_start: timer=ffff8803aa293508 function=intel_pstate_timer_func expires=4579967825 [timeout=3] flags=0x00100000
          <idle>-0     [000] dNs. 1140567.108404: timer_start: timer=ffff8802cf845698 function=tcp_delack_timer expires=4579967832 [timeout=10] flags=0x00000000

在events目錄下，有很多子目錄。這些目錄里面可以使用filter，過濾很多不需要的信息。

通過format可以知道，timer_start這個時間的field名稱，然后在filter進行設置。

針對某一pid進行過濾。

將對應pid寫入set_event_pid，就可達到只監控某個進程的Events。

清空跟蹤器

對子系統的filter寫入0，即可清空整個子系統的filter

echo 0 > filter

4.2.3 Events的trigger功能

enable_event/disable_event

stacktrace

snapshot

traceon/traceoff

4.3 如何在Linux command line啟動Events

4.3 function跟蹤器及動態ftrace

function跟蹤器可以用於跟蹤內核函數的調用情況，用於調試分析bug或者了解內核運行過程。

4.3.1 打開Function跟蹤器

echo function > /sys/kernel/debug/tracing/current_tracer

4.3.2 在trace_stat中顯示function的統計信息

4.3.2.1 trace_stat的使用

trace_stat/function0在系統初始化就創建，通過function_profile_enabled進行開關。

統計的函數在set_ftrace_filter和set_ftrace_notrace中設置。

echo 0/1 > function_profile_enabled

在使能function優化功能之后，可以查看不同CPU下每個函數執行時間統計信息。

每列表示的內容分別是：函數名稱、調用次數、總耗時、平均耗時、耗時均方差。

cat trace_stat/function0

Function                               Hit    Time            Avg             s^2
   --------                               ---    ----            ---             ---
   schedule                             65154    1721953948 us     26428.98 us     940990.8 us
   schedule_hrtimeout_range              9655    920051856 us     95292.78 us     1284647 us
   schedule_hrtimeout_range_clock        9655    920046552 us     95292.23 us     1139416 us
   poll_schedule_timeout                 5562    768940036 us     138248.8 us     13748771 us
   do_sys_poll                           9188    485558063 us     52846.98 us     6242561 us
   SyS_poll                              8419    469986128 us     55824.45 us     6183267 us
   core_sys_select                       1853    283704721 us     153105.6 us     2109401 us
   do_select                             1853    283689546 us     153097.4 us     3187699 us
   SyS_futex                            40313    260910692 us     6472.122 us     8796046 us
   do_futex                             40313    260888660 us     6471.576 us     8397810 us
   futex_wait                           18824    260325650 us     13829.45 us     4756645 us

4.3.2.2 通用tracer_stat注冊register_stat_tracer()

register_stat_tracer()->init_stat_file()->

static const struct file_operations tracing_stat_fops = {
    .open        = tracing_stat_open, .read = seq_read, .llseek = seq_lseek, .release = tracing_stat_release }; static int init_stat_file(struct stat_session *session) { if (!stat_dir && tracing_stat_init()) return -ENODEV; session->file = debugfs_create_file(session->ts->name, 0644, stat_dir, session, &tracing_stat_fops); ... } static int tracing_stat_open(struct inode *inode, struct file *file) { ... ret = stat_seq_init(session); if (ret) return ret; ret = seq_open(file, &trace_stat_seq_ops); ... } static const struct seq_operations trace_stat_seq_ops = { .start = stat_seq_start, .next = stat_seq_next, .stop = stat_seq_stop, .show = stat_seq_show }; static int stat_seq_show(struct seq_file *s, void *v) { ... return session->ts->stat_show(s, l->stat); }

4.3.2.3 function trace_stat流程

trace_stat/function0相關代碼流程如下：

ftrace_init_debugfs()->ftrace_profile_debugfs()->register_stat_tracer()

static const struct file_operations ftrace_profile_fops = {
    .open        = tracing_open_generic,
    .read        = ftrace_profile_read,
    .write        = ftrace_profile_write,
    .llseek        = default_llseek,
};

/* used to initialize the real stat files */
static struct tracer_stat function_stats __initdata = {
    .name        = "functions",
    .stat_start    = function_stat_start,
    .stat_next    = function_stat_next,
    .stat_cmp    = function_stat_cmp,
    .stat_headers    = function_stat_headers,
    .stat_show    = function_stat_show
};

static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
{
    struct ftrace_profile_stat *stat;
    struct dentry *entry;
    char *name;
    int ret;
    int cpu;

    for_each_possible_cpu(cpu) {
        stat = &per_cpu(ftrace_profile_stats, cpu);

        /* allocate enough for function name + cpu number */
        name = kmalloc(32, GFP_KERNEL);
...
        stat->stat = function_stats;
        snprintf(name, 32, "function%d", cpu);----------------------會在trace_stat目錄下創建function0節點，0表示CPU序號。
        stat->stat.name = name;
        ret = register_stat_tracer(&stat->stat);--------------------注冊function_stats ...
    }

    entry = debugfs_create_file("function_profile_enabled", 0644,
                    d_tracer, NULL, &ftrace_profile_fops);----------創建function_profile_enabled節點，函數集為ftrace_profile_fops ...
}

寫function_profile_enabled觸發代碼流程：

static ssize_t
ftrace_profile_write(struct file *filp, const char __user *ubuf,
             size_t cnt, loff_t *ppos)
{
    unsigned long val;
    int ret;

    ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
    if (ret)
        return ret;

    val = !!val;

    mutex_lock(&ftrace_profile_lock);
    if (ftrace_profile_enabled ^ val) {
        if (val) {
            ret = ftrace_profile_init();
            if (ret < 0) {
                cnt = ret;
                goto out;
            }

            ret = register_ftrace_profiler();
            if (ret < 0) {
                cnt = ret;
                goto out;
            }
            ftrace_profile_enabled = 1;
        } else {
            ftrace_profile_enabled = 0;
            /*
             * unregister_ftrace_profiler calls stop_machine
             * so this acts like an synchronize_sched.
             */
            unregister_ftrace_profiler();
        }
    }
 out:
    mutex_unlock(&ftrace_profile_lock);

    *ppos += cnt;

    return cnt;
}

static int register_ftrace_profiler(void)
{
    return register_ftrace_graph(&profile_graph_return,
                     &profile_graph_entry);
}

static void unregister_ftrace_profiler(void)
{
    unregister_ftrace_graph();
}

顯示profile結果

static int function_stat_show(struct seq_file *m, void *v)
{
    struct ftrace_profile *rec = v;
    char str[KSYM_SYMBOL_LEN];
    int ret = 0;
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
    static struct trace_seq s;
    unsigned long long avg;
    unsigned long long stddev;
#endif
    mutex_lock(&ftrace_profile_lock);

    /* we raced with function_profile_reset() */
    if (unlikely(rec->counter == 0)) {
        ret = -EBUSY;
        goto out;
    }

    kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
    seq_printf(m, "  %-30.30s  %10lu", str, rec->counter);

#ifdef CONFIG_FUNCTION_GRAPH_TRACER
    seq_printf(m, "    ");
    avg = rec->time;
    do_div(avg, rec->counter);

    /* Sample standard deviation (s^2) */
    if (rec->counter <= 1)
        stddev = 0;
    else {
        stddev = rec->time_squared - rec->counter * avg * avg;
        /*
         * Divide only 1000 for ns^2 -> us^2 conversion.
         * trace_print_graph_duration will divide 1000 again.
         */
        do_div(stddev, (rec->counter - 1) * 1000);
    }

    trace_seq_init(&s);
    trace_print_graph_duration(rec->time, &s);
    trace_seq_puts(&s, "    ");
    trace_print_graph_duration(avg, &s);
    trace_seq_puts(&s, "    ");
    trace_print_graph_duration(stddev, &s);
    trace_print_seq(m, &s);
#endif
    seq_putc(m, '\n');
out:
    mutex_unlock(&ftrace_profile_lock);

    return ret;
}

4.3.3 function跟蹤器的過濾器

在打開CONFIG_DYNAMIC_FTRACE的情況下，增加一些動態跟蹤功能，比如available_filter_functions、set_ftrace_filter、set_ftrace_notrace。

4.3.3.1 set_ftrace_filter跟蹤某些函數

1. 默認情況下set_ftrace_filter是全部函數都開的。
   cat set_ftrace_filter如下：
   #### all functions enabled ####

2. 如果想只監控某些函數，通過echo mod_timer add_timer > set_ftrace_filter即可。
   cat set_ftrace_filter如下：
   mod_timer
   add_timer
   如果要附加function，通過echo xxx >> set_ftrace_filter即可。
3. 使用通配符*
　　echo "sched*" > set_ftrace_filter----------------選擇所有以sched開頭的函數
　　echo "*sched*" > set_ftrace_filter---------------選擇所有包含sched的函數
　　echo "*sched" > set_ftrace_filter----------------選擇所有以sched結尾的函數

4. 如果想恢復全開，只需要echo > set_ftrace_filter，即清空filter。

4.3.3.2 set_ftrace_notrace指定不跟蹤哪個函數

echo xxx > set_ftrace_notrace

4.3.3.3 set_ftrace_pid只跟蹤某一個進程

echo xxx > set_ftrace_pid

通過cat trace可以得到結果，只提供了trace函數的調用者。

# tracer: function
#
# entries-in-buffer/entries-written: 5/5   #P:1
#
#                              _-----=> irqs-off
#                             / _----=> need-resched
#                            | / _---=> hardirq/softirq
#                            || / _--=> preempt-depth
#                            ||| /     delay
#           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
#              | |       |   ||||       |         |
            adbd-1243  [000] d...  7436.183990: mod_timer <-DWC_TIMER_SCHEDULE
          <idle>-0     [000] ..s.  7437.264984: mod_timer <-br_hello_timer_expired
            adbd-1243  [000] d...  7438.000336: mod_timer <-DWC_TIMER_SCHEDULE
            adbd-1243  [000] d...  7438.000549: mod_timer <-DWC_TIMER_SCHEDULE
            adbd-1243  [000] d...  7438.003876: mod_timer <-DWC_TIMER_SCHEDULE

4.3.4 都有哪些函數可以跟蹤(available_filter_functions)?

ftrace_init初始化：

extern unsigned long __start_mcount_loc[];
extern unsigned long __stop_mcount_loc[];

void __init ftrace_init(void)
{
    unsigned long count, addr, flags;
    int ret;

    /* Keep the ftrace pointer to the stub */
    addr = (unsigned long)ftrace_stub;

    local_irq_save(flags);
    ftrace_dyn_arch_init(&addr);
    local_irq_restore(flags);

    /* ftrace_dyn_arch_init places the return code in addr */
    if (addr)
        goto failed;

    count = __stop_mcount_loc - __start_mcount_loc;

    ret = ftrace_dyn_table_alloc(count);
    if (ret)
        goto failed;

    last_ftrace_enabled = ftrace_enabled = 1;

    ret = ftrace_process_locs(NULL,
                  __start_mcount_loc,
                  __stop_mcount_loc);

    ret = register_module_notifier(&ftrace_module_exit_nb);
    if (ret)
        pr_warning("Failed to register trace ftrace module exit notifier\n");

    set_ftrace_early_filters();

    return;
 failed:
    ftrace_disabled = 1;
}

static int ftrace_process_locs(struct module *mod,
                   unsigned long *start,
                   unsigned long *end)
{
    struct ftrace_page *pg;
    unsigned long count;
    unsigned long *p;
    unsigned long addr;
    unsigned long flags = 0; /* Shut up gcc */
    int ret = -ENOMEM;

    count = end - start;

    if (!count)
        return 0;

    pg = ftrace_allocate_pages(count);
    if (!pg)
        return -ENOMEM;

    mutex_lock(&ftrace_lock);

    /*
     * Core and each module needs their own pages, as
     * modules will free them when they are removed.
     * Force a new page to be allocated for modules.
     */
    if (!mod) {
        WARN_ON(ftrace_pages || ftrace_pages_start);
        /* First initialization */
        ftrace_pages = ftrace_pages_start = pg;
    } else {
        if (!ftrace_pages)
            goto out;

        if (WARN_ON(ftrace_pages->next)) {
            /* Hmm, we have free pages? */
            while (ftrace_pages->next)
                ftrace_pages = ftrace_pages->next;
        }

        ftrace_pages->next = pg;
        ftrace_pages = pg;
    }

    p = start;
    while (p < end) {
        addr = ftrace_call_adjust(*p++);
        /*
         * Some architecture linkers will pad between
         * the different mcount_loc sections of different
         * object files to satisfy alignments.
         * Skip any NULL pointers.
         */
        if (!addr)
            continue;
        if (!ftrace_record_ip(addr))
            break;
    }

    /* These new locations need to be initialized */
    ftrace_new_pgs = pg;

    /* Make each individual set of pages sorted by ips */
    for (; pg; pg = pg->next)
        sort(pg->records, pg->index, sizeof(struct dyn_ftrace),
             ftrace_cmp_recs, ftrace_swap_recs);

    /*
     * We only need to disable interrupts on start up
     * because we are modifying code that an interrupt
     * may execute, and the modification is not atomic.
     * But for modules, nothing runs the code we modify
     * until we are finished with it, and there's no
     * reason to cause large interrupt latencies while we do it.
     */
    if (!mod)
        local_irq_save(flags);
    ftrace_update_code(mod);
    if (!mod)
        local_irq_restore(flags);
    ret = 0;
 out:
    mutex_unlock(&ftrace_lock);

    return ret;
}

static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
{

    trace_create_file("available_filter_functions", 0444,
            d_tracer, NULL, &ftrace_avail_fops);

    trace_create_file("enabled_functions", 0444,
            d_tracer, NULL, &ftrace_enabled_fops);

    trace_create_file("set_ftrace_filter", 0644, d_tracer,
            NULL, &ftrace_filter_fops);

    trace_create_file("set_ftrace_notrace", 0644, d_tracer,
                    NULL, &ftrace_notrace_fops);

#ifdef CONFIG_FUNCTION_GRAPH_TRACER
    trace_create_file("set_graph_function", 0444, d_tracer,
                    NULL,
                    &ftrace_graph_fops);
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */

    return 0;
}

static const struct file_operations ftrace_avail_fops = {
    .open = ftrace_avail_open,
    .read = seq_read,
    .llseek = seq_lseek,
    .release = seq_release_private,
};



static int ftrace_avail_open(struct inode *inode, struct file *file)
{
    struct ftrace_iterator *iter;
    int ret;

    if (unlikely(ftrace_disabled))
        return -ENODEV;

    iter = kzalloc(sizeof(*iter), GFP_KERNEL);
    if (!iter)
        return -ENOMEM;

    iter->pg = ftrace_pages_start;
    iter->ops = &global_ops;

    ret = seq_open(file, &show_ftrace_seq_ops);
    if (!ret) {
        struct seq_file *m = file->private_data;

        m->private = iter;
    } else {
        kfree(iter);
    }

    return ret;
}

4.4 function_graph跟蹤器

function_graph和function類似，但是function跟蹤器只在函數入口點探測，而function_graph在函數入口和退出都進行探測。

function_graph提供了類似C語言函數調用關系圖，並且記錄了函數執行耗時。

echo function_graph > /sys/kernel/debug/tracing/current_tracer。

function_graph沒有設置pid，但是可以設置跟蹤哪些函數，不跟蹤那些函數：

echo xxx > set_graph_function

echo xxx > set_graph_notrace

設置function_graph嵌套層數：

echo 10 > max_graph_depth

獲得的Trace如下，細節以及調用關系更明確。同時可以獲得函數耗時，這對於性能優化非常重要，可以輕松找出熱點區域。

 0)               |  SyS_futex() {
 0)               |    do_futex() {
 0)               |      futex_wake() {
 0)               |        get_futex_key() {
 0)   0.045 us    |          get_futex_key_refs.isra.13();
 0)   0.337 us    |        }
 0)   0.040 us    |        hash_futex();
 0)   0.051 us    |        _raw_spin_lock();
 0)               |        mark_wake_futex() {
 0)   0.061 us    |          wake_q_add();
 0)   0.082 us    |          __unqueue_futex();
 0)   0.674 us    |        }
 0)               |        wake_up_q() {
...
 0) + 14.680 us   |        }
 0)   0.033 us    |        drop_futex_key_refs.isra.14();
 0) + 17.200 us   |      }
 0) + 17.495 us   |    }
 0) + 17.752 us   |  }

抓取數據之后，可以通過Python腳本進行簡單的轉換放到SourceInsight查看更方便。

import re

output_file = 'output.c' input_file = 'trace.txt' input = open(input_file, 'rb') output = open(output_file, 'wb') for line in input: m = re.match('^ (?P<cpu>[0-9]).{3}(?P<duration>.*) *\| (?P<message>.*)', line) if(not m): continue #print m.group('cpu'), m.group('duration'), m.group('message') output.write(m.group('message') + "//" + m.group('duration') + "\n")

4.5 irqsoff/preemptoff/preemptirqsoff跟蹤器

4.5.1 中斷屏蔽和強占禁止帶來的危害

中斷屏蔽和強占禁止非常影響系統性能，所以對中斷屏蔽和強占禁止進行統計監控，發現異常點很有必要。

當中斷被屏蔽后，CPU無法響應外部事件(除了不可屏蔽中斷NMI和系統管理中斷SMI)。這就會阻止比如系統Tick中斷或者鍵盤中斷，導致響應時間變長。

同樣強占禁止，我們還可以收到中斷，但是任務強占被禁止導致更高優先級的任務得不到調度，直到強占被再次允許。

4.5.2 設置跟蹤器

echo 0 > options/function-trace
echo irqsoff > current_tracer
echo 1 > tracing_on
echo 0 > tracing_max_latency
 [...]
echo 0 > tracing_on
cat trace

結果如下：

# tracer: irqsoff
#
# irqsoff latency trace v1.1.5 on 4.0.0+
# --------------------------------------------------------------------
# latency: 11658 us, #4/4, CPU#2 | (M:server VP:0, KP:0, SP:0 HP:0 #P:4)
#    -----------------
#    | task: swapper/2-0 (uid:0 nice:0 policy:0 rt_prio:0)
#    -----------------
#  => started at: rcu_idle_enter
#  => ended at:   arch_cpu_idle---------------------記錄禁止中斷時間最長的開始和結束函數
#
#
#                  _------=> CPU#            
#                 / _-----=> irqs-off----------------d表示中斷被disabled，'.'表示中斷沒有被關閉。
#                | / _----=> need-resched------------N-表示need_resched被設置；'.'-表示need_resched沒有被設置，中斷返回不會進行進程切換。
#                || / _---=> hardirq/softirq---------H-表示softirq中發生了硬件中斷；h-硬件中斷；s-softirq；'.'-不在中斷上下文中。
#                ||| / _--=> preempt-depth-----------當搶占中斷勢能后，該域代表preempt_disabled的級別。
#                |||| /     delay            
#  cmd     pid   ||||| time  |   caller      ---------cmd-進程名，pid-進程id，time-表示trace從開始到當前的相對時間，delay-突出顯示那些有高延遲的地方以便引起注意。！表示需要引起注意。
#     \   /      |||||  \    |   /         
  <idle>-0       2d...    3us#: rcu_idle_enter
  <idle>-0       2d... 11645us+: arch_cpu_idle
  <idle>-0       2d... 11665us+: trace_hardirqs_on <-arch_cpu_idle
  <idle>-0       2d... 11753us : <stack trace>
 => cpu_startup_entry
 => secondary_start_kernel

察看禁止中斷最長函數：

static void cpuidle_idle_call(void)
{
...
    rcu_idle_enter();------------------屏蔽中斷
...
    if (current_clr_polling_and_test())
        local_irq_enable();
    else
        arch_cpu_idle();---------------開中斷
...
}

4.6 wakeup/wakeup_rt/wakeup_dl跟蹤器

wakeup類調度器記錄調度延時，也即從系統被喚醒到被調度到的延時。顯示的結果類似irqsoff跟蹤器。

echo 0 > options/function-trace
echo wakeup > current_tracer
echo 1 > tracing_on
echo 0 > tracing_max_latency
chrt -f 5 sleep 1
echo 0 > tracing_on
cat trace

wakeup：顯示進程從woken到wake up的延時，包括所有進程。

wakeup_dl：顯示SCHED_DEADLINE類型調度延時。

wakeup_rt：顯示實時進程的調度延時。

4.7 stack跟蹤器

stack跟蹤器用於追蹤內核棧的使用情況，它記錄了每個內核對棧的使用情況。

stack跟蹤器比較特殊，它的使能不在tracing目錄：

       Depth    Size   Location    (-1 entries)
        -----    ----   --------
#
#  Stack tracer disabled
#
# To enable the stack tracer, either add 'stacktrace' to the
# kernel command line
# or 'echo 1 > /proc/sys/kernel/stack_tracer_enabled'
#

然后通過/sys/kernel/debug/tracing/stack_trace可以查看堆棧軌跡。

        Depth    Size   Location    (13 entries)
        -----    ----   --------
  0)     1952       4   msecs_to_jiffies+0x14/0x34-----------------------一共1952個字節
  1)     1948      76   update_group_capacity+0x2c/0x2b4
  2)     1872     256   find_busiest_group+0x10c/0x904
  3)     1616     168   load_balance+0x170/0x7b0
  4)     1448     104   pick_next_task_fair+0x1a8/0x500
  5)     1344      28   __schedule+0x100/0x5b8
  6)     1316      68   schedule+0x4c/0xa4
  7)     1248     104   schedule_hrtimeout_range_clock+0x154/0x15c
  8)     1144      16   schedule_hrtimeout_range+0x1c/0x20
  9)     1128      24   poll_schedule_timeout+0x48/0x74
 10)     1104     968   do_sys_poll+0x3fc/0x4b8-------------------------使用了最大的棧空間
 11)      136      40   SyS_poll+0xc4/0x108
 12)       96      96   ret_fast_syscall+0x0/0x4c

4.8 其他Tracer

mmiotrace：Memory mapped IO

blk：

4.9 開機使用ftrace

存在某些情況，需要盡量早的啟動ftrace功能。

這時候就需要修改command line，在其中打開ftrace相關設置。

具體的設置有trace_event、trace_buf_size、ftrace、ftrace_notrace、ftrace_filter、ftrace_graph_filter、stacktrace、ftraceftrace_dump_on_oops、tracing_thresh。

ftrace用於設置tracer，trace_buf_size設置ring buffer大小，trace_event設置跟蹤哪些events，ftrace_notrace/ftrace_filter/ftrace_graph_filter都是設置過濾器。

4.9.1 trace_event設置trace events

在開機的時候設置需要跟蹤的trace events，將內容放入bootup_event_buf。

通常格式如下，以逗號作為分隔符：

trace_event=sched:sched_process_fork,irq:,thermal

然后在event_trace_init()中根據bootup_event_buf進行設置。

static __init int setup_trace_event(char *str)
{
    strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE);
    ring_buffer_expanded = 1;
    tracing_selftest_disabled = 1;

    return 1;
}
__setup("trace_event=", setup_trace_event);

static __init int event_trace_init(void)
{
    struct ftrace_event_call **call;
    struct dentry *d_tracer;
    struct dentry *entry;
    struct dentry *d_events;
    int ret;
    char *buf = bootup_event_buf;
    char *token;
...
    while (true) {
        token = strsep(&buf, ",");

        if (!token)
            break;
        if (!*token)
            continue;

        ret = ftrace_set_clr_event(token, 1);
        if (ret)
            pr_warning("Failed to enable trace event: %s\n", token);
    }
...
    return 0;
}

4.9.2 ftrace相關設置

在command line中設置ftrace可以在開機時啟動指定tracer。

指定tracer在register_tracer()中進行。

static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
static char *default_bootup_tracer;

static int __init set_cmdline_ftrace(char *str)
{
    strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
    default_bootup_tracer = bootup_tracer_buf;------------------------------------------指定tracer的名稱 /* We are using ftrace early, expand it */
    ring_buffer_expanded = 1;
    return 1;
}
__setup("ftrace=", set_cmdline_ftrace);

int register_tracer(struct tracer *type)
__releases(kernel_lock)
__acquires(kernel_lock)
{
    struct tracer *t;
    int ret = 0;
...
 out:
    tracing_selftest_running = false;
    mutex_unlock(&trace_types_lock);

    if (ret || !default_bootup_tracer)
        goto out_unlock;

    if (strncmp(default_bootup_tracer, type->name, MAX_TRACER_SIZE))--------------------比較當前注冊的tracer是否和ftrace設置的。 goto out_unlock;

    printk(KERN_INFO "Starting tracer '%s'\n", type->name);
    /* Do we want this tracer to start on bootup? */
    tracing_set_tracer(type->name);------------------------------------------------------在注冊的時候就指定tracer。
    default_bootup_tracer = NULL;
    /* disable other selftests, since this will break it. */
    tracing_selftest_disabled = 1;
#ifdef CONFIG_FTRACE_STARTUP_TEST
    printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n",
           type->name);
#endif

 out_unlock:
    return ret;
}

5 ftrace相關工具

5.1 trace-cmd和kernelshark

請參照：《ftrace利器之trace-cmd和kernelshark》。

6 ftrace實現的原理

打開對function、function_graph的支持，導致在編譯時插入的一段代碼。

然后在echo function/function_graph > current_tracer，在運行時將代碼進行替換的操作。

詳細解釋編譯、動態開關等ftrace相關原理和流程。

《ftrace function_graph分析》

6.0 GCC -pg選項

對比打開-pg和不打開匯編代碼，查看區別，以cpu_down為例。分別查看不同選項下的反匯編。

未開function/function_graph tracer

僅開function tracer

開function/function_graph tracer

0000038c <cpu_down>:
38c: e92d4010 push {r4, lr}
390: e1a04000 mov r4, r0
394: e3000000 movw r0, #0
398: e3400000 movt r0, #0
39c: ebfffffe bl 0 <mutex_lock>
3a0: e3003000 movw r3, #0
3a4: e3403000 movt r3, #0
3a8: e5931004 ldr r1, [r3, #4]
3ac: e3510000 cmp r1, #0
3b0: 1a000007 bne 3d4 <cpu_down+0x48>
3b4: e1a00004 mov r0, r4
3b8: ebffff43 bl cc <_cpu_down>
3bc: e1a04000 mov r4, r0
3c0: e3000000 movw r0, #0
3c4: e3400000 movt r0, #0
3c8: ebfffffe bl 0 <mutex_unlock>
3cc: e1a00004 mov r0, r4
3d0: e8bd8010 pop {r4, pc}
3d4: e3e0400f mvn r4, #15
3d8: eafffff8 b 3c0 <cpu_down+0x34>

000003c0 <cpu_down>:
3c0: e92d4010 push {r4, lr}
3c4: e52de004 push {lr} ; (str lr, [sp, #-4]!)
3c8: ebfffffe bl 0 <__gnu_mcount_nc>
3cc: e1a04000 mov r4, r0
3d0: e3000000 movw r0, #0
3d4: e3400000 movt r0, #0
3d8: ebfffffe bl 0 <mutex_lock>
3dc: e3003000 movw r3, #0
3e0: e3403000 movt r3, #0
3e4: e5931004 ldr r1, [r3, #4]
3e8: e3510000 cmp r1, #0
3ec: 1a000007 bne 410 <cpu_down+0x50>
3f0: e1a00004 mov r0, r4
3f4: ebffff3e bl f4 <_cpu_down>
3f8: e1a04000 mov r4, r0
3fc: e3000000 movw r0, #0
400: e3400000 movt r0, #0
404: ebfffffe bl 0 <mutex_unlock>
408: e1a00004 mov r0, r4
40c: e8bd8010 pop {r4, pc}
410: e3e0400f mvn r4, #15
414: eafffff8 b 3fc <cpu_down+0x3c>

00000400 <cpu_down>:
400: e1a0c00d mov ip, sp
404: e92dd830 push {r4, r5, fp, ip, lr, pc}
408: e24cb004 sub fp, ip, #4
40c: e52de004 push {lr} ; (str lr, [sp, #-4]!)
410: ebfffffe bl 0 <__gnu_mcount_nc>
414: e1a04000 mov r4, r0
418: e3000000 movw r0, #0
41c: e3400000 movt r0, #0
420: ebfffffe bl 0 <mutex_lock>
424: e3003000 movw r3, #0
428: e3403000 movt r3, #0
42c: e5931004 ldr r1, [r3, #4]
430: e3510000 cmp r1, #0
434: 1a000007 bne 458 <cpu_down+0x58>
438: e1a00004 mov r0, r4
43c: ebffff3a bl 12c <_cpu_down>
440: e1a04000 mov r4, r0
444: e3000000 movw r0, #0
448: e3400000 movt r0, #0
44c: ebfffffe bl 0 <mutex_unlock>
450: e1a00004 mov r0, r4
454: e89da830 ldm sp, {r4, r5, fp, sp, pc}
458: e3e0400f mvn r4, #15
45c: eafffff8 b 444 <cpu_down+0x44>

__gnu_mcount_nc定義如下：

ENTRY(__gnu_mcount_nc)
#ifdef CONFIG_DYNAMIC_FTRACE
    mov    ip, lr
    ldmia    sp!, {lr}
    mov    pc, ip
#else
    __mcount
#endif
ENDPROC(__gnu_mcount_nc)

6.1 打開哪些選項才能實現ftrace功能？

可以看出在定義了CONFIG_FUNCTION_TRACER、CONFIG_DYNAMIC_FTRACE之后就具備了recordmcount的功能。

如果再定義CONFIG_HAVE_C_RECORDMCOUNT，那么就會使用recordmcount.c而不是recordmcount.pl來進行mcount處理。

.config中定義：

CONFIG_FUNCTION_TRACER=y
CONFIG_HAVE_C_RECORDMCOUNT=y
CONFIG_DYNAMIC_FTRACE=y
CONFIG_FTRACE_MCOUNT_RECORD=y

Makefile中定義：

ifdef CONFIG_FUNCTION_TRACER
ifndef CC_FLAGS_FTRACE
CC_FLAGS_FTRACE := -pg endif export CC_FLAGS_FTRACE ifdef CONFIG_HAVE_FENTRY CC_USING_FENTRY := $(call cc-option, -mfentry -DCC_USING_FENTRY) endif KBUILD_CFLAGS += $(CC_FLAGS_FTRACE) $(CC_USING_FENTRY) KBUILD_AFLAGS += $(CC_USING_FENTRY) ifdef CONFIG_DYNAMIC_FTRACE ifdef CONFIG_HAVE_C_RECORDMCOUNT  BUILD_C_RECORDMCOUNT := y  export BUILD_C_RECORDMCOUNT endif endif endif

Makefile.build中定義：

ifdef CONFIG_FTRACE_MCOUNT_RECORD
ifdef BUILD_C_RECORDMCOUNT
ifeq ("$(origin RECORDMCOUNT_WARN)", "command line")
  RECORDMCOUNT_FLAGS = -w
endif
# Due to recursion, we must skip empty.o.
# The empty.o file is created in the make process in order to determine
#  the target endianness and word size. It is made before all other C
#  files, including recordmcount.
sub_cmd_record_mcount =                    \
    if [ $(@) != "scripts/mod/empty.o" ]; then    \
        $(objtree)/scripts/recordmcount $(RECORDMCOUNT_FLAGS) "$(@)";    \
    fi;
recordmcount_source := $(srctree)/scripts/recordmcount.c \
            $(srctree)/scripts/recordmcount.h
else...
endif
...
# Built-in and composite module parts
$(obj)/%.o: $(src)/%.c $(recordmcount_source) FORCE
    $(call cmd,force_checksrc)
    $(call if_changed_rule,cc_o_c)

# Single-part modules are special since we need to mark them in $(MODVERDIR)

$(single-used-m): $(obj)/%.o: $(src)/%.c $(recordmcount_source) FORCE
    $(call cmd,force_checksrc)
    $(call if_changed_rule,cc_o_c)
    @{ echo $(@:.o=.ko); echo $@; } > $(MODVERDIR)/$(@F:.o=.mod)

6.2 ftrace的mcount功能是如何實現的？

在Documentation/trace/ftrace.txt中有一段解釋：

If CONFIG_DYNAMIC_FTRACE is set, the system will run with virtually no overhead when function tracing is disabled.
The way this works is the mcount function call (placed at the start of every kernel function, produced by the -pg switch in gcc),
starts of pointing to a simple return. (Enabling FTRACE will include the -pg switch in the compiling of the kernel.)

At compile time every C file object is run through the recordmcount program (located in the scripts directory).
This program will parse the ELF headers in the C object to find all
the locations in the .text section that call mcount.
(Note, only white listed .text sections are processed, 
since processing other sections like .init.text may cause races due to those sections being freed unexpectedly).

A new section called "__mcount_loc" is created that holds references to all the mcount call sites in the .text section.
The recordmcount program re-links this section back into the original object.
The final linking stage of the kernel will add all these references into a single table.

在c文件編譯完之后，recordmcount增加一個__mcount_loc段。

在vmlinux.lds.h文件中對__mcount_loc段歸集，在系統初始化的時候有兩個參數很重要__start_mcount_loc和__stop_mcount_loc。

在available_function

#define MCOUNT_REC()    . = ALIGN(8);                \
            VMLINUX_SYMBOL(__start_mcount_loc) = .; \
            *(__mcount_loc)                \
            VMLINUX_SYMBOL(__stop_mcount_loc) = .;



/* init and exit section handling */
#define INIT_DATA                            \
    *(.init.data)                            \
...
    KERNEL_CTORS()                            \
    *(.init.rodata)                            \
    MCOUNT_REC()                            \
...
    KERNEL_DTB()


#define INIT_DATA_SECTION(initsetup_align)                \
    .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {        \
        INIT_DATA                        \
...
    }

6.3 引入ftrace對性能的影響有多大？

在不使用的時候在入口點插入nop，在使用的時候才會替換成

On boot up, before SMP is initialized, the dynamic ftrace code scans this table and updates all the locations into nops.
It also records the locations, which are added to the available_filter_functions list.
在啟動階段，SMP初始化之前，ftrace掃描__mcount_loc段，將所有入口地址mcount使用nop替代。這樣只要不打開，開銷非常小，基本上不產生性能影響。

Modules are processed as they are loaded and before they are executed.
When a module is unloaded, it also removes its functions from the ftrace function list.
This is automatic in the module unload code, and the module author does not need to worry about it.

When tracing is enabled, the process of modifying the function tracepoints is dependent on architecture.
...
The new method of modifying the function tracepoints is to place a breakpoint at the location to be modified,
sync all CPUs, modify the rest of the instruction not covered by the breakpoint.
Sync all CPUs again, and then remove the breakpoint with the finished version to the ftrace call site.

在內核初始化的初期，ftrace 查詢 __mcount_loc 段，得到每個函數的入口地址，並將 mcount 替換為 nop 指令。這樣在默認情況下，ftrace 不會對內核性能產生影響。

當用戶打開 ftrace 功能時，ftrace 將這些 nop 指令動態替換為 ftrace_caller，該函數將調用用戶注冊的 trace 函數。

6.4 核心函數ftrace_caller/ftrace_graph_caller

#ifdef CONFIG_DYNAMIC_FTRACE
ENTRY(ftrace_caller)
UNWIND(.fnstart)
    __ftrace_caller
UNWIND(.fnend)
ENDPROC(ftrace_caller)
#endif


.macro __ftrace_caller suffix
    mcount_enter------------------------------------宏mcount_enter

    mcount_get_lr    r1            @ lr of instrumented func
    mcount_adjust_addr    r0, lr        @ instrumented function

    .globl ftrace_call\suffix
ftrace_call\suffix:
    bl    ftrace_stub

#ifdef CONFIG_FUNCTION_GRAPH_TRACER
    .globl ftrace_graph_call\suffix
ftrace_graph_call\suffix:
    mov    r0, r0
#endif mcount_exit------------------------------------宏mcount_exit
.endm

.macro mcount_enter----------------------------------mcount_enter定義
stmdb sp!, {r0-r3, lr}
.endm

.macro mcount_get_lr reg
ldr \reg, [fp, #-4]
.endm

.macro mcount_exit---------------------------------mcount_exit定義
ldr lr, [fp, #-4]
ldmia sp!, {r0-r3, pc}
.endm

function_graph：

#ifdef CONFIG_FUNCTION_GRAPH_TRACER
ENTRY(ftrace_graph_caller)
UNWIND(.fnstart)
    __ftrace_graph_caller
UNWIND(.fnend)
ENDPROC(ftrace_graph_caller)
#endif

.macro __ftrace_graph_caller
    sub    r0, fp, #4        @ &lr of instrumented routine (&parent)
#ifdef CONFIG_DYNAMIC_FTRACE
    @ called from __ftrace_caller, saved in mcount_enter
    ldr    r1, [sp, #16]        @ instrumented routine (func)
    mcount_adjust_addr    r1, r1
#else
    @ called from __mcount, untouched in lr
    mcount_adjust_addr    r1, lr    @ instrumented routine (func)
#endif
    mov    r2, fp            @ frame pointer
    bl    prepare_ftrace_return
    mcount_exit
.endm

參考文檔：

1. The Linux Kernel Tracepoint API

2. Linux內核跟蹤之trace框架分析

3. linux ftrace追蹤一（基本技術結構粗略剖析）

4. 使用 ftrace 調試 Linux 內核，第 1 部分 & 使用 ftrace 調試 Linux 內核，第 2 部分 & 使用 ftrace 調試 Linux 內核，第 3 部分

5. Linux Tracepoint內核文檔

6. ftrace簡介

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 Linux ftrace框架介紹及運用【轉】 Linux應用編程框架介紹使用ftrace學習linux內核函數調用 .NET之Dapper框架運用 ftrace 詳解 MyBatis常用對象SqlSessionFactory和SqlSession介紹和運用 MyBatis常用對象SqlSessionFactory和SqlSession介紹和運用 Flask 框架：運用Ajax實現數據交互 ftrace(一)原理簡介 Executor框架（一）Executor框架介紹