Linux內存管理 (5)slab分配器


專題:Linux內存管理專題

關鍵詞:slab/slub/slob、slab描述符、kmalloc、本地/共享對象緩沖池、slabs_partial/slabs_full/slabs_free、avail/limit/batchcount

 

Linux內存管理框架圖可以知道:slab/slub/slob都是基於伙伴系統。

伙伴系統是以page為單位進行操作的。但是很多場景並不需要如此大的內存分配,slab就是用在這種場景的。

本章節主要內容:從slab相關數據結構講起,對slab有一個靜態的認識;然后按照創建描述符->分配緩存->釋放緩存->銷毀描述符的順序介紹整個slab生命周期;最后介紹基於slab分配器的kmalloc的運行原理。

slab分配器最終還是由伙伴系統來分配出實際的物理頁面,只不過slab分配器在這些連續的物理頁面上實現了自己的算法,以此來對小內存塊進行管理。

slab分配器相關重要函數有:

struct kmem_cache *kmem_cache_create(const char *, size_t, size_t,---------創建slab描述符kmem_cache,此時並沒有真正分配內存
            unsigned long, void (*)(void *));
void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags);------------------分配slab緩存對象
void kmem_cache_free(struct kmem_cache *, void *);-------------------------釋放slab緩存對象
void kmem_cache_destroy(struct kmem_cache *);-----------------------------銷毀slab描述符

 

每個slab由多少個頁面組成?

每個slab由一個或多個連續頁面組成,最低一個,物理連續。

 

slab需要的物理內存在什么時候分配?

kmem_cache_create並不分配頁面,等到kmem_cache_alloc時才有可能分配頁面。只有當本地緩沖池、共享緩沖池和三大鏈表中都沒有空閑對象時,才會去分配2^gfporder個頁面,然后掛入到slabs_free中。

 

slab描述符中空閑對象過多,是否要回收?

有兩種方式回收空閑對象:

(1)使用kmem_cache_free釋放對象,當本地和共享對象緩沖池中空閑對象數目ac->avail大於ac->limit時,系統會主動釋放batchcount個對象。當所有空閑對象數目大於系統空閑對象數目極限值,並且slab沒有活躍對象時,可以銷毀此slab,回收內存。

(2)系統注冊了delayed_work,定時掃描slab描述符,回收一部分空閑對象,在cache_reap中實現。

 

slab的cache colour着色區作用?

使不同slab上同一個相對位置slab對象的起始地址在高速緩存中相互錯開,有利於改善高速緩存的性能。

另一個利用cache場景是Per-CPU類型本地對象緩沖池。兩個優點:讓一個對象盡可能地運行在同一個CPU上;訪問Per-CPU類型本地對象緩沖池不需要獲取額外自旋鎖。

 

 

 

1. slab相關數據結構

slab對象的描述符struct kmem_cache:

struct kmem_cache {
    struct array_cache __percpu *cpu_cache;

/* 1) Cache tunables. Protected by slab_mutex */
    unsigned int batchcount;-----------------------------------表示當前CPU本地緩沖池array_cache為空時,從共享緩沖池或者slabs_partial/slabs_free列表中獲取對象的數目。
    unsigned int limit;----------------------------------------表示當本地對象緩沖池空閑對象數目大於limit時就會主動釋放batchcount個對象,便於內核回收和銷毀slab。
    unsigned int shared;

    unsigned int size;-----------------------------------------align過后的對象長度
    struct reciprocal_value reciprocal_buffer_size;
/* 2) touched by every alloc & free from the backend */

    unsigned int flags;        /* constant flags */------------分配掩碼
    unsigned int num;        /* # of objs per slab */----------slab中有多少個對象

/* 3) cache_grow/shrink */
    /* order of pgs per slab (2^n) */
    unsigned int gfporder;------------------------------------此slab占用2^gfporder個頁面

    /* force GFP flags, e.g. GFP_DMA */
    gfp_t allocflags;

    size_t colour;            /* cache colouring range */----一個slab有幾個不同的cache line
    unsigned int colour_off;    /* colour offset */----------一個cache colour的長度,和L1 Cache Line長度相同
    struct kmem_cache *freelist_cache;
    unsigned int freelist_size;

    /* constructor func */
    void (*ctor)(void *obj);

/* 4) cache creation/removal */
    const char *name;----------------------------------------slab描述符的名稱
    struct list_head list;
    int refcount;--------------------------------------------被引用的次數,供slab描述符銷毀參考
    int object_size;-----------------------------------------對象的實際大小
    int align;-----------------------------------------------對齊的大小

/* 5) statistics */
#ifdef CONFIG_DEBUG_SLAB
    unsigned long num_active;
    unsigned long num_allocations;
    unsigned long high_mark;
    unsigned long grown;
    unsigned long reaped;
    unsigned long errors;
    unsigned long max_freeable;
    unsigned long node_allocs;
    unsigned long node_frees;
    unsigned long node_overflow;
    atomic_t allochit;
    atomic_t allocmiss;
    atomic_t freehit;
    atomic_t freemiss;

    /*
     * If debugging is enabled, then the allocator can add additional
     * fields and/or padding to every object. size contains the total
     * object size including these internal fields, the following two
     * variables contain the offset to the user object and its size.
     */
    int obj_offset;
#endif /* CONFIG_DEBUG_SLAB */
#ifdef CONFIG_MEMCG_KMEM
    struct memcg_cache_params memcg_params;
#endif

    struct kmem_cache_node *node[MAX_NUMNODES];-------slab對應的節點的struct kmem_cache_node數據結構
};

 

本地CPU緩沖池struct array_cache:

struct array_cache {
    unsigned int avail;-------------對象緩沖池中可用的對象數目
    unsigned int limit;
    unsigned int batchcount;
    unsigned int touched;----------從緩沖池移除一個對象時,touched置1;收縮緩存時,touched置0。
    void *entry[];-----------------保存對象的實體
};

 

內存節點的slab列表:

/*
 * The slab lists for all objects.
 */
struct kmem_cache_node {
    spinlock_t list_lock;

#ifdef CONFIG_SLAB
    struct list_head slabs_partial;    /* partial list first, better asm code */----slab鏈表中部分對象空閑
    struct list_head slabs_full;----------------------------------------------------slab鏈表中沒有對象空閑
    struct list_head slabs_free;----------------------------------------------------slab鏈表中所有對象空閑
    unsigned long free_objects;-----------------------------------------------------三個鏈表中空閑對象數目
    unsigned int free_limit;--------------------------------------------------------slab中可容許的空閑對象數目最大閾值。
    unsigned int colour_next;    /* Per-node cache coloring */
    struct array_cache *shared;    /* shared per node */----------------------------多核CPU公用的共享對象緩沖池
    struct alien_cache **alien;    /* on other nodes */
    unsigned long next_reap;    /* updated without locking */
    int free_touched;        /* updated without locking */
#endif

#ifdef CONFIG_SLUB
    unsigned long nr_partial;
    struct list_head partial;
#ifdef CONFIG_SLUB_DEBUG
    atomic_long_t nr_slabs;
    atomic_long_t total_objects;
    struct list_head full;
#endif
#endif

};

 

SLAB Flags

/*
 * Flags to pass to kmem_cache_create().
 * The ones marked DEBUG are only valid if CONFIG_SLAB_DEBUG is set.
 */
#define SLAB_DEBUG_FREE        0x00000100UL    /* DEBUG: Perform (expensive) checks on free */
#define SLAB_RED_ZONE        0x00000400UL    /* DEBUG: Red zone objs in a cache */
#define SLAB_POISON        0x00000800UL    /* DEBUG: Poison objects */
#define SLAB_HWCACHE_ALIGN    0x00002000UL    /* Align objs on cache lines */
#define SLAB_CACHE_DMA        0x00004000UL    /* Use GFP_DMA memory */
#define SLAB_STORE_USER        0x00010000UL    /* DEBUG: Store the last owner for bug hunting */
#define SLAB_PANIC        0x00040000UL    /* Panic if kmem_cache_create() fails */

 

2. 創建slab描述符

kmem_cache_create的最主要功能就是填充struct kmem_cache,主要參數有:

name:slab描述符的名稱

size:緩存對象的大小

align:對齊的大小

flags:分配掩碼

ctor:對象的構造函數

kmem_cache_create函數調用核心流程是:

kmem_cache_create-----------------------------進行合法性檢查,以及是否有現成slab描述符可用
    do_kmem_cache_create----------------------將主要參數配置到slab描述符,然后將得到的描述符加入slab_caches全局鏈表中。
        __kmem_cache_create-------------------是創建slab描述符的核心進行對齊操作,計算需要頁面,對象數目,對slab着色等等操作。
            calculate_slab_order--------------計算slab對象需要的大小,以及一個slab描述符需要多少page
            setup_cpu_cache-------------------繼續配置slab描述符

 

struct kmem_cache *
kmem_cache_create(const char *name, size_t size, size_t align,
          unsigned long flags, void (*ctor)(void *))
{
...
    s = __kmem_cache_alias(name, size, align, flags, ctor);----------------檢查是否有現成的slab描述符可用,有即跳轉到out_unlock。
    if (s)
        goto out_unlock;

    cache_name = kstrdup_const(name, GFP_KERNEL);
    if (!cache_name) {
        err = -ENOMEM;
        goto out_unlock;
    }

    s = do_kmem_cache_create(cache_name, size, size,----------------------調用do_kmem_cache_create創建slab描述符
                 calculate_alignment(flags, align, size),
                 flags, ctor, NULL, NULL);
...
    return s;
}

 

 

static struct kmem_cache * do_kmem_cache_create(const char *name, size_t object_size, size_t size,
             size_t align, unsigned long flags, void (*ctor)(void *),
             struct mem_cgroup *memcg, struct kmem_cache *root_cache)
{
    struct kmem_cache *s;
    int err;

    err = -ENOMEM;
    s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);-----------------------分配一個struct kmem_cache結構體
    if (!s)
        goto out;

    s->name = name;
    s->object_size = object_size;
    s->size = size;
    s->align = align;
    s->ctor = ctor;-----------------------------------------------------將參數填入struct kmem_cache結構體
...
    err = __kmem_cache_create(s, flags);-------------------------------
...
    s->refcount = 1;
    list_add(&s->list, &slab_caches);----------------------------------將創建的slab描述符加入到全局變量slab_caches中
...
}

 

__kmem_cache_create是創建slab描述符的核心:

int
__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
{
    size_t left_over, freelist_size;
    size_t ralign = BYTES_PER_WORD;
    gfp_t gfp;
    int err;
    size_t size = cachep->size;
...
    /*
     * Check that size is in terms of words.  This is needed to avoid
     * unaligned accesses for some archs when redzoning is used, and makes
     * sure any on-slab bufctl's are also correctly aligned.
     */
    if (size & (BYTES_PER_WORD - 1)) {
        size += (BYTES_PER_WORD - 1);
        size &= ~(BYTES_PER_WORD - 1);----------------------按BYTES_PER_WORD對齊(32位系統上為4字節)
    }

    if (flags & SLAB_RED_ZONE) {
        ralign = REDZONE_ALIGN;
        /* If redzoning, ensure that the second redzone is suitably
         * aligned, by adjusting the object size accordingly. */
        size += REDZONE_ALIGN - 1;
        size &= ~(REDZONE_ALIGN - 1);
    }

    /* 3) caller mandated alignment */
    if (ralign < cachep->align) {
        ralign = cachep->align;
    }
    /* disable debug if necessary */
    if (ralign > __alignof__(unsigned long long))
        flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
    /*
     * 4) Store it.
     */
    cachep->align = ralign;------------------------------對齊大小設置到struct kmem_cache

    if (slab_is_available())-----------------------------slab_state>=UP時,可以使用GFP_KERNEL分配,否則只能使用GFP_NOWAIT
        gfp = GFP_KERNEL;
    else
        gfp = GFP_NOWAIT;
...
    /*
     * Determine if the slab management is 'on' or 'off' slab.
     * (bootstrapping cannot cope with offslab caches so don't do
     * it too early on. Always use on-slab management when
     * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
     */
    if ((size >= (PAGE_SIZE >> 5)) && !slab_early_init &&
        !(flags & SLAB_NOLEAKTRACE))
        /*
         * Size is large, assume best to place the slab management obj
         * off-slab (should allow better packing of objs).
         */
        flags |= CFLGS_OFF_SLAB;

    size = ALIGN(size, cachep->align);------------------按照cachep->align對size進行對齊
    /*
     * We should restrict the number of objects in a slab to implement
     * byte sized index. Refer comment on SLAB_OBJ_MIN_SIZE definition.
     */
    if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE)
        size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align);

    left_over = calculate_slab_order(cachep, size, cachep->align, flags);

    if (!cachep->num)
        return -E2BIG;

    freelist_size = calculate_freelist_size(cachep->num, cachep->align);

    /*
     * If the slab has been placed off-slab, and we have enough space then
     * move it on-slab. This is at the expense of any extra colouring.
     */
    if (flags & CFLGS_OFF_SLAB && left_over >= freelist_size) {
        flags &= ~CFLGS_OFF_SLAB;
        left_over -= freelist_size;
    }

    if (flags & CFLGS_OFF_SLAB) {
        /* really off slab. No need for manual alignment */
        freelist_size = calculate_freelist_size(cachep->num, 0);

#ifdef CONFIG_PAGE_POISONING
        /* If we're going to use the generic kernel_map_pages()
         * poisoning, then it's going to smash the contents of
         * the redzone and userword anyhow, so switch them off.
         */
        if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
            flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
#endif
    }

    cachep->colour_off = cache_line_size();----------------------------------------L1 Cache line大小,由CONFIG_ARM_L1_CACHE_SHIFT配置,此處為64B。
    /* Offset must be a multiple of the alignment. */
    if (cachep->colour_off < cachep->align)
        cachep->colour_off = cachep->align;
    cachep->colour = left_over / cachep->colour_off;
    cachep->freelist_size = freelist_size;
    cachep->flags = flags;
    cachep->allocflags = __GFP_COMP;
    if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
        cachep->allocflags |= GFP_DMA;
    cachep->size = size;
    cachep->reciprocal_buffer_size = reciprocal_value(size);

    if (flags & CFLGS_OFF_SLAB) {
        cachep->freelist_cache = kmalloc_slab(freelist_size, 0u);
        /*
         * This is a possibility for one of the kmalloc_{dma,}_caches.
         * But since we go off slab only for object size greater than
         * PAGE_SIZE/8, and kmalloc_{dma,}_caches get created
         * in ascending order,this should not happen at all.
         * But leave a BUG_ON for some lucky dude.
         */
        BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache));
    }

    err = setup_cpu_cache(cachep, gfp);-------------------------------------根據slab_state狀態進行不同處理,計算limit/batchcount,分配本地對象緩沖池,共享對象緩沖池
    if (err) {
        __kmem_cache_shutdown(cachep);
        return err;
    }

    return 0;
}

 

slab_state用於表示slab分配器的狀態:

/*
 * State of the slab allocator.
 *
 * This is used to describe the states of the allocator during bootup.
 * Allocators use this to gradually bootstrap themselves. Most allocators
 * have the problem that the structures used for managing slab caches are
 * allocated from slab caches themselves.
 */
enum slab_state {
    DOWN,            /* No slab functionality yet */
    PARTIAL,        /* SLUB: kmem_cache_node available */
    PARTIAL_NODE,        /* SLAB: kmalloc size for node struct available */
    UP,            /* Slab caches usable but not all extras yet */
    FULL            /* Everything is working */------------------------完全初始化
};

 

calculate_slab_order計算slab的大小:找到合適的page order並保存到cachep->gfporder,同時計算此slab中可以容納多少個同樣大小的對象(保存到cachep->num)。返回值是slab中剩余的空間left_over,用於后續着色計算。

static size_t calculate_slab_order(struct kmem_cache *cachep,
            size_t size, size_t align, unsigned long flags)
{
    unsigned long offslab_limit;
    size_t left_over = 0;
    int gfporder;

    for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {------從gfporder=0開始,直到KMALLOC_MAX_ORDER=10,即從4KB到4MB大小。
        unsigned int num;
        size_t remainder;

        cache_estimate(gfporder, size, align, flags, &remainder, &num);
        if (!num)---------------------------------------------------------不等於0則表示gfporder已經滿足條件,最低分配到一個size大小的對象。等於0則繼續下一次for循環。
            continue;

        /* Can't handle number of objects more than SLAB_OBJ_MAX_NUM */
        if (num > SLAB_OBJ_MAX_NUM)--------------------------------------slab中對象最大數目,SLAB_OBJ_MAX_NUM為255,所以每個slab中的對象數目不超過255
            break;

        if (flags & CFLGS_OFF_SLAB) {
            size_t freelist_size_per_obj = sizeof(freelist_idx_t);
            /*
             * Max number of objs-per-slab for caches which
             * use off-slab slabs. Needed to avoid a possible
             * looping condition in cache_grow().
             */
            if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
                freelist_size_per_obj += sizeof(char);
            offslab_limit = size;
            offslab_limit /= freelist_size_per_obj;

             if (num > offslab_limit)
                break;
        }

        /* Found something acceptable - save it away */
        cachep->num = num;
        cachep->gfporder = gfporder;
        left_over = remainder;-------------------------------------------確定對象個數和需要的頁面數
...
        if (left_over * 8 <= (PAGE_SIZE << gfporder))-------------------滿足着色條件,退出for循環。
            break;
    }
    return left_over;
}

cache_estimate根據當前大小2^gfporder來計算可以容納多少個對象,以及剩下多少空間用於着色。

static void cache_estimate(unsigned long gfporder, size_t buffer_size,
               size_t align, int flags, size_t *left_over,
               unsigned int *num)
{
    int nr_objs;
    size_t mgmt_size;
    size_t slab_size = PAGE_SIZE << gfporder;
...
    if (flags & CFLGS_OFF_SLAB) {
        mgmt_size = 0;
        nr_objs = slab_size / buffer_size;

    } else {
        nr_objs = calculate_nr_objs(slab_size, buffer_size,--------------可以容納對象數
                    sizeof(freelist_idx_t), align);
        mgmt_size = calculate_freelist_size(nr_objs, align);
    }
    *num = nr_objs;
    *left_over = slab_size - nr_objs*buffer_size - mgmt_size;------------除去對象大小和管理slab額外開銷外,剩余部分
}

 

3. 分配slab對象

kmem_cache_alloc是slab分配緩存對象的核心函數,在slab分配緩存過程中是全程關閉本地中斷的。

kmem_cache_alloc-->slab_alloc-->__do_cache_alloc是關中斷的。

static __always_inline void *
slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
{
...
    local_irq_save(save_flags);
    objp = __do_cache_alloc(cachep, flags);-------------------------全程關本地中斷
    local_irq_restore(save_flags);
...
}

由於沒有定義NUMA,所以__do_cache_alloc就僅通過____cache_alloc來分配緩存。

static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
    void *objp;
    struct array_cache *ac;
    bool force_refill = false;

    check_irq_off();

    ac = cpu_cache_get(cachep);----------------------------------------獲取本地對象緩沖池
    if (likely(ac->avail)) {-------------------------------------------本地對象緩沖池是否有空閑對象
        ac->touched = 1;
        objp = ac_get_obj(cachep, ac, flags, false);-------------------從本地對象緩沖池中分配一個對象

        /*
         * Allow for the possibility all avail objects are not allowed
         * by the current flags
         */
        if (objp) {
            STATS_INC_ALLOCHIT(cachep);
            goto out;-------------------------------------------------如果成功獲得objp,那么直接返回指針。
        }
        force_refill = true;
    }

    STATS_INC_ALLOCMISS(cachep);
    objp = cache_alloc_refill(cachep, flags, force_refill);------------是slab分配緩存的核心
    /*
     * the 'ac' may be updated by cache_alloc_refill(),
     * and kmemleak_erase() requires its correct value.
     */
    ac = cpu_cache_get(cachep);

out:
    /*
     * To avoid a false negative, if an object that is in one of the
     * per-CPU caches is leaked, we need to make sure kmemleak doesn't
     * treat the array pointers as a reference to the object.
     */
    if (objp)
        kmemleak_erase(&ac->entry[ac->avail]);
    return objp;
}

cache_alloc_refill是slab分配緩存的核心:

static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
                            bool force_refill)
{
    int batchcount;
    struct kmem_cache_node *n;
    struct array_cache *ac;
    int node;

    check_irq_off();
    node = numa_mem_id();
    if (unlikely(force_refill))
        goto force_grow;
retry:
    ac = cpu_cache_get(cachep);-----------------------------------------獲取本地對象緩沖池ac
    batchcount = ac->batchcount;
    if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
        /*
         * If there was little recent activity on this cache, then
         * perform only a partial refill.  Otherwise we could generate
         * refill bouncing.
         */
        batchcount = BATCHREFILL_LIMIT;
    }
    n = get_node(cachep, node);-----------------------------------------找到對應的slab節點

    BUG_ON(ac->avail > 0 || !n);
    spin_lock(&n->list_lock);

    /* See if we can refill from the shared array */
    if (n->shared && transfer_objects(ac, n->shared, batchcount)) {----判斷共享對象緩沖池(n->shared)是否有空閑對象。transfer_objects嘗試遷移batchcount個空閑對象到ac中。
        n->shared->touched = 1;
        goto alloc_done;
    }

    while (batchcount > 0) {---------------------------------嘗試從slabs_partial/slabs_free中分配對象
        struct list_head *entry;
        struct page *page;
        /* Get slab alloc is to come from. */
        entry = n->slabs_partial.next;
        if (entry == &n->slabs_partial) {
            n->free_touched = 1;
            entry = n->slabs_free.next;
            if (entry == &n->slabs_free)
                goto must_grow;-----------------------------如果slabs_partial/slabs_free都為空,則跳到must_grow分配對象。
        }

        page = list_entry(entry, struct page, lru);
        check_spinlock_acquired(cachep);

        /*
         * The slab was either on partial or free list so
         * there must be at least one object available for
         * allocation.
         */
        BUG_ON(page->active >= cachep->num);

        while (page->active < cachep->num && batchcount--) {
            STATS_INC_ALLOCED(cachep);
            STATS_INC_ACTIVE(cachep);
            STATS_SET_HIGH(cachep);

            ac_put_obj(cachep, ac, slab_get_obj(cachep, page,
                                    node));---------------------ac_put_obj將slab_get_obj獲取到的對象遷移到ac中。
        }

        /* move slabp to correct slabp list: */
        list_del(&page->lru);
        if (page->active == cachep->num)------------------------將獲取到的slab掛到合適的鏈表。
            list_add(&page->lru, &n->slabs_full);
        else
            list_add(&page->lru, &n->slabs_partial);
    }

must_grow:
    n->free_objects -= ac->avail;
alloc_done:
    spin_unlock(&n->list_lock);

    if (unlikely(!ac->avail)) {--------------------------------ac->avail為0表示從共享對象緩沖池、slabs_free/slabs_partial都失敗了。
        int x;
force_grow:
        x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);---在cachep中創建一個slab,並掛到slabs_free鏈表中。

        /* cache_grow can reenable interrupts, then ac could change. */
        ac = cpu_cache_get(cachep);
        node = numa_mem_id();

        /* no objects in sight? abort */
        if (!x && (ac->avail == 0 || force_refill))
            return NULL;

        if (!ac->avail)        /* objects refilled by interrupt? */
            goto retry;
    }
    ac->touched = 1;

    return ac_get_obj(cachep, ac, flags, force_refill);
}

 

4. 釋放slab對象

slab釋放對象通過kmem_cache_free進行,在釋放過程中也是全程關中斷的。

一個slab描述符中可能有多個對象,因此釋放對象需要兩個參數才能確定釋放內容。

void kmem_cache_free(struct kmem_cache *cachep, void *objp)
{
    unsigned long flags;
    cachep = cache_from_obj(cachep, objp);-----------------------------通過對象找到slab描述符
    if (!cachep)
        return;

    local_irq_save(flags);
    debug_check_no_locks_freed(objp, cachep->object_size);
    if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
        debug_check_no_obj_freed(objp, cachep->object_size);
    __cache_free(cachep, objp, _RET_IP_);-------------------------------關本地中斷
    local_irq_restore(flags);

    trace_kmem_cache_free(_RET_IP_, objp);
}

cache_from_obj通過要釋放對象虛擬地址,找到所在頁面,繼而找到對應的struct kmem_cache結構體。

然后將轉換得到的slab描述符和入參描述符對比,即可判斷兩者是否有效。

static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
{
...
    page = virt_to_head_page(x);-----------由virt_to_page找到對應的page,再找到first_page。
    cachep = page->slab_cache;-------------first_page中有指向slab描述符的slab_cache
    if (slab_equal_or_root(cachep, s))-----判斷兩者是否吻合
        return cachep;
...
    return s;
}

__cache_free是釋放slab對象的核心:

首先通過slab描述符找到本地對象緩沖池;

然后判斷ac->avail和ac->limit大小,如果avail達到或超過limit(即不小於limit),則需要cache_flusharray去回收空閑對象;

最后ac_put_obj將對象釋放到本地對象緩沖池ac中,釋放過程結束。

static inline void __cache_free(struct kmem_cache *cachep, void *objp,
                unsigned long caller)
{
    struct array_cache *ac = cpu_cache_get(cachep);----------------找到本地對象緩沖池

    check_irq_off();
    kmemleak_free_recursive(objp, cachep->flags);
    objp = cache_free_debugcheck(cachep, objp, caller);

    kmemcheck_slab_free(cachep, objp, cachep->object_size);

    /*
     * Skip calling cache_free_alien() when the platform is not numa.
     * This will avoid cache misses that happen while accessing slabp (which
     * is per page memory  reference) to get nodeid. Instead use a global
     * variable to skip the call, which is mostly likely to be present in
     * the cache.
     */
    if (nr_online_nodes > 1 && cache_free_alien(cachep, objp))
        return;

    if (ac->avail < ac->limit) {
        STATS_INC_FREEHIT(cachep);
    } else {
        STATS_INC_FREEMISS(cachep);
        cache_flusharray(cachep, ac);---------------------------------嘗試回收空閑對象
    }

    ac_put_obj(cachep, ac, objp);-------------------------------------將對象釋放到本地對象緩沖池ac中
}

 

5. kmalloc分配函數

kmalloc函數基於slab機制,分配的內存大小也是對齊到2^order個字節。

分配的時候是從kmalloc-xxx的slab描述符中分配一個對象。

這些kmalloc-xxx的slab描述符是由create_kmalloc_caches在系統初始化的時候創建的。

PS:下面代碼根據slub進行分析。

5.1 kmalloc slab描述符創建

create_kmalloc_caches的調用路徑是start_kernel-->mm_init-->kmem_cache_init-->create_kmalloc_caches。

在初始化之前,弄明白這三個參數KMALLOC_SHIFT_LOW, KMALLOC_SHIFT_HIGH, KMALLOC_SHIFT_MAX很重要。

#define CONFIG_ARM_L1_CACHE_SHIFT 6----------------------------------------6,對應64B
=================================================
#define L1_CACHE_SHIFT        CONFIG_ARM_L1_CACHE_SHIFT
#define L1_CACHE_BYTES        (1 << L1_CACHE_SHIFT)------------------------即為64B

/*
 * Memory returned by kmalloc() may be used for DMA, so we must make
 * sure that all such allocations are cache aligned. Otherwise,
 * unrelated code may cause parts of the buffer to be read into the
 * cache before the transfer is done, causing old data to be seen by
 * the CPU.
 */
#define ARCH_DMA_MINALIGN    L1_CACHE_BYTES-------------------------------和L1 Cache對齊,即64B對齊

=================================================
#define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN---------------------------即為64B
#define KMALLOC_MIN_SIZE ARCH_DMA_MINALIGN--------------------------------即為64B
#define KMALLOC_SHIFT_LOW ilog2(ARCH_DMA_MINALIGN)------------------------位移量為6,對應64B
=================================================
/*
 * SLUB directly allocates requests fitting in to an order-1 page
 * (PAGE_SIZE*2).  Larger requests are passed to the page allocator.
 */
#define KMALLOC_SHIFT_HIGH    (PAGE_SHIFT + 1)---------------------------位移量為13,對應8KB大小
#define KMALLOC_SHIFT_MAX    (MAX_ORDER + PAGE_SHIFT)--------------------位移量為23,對應8MB大小

所以:

KMALLOC_MIN_SIZE=64 KMALLOC_SHIFT_LOW=6 KMALLOC_SHIFT_HIGH=13 KMALLOC_SHIFT_MAX=23

對於kmalloc尺寸小於等於192B的請求從哪個slab描述符中分配緩存,進行了特殊的映射。

/*
 * Conversion table for small slabs sizes / 8 to the index in the
 * kmalloc array. This is necessary for slabs < 192 since we have non power
 * of two cache sizes there. The size of larger slabs can be determined using
 * fls.
 */
static s8 size_index[24] = {
    3,    /* 8 */
    4,    /* 16 */
    5,    /* 24 */
    5,    /* 32 */
    6,    /* 40 */
    6,    /* 48 */
    6,    /* 56 */
    6,    /* 64 */
    1,    /* 72 */
    1,    /* 80 */
    1,    /* 88 */
    1,    /* 96 */
    7,    /* 104 */
    7,    /* 112 */
    7,    /* 120 */
    7,    /* 128 */
    2,    /* 136 */
    2,    /* 144 */
    2,    /* 152 */
    2,    /* 160 */
    2,    /* 168 */
    2,    /* 176 */
    2,    /* 184 */
    2    /* 192 */
};

 

size_index的數值對應kmalloc_caches的下標,kmalloc_caches的內容由create_kmalloc_caches創建。

/*
 * Create the kmalloc array. Some of the regular kmalloc arrays
 * may already have been created because they were needed to
 * enable allocations for slab creation.
 */
void __init create_kmalloc_caches(unsigned long flags)
{
    int i;

    /*
     * Patch up the size_index table if we have strange large alignment
     * requirements for the kmalloc array. This is only the case for
     * MIPS it seems. The standard arches will not generate any code here.
     *
     * Largest permitted alignment is 256 bytes due to the way we
     * handle the index determination for the smaller caches.
     *
     * Make sure that nothing crazy happens if someone starts tinkering
     * around with ARCH_KMALLOC_MINALIGN
     */
    BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
        (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));

    for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) {
        int elem = size_index_elem(i);

        if (elem >= ARRAY_SIZE(size_index))
            break;
        size_index[elem] = KMALLOC_SHIFT_LOW;----------------------------
    }

    if (KMALLOC_MIN_SIZE >= 64) {
        /*
         * The 96 byte size cache is not used if the alignment
         * is 64 byte.
         */
        for (i = 64 + 8; i <= 96; i += 8)
            size_index[size_index_elem(i)] = 7;

    }

    if (KMALLOC_MIN_SIZE >= 128) {
        /*
         * The 192 byte sized cache is not used if the alignment
         * is 128 byte. Redirect kmalloc to use the 256 byte cache
         * instead.
         */
        for (i = 128 + 8; i <= 192; i += 8)
            size_index[size_index_elem(i)] = 8;
    }
    for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {------------------------從order=6開始到order=13,這里創建kmalloc-64/kmalloc-128/kmalloc-256/kmalloc-512/kmalloc-1024/kmalloc-2048/kmalloc-4096/kmalloc-8192
        if (!kmalloc_caches[i]) {
            kmalloc_caches[i] = create_kmalloc_cache(NULL,
                            1 << i, flags);
        }

        /*
         * Caches that are not of the two-to-the-power-of size.
         * These have to be created immediately after the
         * earlier power of two caches
         */
        if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6)----------------KMALLOC_MIN_SIZE為64,跳過
            kmalloc_caches[1] = create_kmalloc_cache(NULL, 96, flags);

        if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7)----------------創建kmalloc-192
            kmalloc_caches[2] = create_kmalloc_cache(NULL, 192, flags);
    }

    /* Kmalloc array is now usable */
    slab_state = UP;

    for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
        struct kmem_cache *s = kmalloc_caches[i];
        char *n;

        if (s) {
            n = kasprintf(GFP_NOWAIT, "kmalloc-%d", kmalloc_size(i));--------------修改slab描述符名稱

            BUG_ON(!n);
            s->name = n;
        }
    }
...
}

其中size_index經過重映射之后變成了如下。

所以8B/16B/24B/32B/40B/48B/56B/64B都使用kmalloc-64;

72B/80B/88B/96B/104B/112B/120B/128B都使用kmalloc-128;

136B/144B/152B/160B/168B/176B/184B/192B都使用kmalloc-192。

size_index[0]=6 /*8*/
size_index[1]=6 /*16*/
size_index[2]=6 /*24*/
size_index[3]=6 /*32*/
size_index[4]=6 /*40*/
size_index[5]=6 /*48*/
size_index[6]=6 /*56*/
size_index[7]=6 /*64*/
size_index[8]=7 /*72*/
size_index[9]=7 /*80*/
size_index[10]=7 /*88*/
size_index[11]=7 /*96*/
size_index[12]=7 /*104*/
size_index[13]=7 /*112*/
size_index[14]=7 /*120*/
size_index[15]=7 /*128*/
size_index[16]=2 /*136*/
size_index[17]=2 /*144*/
size_index[18]=2 /*152*/
size_index[19]=2 /*160*/
size_index[20]=2 /*168*/
size_index[21]=2 /*176*/
size_index[22]=2 /*184*/
size_index[23]=2 /*192*/

看看/proc/slabinfo中的最終結果如何?

kmalloc-8192          12     12   8192    4    8 : tunables    0    0    0 : slabdata      3      3      0
kmalloc-4096          61     88   4096    8    8 : tunables    0    0    0 : slabdata     11     11      0
kmalloc-2048          48     48   2048   16    8 : tunables    0    0    0 : slabdata      3      3      0
kmalloc-1024          96     96   1024   16    4 : tunables    0    0    0 : slabdata      6      6      0
kmalloc-512          384    384    512   16    2 : tunables    0    0    0 : slabdata     24     24      0
kmalloc-256          208    208    256   16    1 : tunables    0    0    0 : slabdata     13     13      0
kmalloc-192          441    441    192   21    1 : tunables    0    0    0 : slabdata     21     21      0
kmalloc-128         1280   1280    128   32    1 : tunables    0    0    0 : slabdata     40     40      0
kmalloc-64          4416   4416     64   64    1 : tunables    0    0    0 : slabdata     69     69      0

 

5.2 kmalloc

kmalloc是按字節分配內存的接口,針對不同大小采取了不同的操作。

KMALLOC_MAX_CACHE_SIZE是一個分界線,大於8KB的內存分配需要kmalloc_large進行處理。

另外對於小於等於192B,通過size_index映射到不同kmalloc-xxx slab描述符。

大於192B小於KMALLOC_MAX_CACHE_SIZE,通過fls找到對應的kmalloc_caches索引號。

static __always_inline void *kmalloc(size_t size, gfp_t flags)
{
    if (__builtin_constant_p(size)) {
        if (size > KMALLOC_MAX_CACHE_SIZE)---------------------------------大於8KB內存使用kmalloc_large來分配
            return kmalloc_large(size, flags);
#ifndef CONFIG_SLOB
        if (!(flags & GFP_DMA)) {
            int index = kmalloc_index(size);-------------------------------找到slab描述符

            if (!index)
                return ZERO_SIZE_PTR;

            return kmem_cache_alloc_trace(kmalloc_caches[index],-----------調用slab_alloc分配緩存
                    flags, size);
        }
#endif
    }
    return __kmalloc(size, flags);-----------------------------------------另一種情況分支
}

不同分配器分支,這里取slub:

void *__kmalloc(size_t size, gfp_t flags)
{
    struct kmem_cache *s;
    void *ret;

    if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))--------------------------再一次檢查8KB這個大小,kmalloc_large分配8KB+緩存
        return kmalloc_large(size, flags);

    s = kmalloc_slab(size, flags);----------------------------------------從預分配slab描述符中找到struct kmem_cache。

    if (unlikely(ZERO_OR_NULL_PTR(s)))
        return s;

    ret = slab_alloc(s, flags, _RET_IP_);---------------------------------調用slab_alloc分配緩存

    trace_kmalloc(_RET_IP_, ret, size, s->size, flags);

    kasan_kmalloc(s, ret, size);

    return ret;
}

/*
 * Find the kmem_cache structure that serves a given size of
 * allocation
 */
struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
{
    int index;

    if (unlikely(size > KMALLOC_MAX_SIZE)) {
        WARN_ON_ONCE(!(flags & __GFP_NOWARN));
        return NULL;
    }

    if (size <= 192) {
        if (!size)
            return ZERO_SIZE_PTR;

        index = size_index[size_index_elem(size)];------------------小於等於192B大小通過size_index得出slab描述符索引
    } else
        index = fls(size - 1);--------------------------------------fls根據大小計算most-significant位索引,范圍從192B~8KB。

#ifdef CONFIG_ZONE_DMA
    if (unlikely((flags & GFP_DMA)))
        return kmalloc_dma_caches[index];

#endif
    return kmalloc_caches[index];
}

為了提高分配緩存的速度、縮短函數調用路徑,關鍵函數都使用了__always_inline修飾。

kmem_cache_alloc
    slab_alloc-->
        slab_alloc_node-->
static __always_inline void *slab_alloc(struct kmem_cache *s,
        gfp_t gfpflags, unsigned long addr)
{
    return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr);
}

static __always_inline void *slab_alloc_node(struct kmem_cache *s,
        gfp_t gfpflags, int node, unsigned long addr)

 

6. 創建slab描述符實驗

 

7. slab分配器相關調試接口

7.1 解讀/proc/slabinfo

/proc/slabinfo是slab分配器的統計信息,打開CONFIG_DEBUG_SLAB可以獲取更多信息。

slabinfo - version: 2.1 (statistics)
# name <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab> : tunables <limit> <batchcount> <sharedfactor> : slabdata <active_slabs> <num_slabs> <sharedavail> : globalstat <listallocs> <maxobjs> <grown> <reaped> <error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow> : cpustat <allochit> <allocmiss> <freehit> <freemiss>
...
kmalloc-4194304        0      0 4194304    1 1024 : tunables    1    1    0 : slabdata      0      0      0 : globalstat       0      0     0    0    0    0    0    0    0 : cpustat      0      0      0      0
kmalloc-2097152        0      0 2097152    1  512 : tunables    1    1    0 : slabdata      0      0      0 : globalstat       0      0     0    0    0    0    0    0    0 : cpustat      0      0      0      0
kmalloc-1048576        0      0 1048576    1  256 : tunables    1    1    0 : slabdata      0      0      0 : globalstat       0      0     0    0    0    0    0    0    0 : cpustat      0      0      0      0
kmalloc-524288         0      0 524288    1  128 : tunables    1    1    0 : slabdata      0      0      0 : globalstat       0      0     0    0    0    0    0    0    0 : cpustat      0      0      0      0
kmalloc-262144         1      1 262144    1   64 : tunables    1    1    0 : slabdata      1      1      0 : globalstat       1      1     1    0    0    0    0    0    0 : cpustat      0      1      0      0
kmalloc-131072         0      0 131072    1   32 : tunables    8    4    0 : slabdata      0      0      0 : globalstat       0      0     0    0    0    0    0    0    0 : cpustat      0      0      0      0
kmalloc-65536          1      1  65536    1   16 : tunables    8    4    0 : slabdata      1      1      0 : globalstat       2      2     2    1    0    0    0    0    0 : cpustat      0      2      1      0
kmalloc-32768          1      1  32768    1    8 : tunables    8    4    0 : slabdata      1      1      0 : globalstat       2      2     2    1    0    0    0    0    0 : cpustat      0      2      1      0
kmalloc-16384          3      3  16384    1    4 : tunables    8    4    0 : slabdata      3      3      0 : globalstat       4      3     3    0    0    0    0    0    0 : cpustat      0      4      1      0
kmalloc-8192           7      7   8192    1    2 : tunables    8    4    0 : slabdata      7      7      0 : globalstat       9      8     8    1    0    0    0    0    0 : cpustat      0      9      2      0
kmalloc-4096          29     78   4096    1    1 : tunables   24   12    8 : slabdata     29     78      0 : globalstat     105    105   105   10    0   20    0    0    0 : cpustat    878    139    974     34
kmalloc-2048          18     18   4096    1    1 : tunables   24   12    8 : slabdata     18     18      0 : globalstat      19     18    19    1    0    0    0    0    0 : cpustat      4     19      5      0
kmalloc-1024         135    135   4096    1    1 : tunables   24   12    8 : slabdata    135    135      0 : globalstat     135    135   135    0    0    0    0    0    0 : cpustat     42    135     43      0
kmalloc-512          425    425   4096    1    1 : tunables   24   12    8 : slabdata    425    425      0 : globalstat     425    425   425    0    0    0    0    0    0 : cpustat    137    425    137      0
kmalloc-256          112    112   4096    1    1 : tunables   24   12    8 : slabdata    112    112      0 : globalstat     121    119   120    8    0    0    0    0    0 : cpustat    608    121    619      0
kmalloc-192          248    248   4096    1    1 : tunables   24   12    8 : slabdata    248    248      0 : globalstat     248    248   248    0    0    0    0    0    0 : cpustat     11    248     11      0
kmalloc-128          977    977   4096    1    1 : tunables   24   12    8 : slabdata    977    977      0 : globalstat     981    977   981    4    0    0    0    0    0 : cpustat    437    981    443      0
kmalloc-64         26746  26838     64   63    1 : tunables   32   16    8 : slabdata    426    426     48 : globalstat   26845  26838   426    0    0    0    0    0    0 : cpustat  27112   2168   2589     23
kmem_cache           142    142   4096    1    2 : tunables   24   12    8 : slabdata    142    142      0 : globalstat     142    142   142    0    0    0    0    0    0 : cpustat      0    142      0      0

 

8 kmem相關Tracepoint

kmem跟蹤事件主要跟蹤內核slab和page的分配和釋放行為,主要可以分為5大部分。

這些events的詳細解釋參考:Documentation/trace/events-kmem.txt

8.1 Slab allocation of small objects of unknown type (kmalloc)

哪些函數調用?Trace什么樣子?有什么用途?

kfree---------------------------kfree
kmalloc-------------------------kmalloc/__kmalloc等類kmalloc函數
kmalloc_node--------------------kmalloc_node/__kmalloc_node等類kmalloc_node函數

kmalloc_node和kmalloc的區別是多了個node參數,對NUMA系統來說需要node進行區分。在非NUMA系統,意義不大。

相關Log如下,從中可以看出調用者call_site,分配內存地址ptr,請求分配大小bytes_req,實際分配大小bytes_alloc,分配掩碼gfp_flags。

bytes_alloc>=bytes_req,實際分配大小會向上取整到所屬kmalloc-xxx描述符的對象大小(例如下面104B落入kmalloc-128,200B落入kmalloc-256,12B落入最小的kmalloc-64);但是call_site是個地址,可讀性較差。

# tracer: nop
#
# entries-in-buffer/entries-written: 14/14   #P:4
#
#                              _-----=> irqs-off
#                             / _----=> need-resched
#                            | / _---=> hardirq/softirq
#                            || / _--=> preempt-depth
#                            ||| /     delay
#           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
#              | |       |   ||||       |         |
              sh-647   [001] ....   843.814154: kmalloc: call_site=c012fbec ptr=ee042600 bytes_req=200 bytes_alloc=256 gfp_flags=GFP_KERNEL|GFP_ZERO
              sh-647   [001] ....   843.815146: kmalloc: call_site=c0175e4c ptr=eeab2580 bytes_req=104 bytes_alloc=128 gfp_flags=GFP_KERNEL
              sh-647   [001] ....   843.815185: kmalloc: call_site=c0174818 ptr=ee042a00 bytes_req=224 bytes_alloc=256 gfp_flags=GFP_KERNEL
              sh-647   [001] ....   843.816017: kfree: call_site=c0176744 ptr=  (null)
              sh-647   [001] ....   843.816029: kfree: call_site=c017674c ptr=ee042a00
              sh-647   [001] ....   843.816129: kfree: call_site=c0175eb4 ptr=eeab2580
              sh-647   [001] ....   843.816143: kfree: call_site=c012ebdc ptr=ee042600
              sh-647   [001] ....   843.816149: kfree: call_site=c01300e0 ptr=  (null)
              sh-647   [001] ....   843.816776: kmalloc: call_site=c0184928 ptr=ee9994c0 bytes_req=12 bytes_alloc=64 gfp_flags=GFP_KERNEL
              sh-647   [001] ....   843.816868: kfree: call_site=c014ff80 ptr=ee9994c0
...

 

對call_site進行一下簡單的改造,使其可以直接打印字符串:

diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
old mode 100644
new mode 100755
index 4ad10ba..5c404bb
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -34,7 +34,7 @@ DECLARE_EVENT_CLASS(kmem_alloc,
                __entry->gfp_flags      = gfp_flags;
        ),
 
-       TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s",
+       TP_printk("call_site=%pf ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s",
                __entry->call_site,
                __entry->ptr,
                __entry->bytes_req,
@@ -87,7 +87,7 @@ DECLARE_EVENT_CLASS(kmem_alloc_node,
                __entry->node           = node;
        ),
 
-       TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d",
+       TP_printk("call_site=%pf ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d",
                __entry->call_site,
                __entry->ptr,
                __entry->bytes_req,
@@ -130,7 +130,7 @@ DECLARE_EVENT_CLASS(kmem_free,
                __entry->ptr            = ptr;
        ),
 
-       TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr)
+       TP_printk("call_site=%pf ptr=%p", __entry->call_site, __entry->ptr)
 );
 
 DEFINE_EVENT(kmem_free, kfree,

 

修改后的結果如下,ptr是kmalloc和kfree的聯系樞紐,兩者必須成對,不然就可能存在內存泄露。

同時可以看到同一個ptr的kmalloc和kfree的call_site,對此內存的申請釋放路徑就有個大概的了解。

# tracer: nop
#
# entries-in-buffer/entries-written: 15/15   #P:4
#
#                              _-----=> irqs-off
#                             / _----=> need-resched
#                            | / _---=> hardirq/softirq
#                            || / _--=> preempt-depth
#                            ||| /     delay
#           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
#              | |       |   ||||       |         |
              sh-640   [000] ....    97.451247: kmalloc: call_site=tracepoint_probe_register ptr=ee3ef400 bytes_req=24 bytes_alloc=64 gfp_flags=GFP_KERNEL
              sh-646   [001] ....   102.511304: kmalloc: call_site=do_execveat_common ptr=ee0dd400 bytes_req=200 bytes_alloc=256 gfp_flags=GFP_KERNEL|GFP_ZERO
              sh-646   [001] ....   102.513041: kmalloc: call_site=load_elf_binary ptr=eead9880 bytes_req=104 bytes_alloc=128 gfp_flags=GFP_KERNEL
              sh-646   [001] ....   102.513149: kmalloc: call_site=load_elf_phdrs ptr=ee0dd000 bytes_req=224 bytes_alloc=256 gfp_flags=GFP_KERNEL
              sh-646   [001] ....   102.513831: kfree: call_site=load_elf_binary ptr=  (null)
              sh-646   [001] ....   102.513878: kfree: call_site=load_elf_binary ptr=ee0dd000
              sh-646   [001] ....   102.513981: kfree: call_site=load_elf_binary ptr=eead9880
              sh-646   [001] ....   102.513996: kfree: call_site=free_bprm ptr=ee0dd400
              sh-646   [001] ....   102.514002: kfree: call_site=do_execveat_common ptr=  (null)
              sh-646   [001] ....   102.514629: kmalloc: call_site=proc_self_follow_link ptr=ed4aaf80 bytes_req=12 bytes_alloc=64 gfp_flags=GFP_KERNEL
              sh-646   [001] ....   102.514721: kfree: call_site=kfree_put_link ptr=ed4aaf80
...

 

所以基於kmalloc/kmalloc_node/kfree這幾個events,可以判斷一個進程分配了多少內存;在運行過程中是否存在內存泄露,即kmalloc沒有對應的kfree。

 

8.2 Slab allocation of small objects of known type

 kmem_cache_alloc/kmem_cache_alloc_node/kmem_cache_free基本上和類kmalloc函數一一對應,兩者的使用和表達的含義基本一致。只是對應的分配函數不一樣。

kmem_cache_alloc------------------------kmem_cache_alloc
kmem_cache_alloc_node-------------------kmem_cache_alloc_node
kmem_cache_free-------------------------kmem_cache_free

 kmem_cache_alloc類事件的用途和kmalloc類基本差不多,可以通過call_site找到調用者;可以通過kmem_cache_alloc和kmem_cache_free是否成對出現而判斷內存泄露問題。

實例如下:

# tracer: nop
#
# entries-in-buffer/entries-written: 80/80   #P:4
#
#                              _-----=> irqs-off
#                             / _----=> need-resched
#                            | / _---=> hardirq/softirq
#                            || / _--=> preempt-depth
#                            ||| /     delay
#           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
#              | |       |   ||||       |         |
              sh-640   [000] ....   598.446568: kmem_cache_alloc: call_site=getname_flags ptr=ed4af000 bytes_req=4096 bytes_alloc=4096 gfp_flags=GFP_KERNEL
              sh-640   [000] ....   598.446611: kmem_cache_alloc: call_site=get_empty_filp ptr=eeb17ac0 bytes_req=192 bytes_alloc=192 gfp_flags=GFP_KERNEL|GFP_ZERO
              sh-640   [000] ....   598.446673: kmem_cache_alloc: call_site=__d_alloc ptr=ee445660 bytes_req=136 bytes_alloc=136 gfp_flags=GFP_KERNEL
              sh-640   [000] ....   598.446751: kmem_cache_free: call_site=putname ptr=ed4af000
              sh-640   [000] ....   598.446808: kmem_cache_alloc: call_site=SyS_getcwd ptr=ed4af000 bytes_req=4096 bytes_alloc=4096 gfp_flags=GFP_KERNEL
              sh-640   [000] ....   598.446839: kmem_cache_free: call_site=SyS_getcwd ptr=ed4af000
              sh-640   [000] ....   601.702831: kmem_cache_alloc: call_site=getname_flags ptr=ed4af000 bytes_req=4096 bytes_alloc=4096 gfp_flags=GFP_KERNEL
              sh-640   [000] ....   601.702884: kmem_cache_alloc: call_site=get_empty_filp ptr=eeb17ac0 bytes_req=192 bytes_alloc=192 gfp_flags=GFP_KERNEL|GFP_ZERO
              sh-640   [000] ....   601.703028: kmem_cache_free: call_site=putname ptr=ed4af000
              sh-640   [000] ....   601.703560: kmem_cache_alloc: call_site=copy_process.part.12 ptr=ee9af080 bytes_req=1280 bytes_alloc=1280 gfp_flags=GFP_KERNEL
...

 

8.3 Page allocation

 

mm_page_alloc
mm_page_alloc_zone_locked
mm_page_free
mm_page_free_batched

 

 

8.4 Per-CPU Allocator Activity

 

mm_page_alloc_zone_locked
mm_page_pcpu_drain

 

 

8.5 External Fragmentation

 

mm_page_alloc_extfrag

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM