In the kernel, memory allocation ultimately goes through the buddy system interfaces to perform the actual physical page allocation, and one important entry point is alloc_page. This article walks through the main flow of alloc_page and what each part does: the normal allocation path, and how the kernel reacts when pages run short. Let's start by locating the core call:
```c
#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
```
order is the allocation order, i.e. the request covers 2^order contiguous pages:
```c
#define alloc_pages(gfp_mask, order) \
	alloc_pages_node(numa_node_id(), gfp_mask, order)
```
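As a quick usage sketch (illustrative only, not part of the call chain analyzed here): allocating and freeing an order-2 block, i.e. four contiguous page frames:

```c
#include <linux/gfp.h>
#include <linux/errno.h>

static int demo_alloc(void)
{
	/* allocate 2^2 = 4 contiguous page frames from the current node */
	struct page *pages = alloc_pages(GFP_KERNEL, 2);

	if (!pages)
		return -ENOMEM;
	/* ... use the pages ... */
	__free_pages(pages, 2);	/* free with the same order */
	return 0;
}
```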
nid selects the NUMA node to allocate from; if no node is specified, the current node is used by default:
```c
static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
					    unsigned int order)
{
	/* Unknown node is current node: allocate from the current node */
	if (nid < 0)
		nid = numa_node_id();

	return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
}
```
A zonelist is an ordered list of zones. There are two kinds: a node-local one, containing only the zones of the local node, and a global one, containing the zones of every node. The next article will cover these data structures in detail.
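For reference, node_zonelist() simply picks one of the node's two zonelists depending on __GFP_THISNODE; a lightly abridged sketch of the helpers in include/linux/gfp.h:

```c
/* 0 selects the node's full (fallback) zonelist,
 * 1 the __GFP_THISNODE zonelist restricted to the node itself */
static inline int gfp_zonelist(gfp_t flags)
{
	if (IS_ENABLED(CONFIG_NUMA) && unlikely(flags & __GFP_THISNODE))
		return 1;
	return 0;
}

static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
{
	return NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags);
}
```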
```c
static inline struct page *
__alloc_pages(gfp_t gfp_mask, unsigned int order,
	      struct zonelist *zonelist)
{
	return __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL);
}
```
__alloc_pages_nodemask is where the real work begins. Broadly, the function takes two steps:
1. A direct (fast-path) allocation attempt.
2. If that fails, fall back to another approach, the slowpath, for further handling.
The first attempt calls static struct page *get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, struct zonelist *zonelist, int high_zoneidx, int alloc_flags, struct zone *preferred_zone, int migratetype). Its core mechanism is to walk the zones on the zonelist until one can supply a page. The code itself is fairly easy to follow, so the focus here is on the mechanisms involved. The function does two things: 1. find a suitable zone in the zonelist; 2. allocate pages from that zone. The former is handled by a loop, the latter by static inline struct page *buffered_rmqueue(struct zone *preferred_zone, struct zone *zone, int order, gfp_t gfp_flags, int migratetype). A condensed sketch of the loop follows.
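A heavily abridged sketch of the zone-selection loop (the real function in mm/page_alloc.c also handles cpusets, zone_reclaim and dirty limits):

```c
for_each_zone_zonelist_nodemask(zone, z, zonelist,
				high_zoneidx, nodemask) {
	if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
		unsigned long mark;

		/* pick the watermark the caller asked for */
		mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
		if (!zone_watermark_ok(zone, order, mark,
				       zone_idx(preferred_zone),
				       alloc_flags))
			continue;	/* zone too low, try the next one */
	}

	page = buffered_rmqueue(preferred_zone, zone, order,
				gfp_mask, migratetype);
	if (page)
		break;
}
```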
During zone selection, the normal case runs a series of checks to ensure the current zone has enough free pages for the request. What is the abnormal case, then? Allocations carrying the ALLOC_NO_WATERMARKS flag, which splits the logic into two situations. This is where watermarks come in, colloquially the allocation water level, of which there are three:
- #define ALLOC_WMARK_MIN WMARK_MIN
- #define ALLOC_WMARK_LOW WMARK_LOW
- #define ALLOC_WMARK_HIGH WMARK_HIGH
Before allocating, the caller normally specifies which watermark must be satisfied for the allocation to proceed, or requests allocation regardless of watermarks, which corresponds to the ALLOC_NO_WATERMARKS flag. The zone structure has a vm_stat field, an array recording the number of pages in each state, including free pages under NR_FREE_PAGES. An allocation carrying a watermark flag must verify that the free page count exceeds the corresponding watermark; only then is allocation allowed. Otherwise pages must be reclaimed as the situation requires, and if reclaim is impossible, or still insufficient afterwards, the function simply returns. In urgent contexts, ALLOC_NO_WATERMARKS can be specified; then watermarks are not checked at all and buffered_rmqueue is called directly. A minimal sketch of the watermark check follows.
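A minimal sketch of what the check boils down to (the real __zone_watermark_ok() additionally subtracts the free pages of every order below the requested one and honors lowmem_reserve):

```c
/* Simplified: would taking 2^order pages still leave at least
 * 'mark' free pages in this zone? */
static bool watermark_ok_sketch(struct zone *z, unsigned int order,
				unsigned long mark)
{
	long free_pages = zone_page_state(z, NR_FREE_PAGES);

	return free_pages - (1L << order) >= (long)mark;
}
```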
buffered_rmqueue does not always allocate straight from the buddy system. To speed up allocation, each CPU also maintains a page-frame cache, managed through per_cpu_pageset:
```c
struct per_cpu_pageset {
	struct per_cpu_pages pcp;
#ifdef CONFIG_NUMA
	s8 expire;
#endif
#ifdef CONFIG_SMP
	s8 stat_threshold;
	s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
#endif
};
```
Within it, pcp maintains lists of pages of various kinds, classified essentially by mobility (migrate type).
```c
struct per_cpu_pages {
	int count;		/* number of pages in the list */
	int high;		/* high watermark, emptying needed */
	int batch;		/* chunk size for buddy add/remove */

	/* Lists of pages, one per migrate type stored on the pcp-lists:
	 * MIGRATE_UNMOVABLE, MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE;
	 * MIGRATE_PCPTYPES is the number of types on the pcp lists
	 * (MIGRATE_RESERVE and beyond are not cached here) */
	struct list_head lists[MIGRATE_PCPTYPES];
};
```
count is the total number of pages across the lists; high is the threshold above which the cache gets drained; batch is how many pages are requested from the buddy system at a time to refill the cache when it runs low.
The core allocation logic for single pages looks like this:
```c
if (likely(order == 0)) {
	struct per_cpu_pages *pcp;
	struct list_head *list;

	local_irq_save(flags);
	/* this CPU's page-frame cache */
	pcp = &this_cpu_ptr(zone->pageset)->pcp;
	/* cache list for the requested migrate type */
	list = &pcp->lists[migratetype];
	/* if the list is empty, try refilling it from the buddy system */
	if (list_empty(list)) {
		pcp->count += rmqueue_bulk(zone, 0,
				pcp->batch, list,
				migratetype, cold);
		/* still empty: the allocation fails */
		if (unlikely(list_empty(list)))
			goto failed;
	}

	/* the list is doubly linked: if cold is set, allocate from the
	 * tail, otherwise from the head */
	if (cold)
		page = list_entry(list->prev, struct page, lru);
	else
		page = list_entry(list->next, struct page, lru);

	/* unlink the page from the cache */
	list_del(&page->lru);
	pcp->count--;
}
```
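For completeness, the order > 0 branch (not quoted above) bypasses the per-CPU cache entirely and takes a block straight off the buddy free lists under the zone lock; roughly:

```c
else {
	/* higher-order requests go directly to the buddy system */
	spin_lock_irqsave(&zone->lock, flags);
	page = __rmqueue(zone, order, migratetype);
	spin_unlock(&zone->lock);
	if (!page)
		goto failed;
	/* account the removed pages against the zone's free counters */
	__mod_zone_freepage_state(zone, -(1 << order),
				  get_pageblock_migratetype(page));
}
```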
The slow allocation path is handled by the __alloc_pages_slowpath() function (mm/page_alloc.c):
```c
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
	struct zonelist *zonelist, enum zone_type high_zoneidx,
	nodemask_t *nodemask, struct zone *preferred_zone,
	int migratetype)
{
	const gfp_t wait = gfp_mask & __GFP_WAIT;
	struct page *page = NULL;
	int alloc_flags;
	unsigned long pages_reclaimed = 0;
	unsigned long did_some_progress;
	bool sync_migration = false;
	bool deferred_compaction = false;
	bool contended_compaction = false;

	/*
	 * In the slowpath, we sanity check order to avoid ever trying to
	 * reclaim >= MAX_ORDER areas which will never succeed. Callers may
	 * be using allocators in order of preference for an area that is
	 * too large.
	 */
	if (order >= MAX_ORDER) {
		WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
		return NULL;
	}

	/*
	 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
	 * __GFP_NOWARN set) should not cause reclaim since the subsystem
	 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
	 * using a larger set of nodes after it has established that the
	 * allowed per node queues are empty and that nodes are
	 * over allocated.
	 */
	if (IS_ENABLED(CONFIG_NUMA) &&
			(gfp_mask & GFP_THISNODE) == GFP_THISNODE)
		goto nopage;

	/* if waking kswapd threads is allowed at this point, wake them all */
restart:
	if (!(gfp_mask & __GFP_NO_KSWAPD))
		wake_all_kswapd(order, zonelist, high_zoneidx,
						zone_idx(preferred_zone));

	/*
	 * OK, we're below the kswapd watermark and have kicked background
	 * reclaim. Now things get more complex, so set up alloc_flags according
	 * to how we want to proceed.
	 */
	alloc_flags = gfp_to_alloc_flags(gfp_mask);

	/*
	 * Find the true preferred zone if the allocation is unconstrained by
	 * cpusets.
	 */
	/* if no cpuset is specified, pick the best zone from the zonelist */
	if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
		first_zones_zonelist(zonelist, high_zoneidx, NULL,
					&preferred_zone);

rebalance:
	/* This is the last chance, in general, before the goto nopage. */
	/* retry the allocation */
	page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
			high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
			preferred_zone, migratetype);
	if (page)
		goto got_pg;

	/* Allocate without watermarks if the context allows */
	/* if the zone watermarks may be ignored */
	if (alloc_flags & ALLOC_NO_WATERMARKS) {
		/*
		 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
		 * the allocation is high priority and these type of
		 * allocations are system rather than user orientated
		 */
		zonelist = node_zonelist(numa_node_id(), gfp_mask);

		/* high-priority allocation: walk the whole zonelist */
		page = __alloc_pages_high_priority(gfp_mask, order,
				zonelist, high_zoneidx, nodemask,
				preferred_zone, migratetype);
		if (page) {
			goto got_pg;
		}
	}

	/* Atomic allocations - we can't balance anything */
	if (!wait)
		goto nopage;

	/* Avoid recursion of direct reclaim */
	if (current->flags & PF_MEMALLOC)
		goto nopage;

	/* Avoid allocations with no watermarks from looping endlessly */
	if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
		goto nopage;

	/*
	 * Try direct compaction. The first pass is asynchronous. Subsequent
	 * attempts after direct reclaim are synchronous
	 */
	page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
					high_zoneidx, nodemask, alloc_flags,
					preferred_zone, migratetype,
					sync_migration, &contended_compaction,
					&deferred_compaction,
					&did_some_progress);
	if (page)
		goto got_pg;
	sync_migration = true;

	/*
	 * If compaction is deferred for high-order allocations, it is because
	 * sync compaction recently failed. In this is the case and the caller
	 * requested a movable allocation that does not heavily disrupt the
	 * system then fail the allocation instead of entering direct reclaim.
	 */
	if ((deferred_compaction || contended_compaction) &&
						(gfp_mask & __GFP_NO_KSWAPD))
		goto nopage;

	/* Try direct reclaim and then allocating */
	/* if compaction still yields nothing usable, reclaim pages */
	page = __alloc_pages_direct_reclaim(gfp_mask, order,
					zonelist, high_zoneidx,
					nodemask,
					alloc_flags, preferred_zone,
					migratetype, &did_some_progress);
	if (page)
		goto got_pg;

	/*
	 * If we failed to make any progress reclaiming, then we are
	 * running out of options and have to consider going OOM
	 */
	if (!did_some_progress) {
		if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
			if (oom_killer_disabled)
				goto nopage;
			/* Coredumps can quickly deplete all memory reserves */
			if ((current->flags & PF_DUMPCORE) &&
			    !(gfp_mask & __GFP_NOFAIL))
				goto nopage;
			page = __alloc_pages_may_oom(gfp_mask, order,
					zonelist, high_zoneidx,
					nodemask, preferred_zone,
					migratetype);
			if (page)
				goto got_pg;

			if (!(gfp_mask & __GFP_NOFAIL)) {
				/*
				 * The oom killer is not called for high-order
				 * allocations that may fail, so if no progress
				 * is being made, there are no other options and
				 * retrying is unlikely to help.
				 */
				if (order > PAGE_ALLOC_COSTLY_ORDER)
					goto nopage;
				/*
				 * The oom killer is not called for lowmem
				 * allocations to prevent needlessly killing
				 * innocent tasks.
				 */
				if (high_zoneidx < ZONE_NORMAL)
					goto nopage;
			}

			goto restart;
		}
	}

	/* Check if we should retry the allocation */
	pages_reclaimed += did_some_progress;
	if (should_alloc_retry(gfp_mask, order, did_some_progress,
						pages_reclaimed)) {
		/* Wait for some write requests to complete then retry */
		wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
		goto rebalance;
	} else {
		/*
		 * High-order allocations do not necessarily loop after
		 * direct reclaim and reclaim/compaction depends on compaction
		 * being called after reclaim so call directly if necessary
		 */
		page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
					high_zoneidx, nodemask, alloc_flags,
					preferred_zone, migratetype,
					sync_migration, &contended_compaction,
					&deferred_compaction,
					&did_some_progress);
		if (page)
			goto got_pg;
	}

nopage:
	warn_alloc_failed(gfp_mask, order, NULL);
	return page;
got_pg:
	if (kmemcheck_enabled)
		kmemcheck_pagealloc_alloc(page, order, gfp_mask);

	return page;
}
```
This function is the fallback when the first allocation attempt fails. A first-attempt failure means either the caches were short or there genuinely were no free pages left. So what can still be done? As you might guess: reclaim memory! Reclaim is indeed the main line of attack here. Looking at the code, the first call in __alloc_pages_slowpath is wake_all_kswapd. You know kswapd: it is responsible for swapping physical pages out, and runs periodically to keep physical pages available. But periodic execution cannot guarantee sufficient memory at every moment, so here the kernel wakes all the kswapd daemons to reclaim physical pages. Page reclaim involves much more machinery, which this article won't dig into; it deserves a separate analysis. With the background reclaim threads woken, the function retries the allocation via get_page_from_freelist, but this time with a notable twist: alloc_flags & ~ALLOC_NO_WATERMARKS, i.e. this attempt must respect the watermarks, no special treatment. If that yields a page, we're done. If not, the requirements are lowered: the watermark limits may be ignored, and __alloc_pages_high_priority is called. And if even that fails? Check the wait flag: if the caller may sleep, continue onward; otherwise return failure, as there is nothing else to do.
If reclaiming pages doesn't work either, how about not swapping anything out, but compacting the pages already in memory instead? That compromise is what __alloc_pages_direct_compact implements. Compaction is essentially the last resort; if it fails too, we've hit rock bottom and the OOM killer is not far away...
The function has two main steps:
1. Compact the pages within the zones: try_to_compact_pages.
2. Retry the allocation: get_page_from_freelist.
Allocation was covered above, so let's focus on try_to_compact_pages, which compacts each zone in zonelist order. The core loop:
```c
for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
								nodemask) {
	int status;

	status = compact_zone_order(zone, order, gfp_mask, sync,
						contended);
	rc = max(status, rc);

	/* If a normal allocation would succeed, stop compacting */
	if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0,
			      alloc_flags))
		break;
}
```
The heavy lifting happens in compact_zone_order:
```c
static unsigned long compact_zone_order(struct zone *zone,
				 int order, gfp_t gfp_mask,
				 bool sync, bool *contended)
{
	unsigned long ret;
	/* the compaction controller */
	struct compact_control cc = {
		.nr_freepages = 0,
		.nr_migratepages = 0,
		.order = order,
		.migratetype = allocflags_to_migratetype(gfp_mask),
		.zone = zone,
		.sync = sync,
	};
	INIT_LIST_HEAD(&cc.freepages);
	INIT_LIST_HEAD(&cc.migratepages);

	/* run the compaction */
	ret = compact_zone(zone, &cc);

	VM_BUG_ON(!list_empty(&cc.freepages));
	VM_BUG_ON(!list_empty(&cc.migratepages));

	*contended = cc.contended;
	return ret;
}
```
There is a structure here we might call the compaction controller; it records the parameters needed during compaction. Once it is set up, compact_zone is called. That function is not quoted in full; see mm/compaction.c if you're interested. Inside it, compaction_suitable is called first to check whether the current zone is even worth compacting, i.e. whether compaction could possibly satisfy the request. If so, compaction proceeds; otherwise the zone is simply skipped. The function returns one of three values:
- COMPACT_SKIPPED - If there are too few free pages for compaction
- COMPACT_PARTIAL - If the allocation would succeed without compaction
- COMPACT_CONTINUE - If compaction should run now
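A lightly abridged sketch of compaction_suitable() (mm/compaction.c). Note the 2UL << order slack: during migration, page copies briefly raise the memory footprint, so compaction itself needs free order-0 pages to work with:

```c
unsigned long compaction_suitable(struct zone *zone, int order)
{
	int fragindex;
	unsigned long watermark;

	/* order == -1 means compaction was requested explicitly via
	 * /proc/sys/vm/compact_memory: always run */
	if (order == -1)
		return COMPACT_CONTINUE;

	/* compaction needs free order-0 pages as migration targets */
	watermark = low_wmark_pages(zone) + (2UL << order);
	if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
		return COMPACT_SKIPPED;

	/* the fragmentation index distinguishes "not enough memory"
	 * (compaction cannot help) from "memory too fragmented" */
	fragindex = fragmentation_index(zone, order);
	if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
		return COMPACT_SKIPPED;

	/* the allocation would already succeed without compacting */
	if (fragindex == -1000 && zone_watermark_ok(zone, order,
						    watermark, 0, 0))
		return COMPACT_PARTIAL;

	return COMPACT_CONTINUE;
}
```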
The return values are fairly self-explanatory. The one puzzling point: since COMPACT_PARTIAL means the allocation would succeed even without compaction, why not return success right away and retry the allocation, instead of continuing down the flow below? Frustrating... As the explanations above suggest, compaction here does not squeeze several pages into one; it is really defragmentation: movable pages are packed together to open up larger runs of contiguous space, raising the odds of satisfying the request. Next comes the compaction itself, but first some controller parameters are set up, so the structure is worth a look:
```c
struct compact_control {
	struct list_head freepages;	/* List of free pages to migrate to */
	struct list_head migratepages;	/* List of pages being migrated */
	unsigned long nr_freepages;	/* Number of isolated free pages */
	unsigned long nr_migratepages;	/* Number of pages to migrate */
	unsigned long free_pfn;		/* isolate_freepages search base */
	unsigned long migrate_pfn;	/* isolate_migratepages search base */
	bool sync;			/* Synchronous migration */
	bool ignore_skip_hint;		/* Scan blocks even if marked skip */
	bool finished_update_free;	/* True when the zone cached pfns are
					 * no longer being updated */
	bool finished_update_migrate;

	int order;			/* order a direct compactor needs */
	int migratetype;		/* MOVABLE, RECLAIMABLE etc */
	struct zone *zone;
	bool contended;			/* True if a lock was contended */
};
```
If the field comments leave you puzzled, the comment above the structure definition helps:
```c
/*
 * compact_control is used to track pages being migrated and the free pages
 * they are being migrated to during memory compaction. The free_pfn starts
 * at the end of a zone and migrate_pfn begins at the start. Movable pages
 * are moved to the end of a zone during a compaction run and the run
 * completes when free_pfn <= migrate_pfn
 */
```
In short: compact_control tracks the pages being migrated and the free pages they are migrated into during compaction. free_pfn starts at the end of the zone and migrate_pfn at its start; during a run, movable pages are moved toward the zone's tail. Put plainly, free_pfn scans backward looking for free pages, migrate_pfn scans forward looking for movable pages, and once free_pfn <= migrate_pfn, compaction is done. The toy program below illustrates the invariant.
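A purely illustrative toy (not kernel code; all values made up) of how the two scanners converge:

```c
#include <stdio.h>

int main(void)
{
	unsigned long migrate_pfn = 0x10000;	/* zone start pfn */
	unsigned long free_pfn    = 0x18000;	/* zone end pfn */
	const unsigned long step  = 512;	/* one pageblock of pfns */

	/* the migrate scanner walks up collecting movable pages, the free
	 * scanner walks down collecting free targets; the run ends when
	 * they meet or cross */
	while (migrate_pfn < free_pfn) {
		migrate_pfn += step;
		free_pfn -= step;
	}
	printf("scanners met: compaction run complete\n");
	return 0;
}
```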
Before compaction runs, the controller's scan positions are initialized from the zone's cached values and clamped to the zone's bounds:
```c
cc->migrate_pfn = zone->compact_cached_migrate_pfn;
cc->free_pfn = zone->compact_cached_free_pfn;
if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
	cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
	zone->compact_cached_free_pfn = cc->free_pfn;
}
if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
	cc->migrate_pfn = start_pfn;
	zone->compact_cached_migrate_pfn = cc->migrate_pfn;
}
```
Once the compaction controller is set up, migrate_prep_local is called; its main job is to call lru_add_drain to flush the current CPU's LRU caches onto the LRU lists. The while loop that follows is the body of compaction; here is the source:
```c
while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
	unsigned long nr_migrate, nr_remaining;
	int err;

	switch (isolate_migratepages(zone, cc)) {
	case ISOLATE_ABORT:
		ret = COMPACT_PARTIAL;
		putback_movable_pages(&cc->migratepages);
		cc->nr_migratepages = 0;
		goto out;
	case ISOLATE_NONE:
		continue;
	case ISOLATE_SUCCESS:
		;
	}

	nr_migrate = cc->nr_migratepages;
	err = migrate_pages(&cc->migratepages, compaction_alloc,
			(unsigned long)cc,
			cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
			MR_COMPACTION);
	update_nr_listpages(cc);
	nr_remaining = cc->nr_migratepages;

	trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
						nr_remaining);

	/* Release isolated pages not migrated */
	if (err) {
		putback_movable_pages(&cc->migratepages);
		cc->nr_migratepages = 0;
		if (err == -ENOMEM) {
			ret = COMPACT_PARTIAL;
			goto out;
		}
	}
}
```
The loop condition is evaluated by compact_finished, which decides whether compaction is done. Its checks run in this order (a condensed sketch of the function follows the list):
1. If a fatal signal is pending, return COMPACT_PARTIAL immediately; otherwise go to 2.
2. Check whether the scan is complete, i.e. whether cc->free_pfn <= cc->migrate_pfn holds; if so, return COMPACT_COMPLETE, otherwise go to 3.
3. If cc->order == -1, return COMPACT_CONTINUE to keep compacting; otherwise go to 4.
4. Check whether the zone's free pages satisfy the low watermark; if not, return COMPACT_CONTINUE, otherwise go to 5.
5. Starting from cc->order, walk the zone->free_area array; for each block size, check whether the free list matching cc->migratetype is non-empty. If one is, return COMPACT_PARTIAL; otherwise go to 6.
6. Return COMPACT_CONTINUE.
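The list maps directly onto the function; a condensed sketch of compact_finished() (mm/compaction.c, slightly abridged):

```c
static int compact_finished(struct zone *zone,
			    struct compact_control *cc)
{
	unsigned int order;
	unsigned long watermark;

	/* 1. bail out on a pending fatal signal */
	if (fatal_signal_pending(current))
		return COMPACT_PARTIAL;

	/* 2. the run completes when the two scanners meet */
	if (cc->free_pfn <= cc->migrate_pfn)
		return COMPACT_COMPLETE;

	/* 3. order == -1: full compaction was requested, keep going */
	if (cc->order == -1)
		return COMPACT_CONTINUE;

	/* 4. not finished while the low watermark is unmet */
	watermark = low_wmark_pages(zone);
	if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
		return COMPACT_CONTINUE;

	/* 5. is a suitable block already free? */
	for (order = cc->order; order < MAX_ORDER; order++) {
		struct free_area *area = &zone->free_area[order];

		/* Job done if page is free of the right migratetype */
		if (!list_empty(&area->free_list[cc->migratetype]))
			return COMPACT_PARTIAL;

		/* Job done if allocation would set block type */
		if (cc->order >= pageblock_order && area->nr_free)
			return COMPACT_PARTIAL;
	}

	/* 6. otherwise keep compacting */
	return COMPACT_CONTINUE;
}
```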
Inside the loop body, isolate_migratepages is the workhorse. It calls isolate_migratepages_range, which isolates the relevant pages, i.e. detaches them from the LRU lists and adds them to the cc->migratepages list. The actual moving is done by migrate_pages, which relocates the just-isolated pages to their new homes; at its core, it tears down the current mappings and establishes mappings to the new pages. update_nr_listpages then refreshes the controller's counters. On error, putback_movable_pages re-attaches the isolated pages so the system keeps running normally.
After the compaction function returns, the allocation is retried. If a page is obtained, all proceeds as usual; if not, there is nothing left to do but report back up the chain and let the caller deal with it...
Immanuel.
Reference: Linux 3.10.1 source code
