內存管理中關於Movable的理解


內核中的管理區

內核中定義了如下一些管理區zone:

/*
 * Memory zone identifiers. Which enumerators exist depends on the
 * CONFIG_ZONE_DMA / CONFIG_ZONE_DMA32 / CONFIG_HIGHMEM options below;
 * ZONE_NORMAL and ZONE_MOVABLE are always present.
 */
enum zone_type {
#ifdef CONFIG_ZONE_DMA
    /*
     * ZONE_DMA is used when there are devices that are not able
     * to do DMA to all of addressable memory (ZONE_NORMAL). Then we
     * carve out the portion of memory that is needed for these devices.
     * The range is arch specific.
     *
     * Some examples
     *
     * Architecture     Limit
     * ---------------------------
     * parisc, ia64, sparc  <4G
     * s390         <2G
     * arm          Various
     * alpha        Unlimited or 0-16MB.
     *
     * i386, x86_64 and multiple other arches
     *          <16M.
     */
    ZONE_DMA,
#endif
#ifdef CONFIG_ZONE_DMA32
    /*
     * x86_64 needs two ZONE_DMAs because it supports devices that are
     * only able to do DMA to the lower 16M but also 32 bit devices that
     * can only do DMA areas below 4G.
     */
    ZONE_DMA32,
#endif
    /*
     * Normal addressable memory is in ZONE_NORMAL. DMA operations can be
     * performed on pages in ZONE_NORMAL if the DMA devices support
     * transfers to all addressable memory.
     */
    ZONE_NORMAL,
#ifdef CONFIG_HIGHMEM
    /*
     * A memory area that is only addressable by the kernel through
     * mapping portions into its own address space. This is for example
     * used by i386 to allow the kernel to address the memory beyond
     * 900MB. The kernel will set up special mappings (page
     * table entries on i386) for each page that the kernel needs to
     * access.
     */
    ZONE_HIGHMEM,
#endif
    /*
     * NOTE(review): no upstream comment is quoted for this zone. Per the
     * accompanying article text it is meant to contain only movable
     * (migratable) pages and exists mainly to support memory hotplug;
     * confirm against the kernel version in use.
     */
    ZONE_MOVABLE,
    /*
     * Not a zone: as the last enumerator, its value equals the number of
     * zone types enabled above.
     */
    __MAX_NR_ZONES
};

  • ZONE_DMA
    由於有些設備無法通過DMA訪問全部可尋址內存,因此特意划分出這一塊內存,專門供這類受限設備的DMA分配使用。比如x86架構此區域為0-16M。
  • ZONE_NORMAL
    NORMAL區域是直接映射區。
  • ZONE_HIGHMEM
    高端內存管理區,申請的內存,需要內核進行map后才能訪問。對於64bit Arch架構,我們一般不需要高端內存區,因為地址空間足夠映射所有的物理內存。
  • ZONE_MOVABLE
    這個區域是一個特殊的存在,主要是為了支持memory hotplug功能,所以MOVABLE表示可移除,其實它也表示可遷移。

簡單來說,可遷移的頁面不一定都在ZONE_MOVABLE中,但是ZONE_MOVABLE中的頁面必須都是可遷移的,我們通過查看/proc/pagetypeinfo來看下實例:

xie:/proc # cat pagetypeinfo                                                 
Page block order: 10
Pages per block:  1024

Free pages count per migrate type at order       0      1      2      3      4      5      6      7      8      9     10 
Node    0, zone      DMA, type    Unmovable     76     50     24     20     27     25     19      3      1      2      0 
Node    0, zone      DMA, type      Movable    117     35     28    172    281     93     49     21      7      4      4 
Node    0, zone      DMA, type  Reclaimable      0      3      1      0      0      0      0      1      0      1      0 
Node    0, zone      DMA, type          CMA   3380   1798    856    386    152     55     21      8      4      0      0 
Node    0, zone      DMA, type   HighAtomic      0      0      0      0      0      0      0      0      0      0      0 
Node    0, zone      DMA, type      Isolate      0      0      0      0      0      0      0      0      0      0      0 
Node    0, zone   Normal, type    Unmovable    521    654    531    286    132     52     15      2      1      4      0 
Node    0, zone   Normal, type      Movable      1      8     21     21      1      1      5      3      1      0      0 
Node    0, zone   Normal, type  Reclaimable     18     24      1      1      0      0      1      0      1      0      0 
Node    0, zone   Normal, type          CMA      9      0      1      6      2      0      1      0      0      0      0 
Node    0, zone   Normal, type   HighAtomic      0      0      0      0      0      0      0      0      0      0      0 
Node    0, zone   Normal, type      Isolate      0      0      0      0      0      0      0      0      0      0      0 
Node    0, zone  Movable, type    Unmovable      0      0      0      0      0      0      0      0      0      0      0 
Node    0, zone  Movable, type      Movable    963    649    188     48     24    112     49     21      8      3     50 
Node    0, zone  Movable, type  Reclaimable      0      0      0      0      0      0      0      0      0      0      0 
Node    0, zone  Movable, type          CMA      0      0      0      0      0      0      0      0      0      0      0 
Node    0, zone  Movable, type   HighAtomic      0      0      0      0      0      0      0      0      0      0      0 
Node    0, zone  Movable, type      Isolate      0      0      0      0      0      0      0      0      0      0      0 

Number of blocks type     Unmovable      Movable  Reclaimable          CMA   HighAtomic      Isolate 
Node 0, zone      DMA          123          310           18           61            0            0 
Node 0, zone   Normal          406          310           43            9            0            0 
Node 0, zone  Movable            0          256            0            0            0            0 

Number of mixed blocks    Unmovable      Movable  Reclaimable          CMA   HighAtomic      Isolate 
Node 0, zone      DMA            0           61            0            0            0            0 
Node 0, zone   Normal            0           11            3            0            0            0 
Node 0, zone  Movable            0            0            0            0            0            0 

可以看到在Movable Zone中不存在Unmovable類型的頁面,只有Movable類型的頁面。

管理區ZONE_MOVABLE

這個管理區主要和memory hotplug功能有關。為什么要設計內存熱插拔功能?主要是出於如下三點考慮:
1.邏輯內存熱插拔,對於虛擬機的支持,對於虛擬機按照需求來分配可用內存
2.物理內存熱插拔,對於NUMA服務器的支持,不需要的內存就設置為offline,以降低功耗
3.優化內存碎片問題

這個管理區域存放的page都是可遷移的,只能被帶有__GFP_HIGHMEM和__GFP_MOVABLE標志的內存申請所使用,比如:

/*
 * GFP_HIGHUSER_MOVABLE = GFP_HIGHUSER plus __GFP_MOVABLE. Because
 * GFP_HIGHUSER (defined below) already includes __GFP_HIGHMEM, this mask
 * carries both __GFP_HIGHMEM and __GFP_MOVABLE.
 */
#define GFP_HIGHUSER_MOVABLE    (GFP_HIGHUSER | __GFP_MOVABLE)

/* Base mask that GFP_HIGHUSER builds on. */
#define GFP_USER    (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL)
/* GFP_USER with the __GFP_HIGHMEM zone modifier added. */
#define GFP_HIGHUSER    (GFP_USER | __GFP_HIGHMEM)

需要注意的是,不要把分配標志__GFP_MOVABLE和管理區ZONE_MOVABLE混淆,兩者並不是一一對應的關系。

  • __GFP_MOVABLE表示的是一種分配頁面屬性,表示頁面可遷移,即使不在ZONE_MOVABLE管理區,有些頁面也是可以遷移的,比如cache;
  • ZONE_MOVABLE表示的是管理區,和內存的熱插拔有關,當然其中的頁面必須要可遷移才能支持熱插拔。

分配標志__GFP_MOVABLE

/*
 * Zone modifier bits. Each wraps the corresponding ___GFP_* raw value in a
 * gfp_t cast.
 */
#define __GFP_DMA   ((__force gfp_t)___GFP_DMA)
#define __GFP_HIGHMEM   ((__force gfp_t)___GFP_HIGHMEM)
#define __GFP_DMA32 ((__force gfp_t)___GFP_DMA32)
#define __GFP_MOVABLE   ((__force gfp_t)___GFP_MOVABLE)  /* Page is movable */
/* Union of the four zone modifier bits defined above. */
#define GFP_ZONEMASK    (__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE)

這幾個分配標志被稱為Zone modifiers,它們用來標識優先從哪個zone分配內存。

bit       result
=================
0x0    => NORMAL
0x1    => DMA or NORMAL
0x2    => HIGHMEM or NORMAL
0x3    => BAD (DMA+HIGHMEM)
0x4    => DMA32 or DMA or NORMAL
0x5    => BAD (DMA+DMA32)
0x6    => BAD (HIGHMEM+DMA32)
0x7    => BAD (HIGHMEM+DMA32+DMA)
0x8    => NORMAL (MOVABLE+0)
0x9    => DMA or NORMAL (MOVABLE+DMA)
0xa    => MOVABLE (Movable is valid only if HIGHMEM is set too)
0xb    => BAD (MOVABLE+HIGHMEM+DMA)
0xc    => DMA32 (MOVABLE+DMA32)
0xd    => BAD (MOVABLE+DMA32+DMA)
0xe    => BAD (MOVABLE+DMA32+HIGHMEM)
0xf    => BAD (MOVABLE+DMA32+HIGHMEM+DMA)

一共有4個bit用來表示組合類型,其中低3個bit只能選擇一個(__GFP_DMA/__GFP_HIGHMEM/__GFP_DMA32),而__GFP_MOVABLE可以和其他三種的任何一個組合使用,因此4個bit一共有16種取值(其中同時設置多個低位bit的組合在上表中標記為BAD),根據各種類型進行一個偏移存放到一個long類型table中。

GFP_ZONE_TABLE:

|BAD|BAD|BAD|DMA32|BAD|MOVABLE|......|NORMAL|

這些結果會根據上面的bit組合值做一個偏移,存放到ZONE TABLE中,從而可以根據組合快速定位要使用的ZONE管理區。由上可見,__GFP_MOVABLE代表的是一種分配策略,並不是和ZONE_MOVABLE匹配的,上一節也做了介紹,必須是(__GFP_HIGHMEM和__GFP_MOVABLE)同時置位才會從ZONE_MOVABLE管理區去分配內存。

The zone fallback order is MOVABLE=>HIGHMEM=>NORMAL=>DMA32=>DMA

因此我們分配內存時並不一定就會按照傳入的FLAG來進行分配,如果對應zone中沒有符合要求的內存,那么會依次進行fallback查找符合要求的內存。

如何使能ZONE_MOVABLE

- For all memory hotplug
    Memory model -> Sparse Memory  (CONFIG_SPARSEMEM)
    Allow for memory hot-add       (CONFIG_MEMORY_HOTPLUG)

- To enable memory removal, the followings are also necessary
    Allow for memory hot remove    (CONFIG_MEMORY_HOTREMOVE)
    Page Migration                 (CONFIG_MIGRATION)

- For ACPI memory hotplug, the followings are also necessary
    Memory hotplug (under ACPI Support menu) (CONFIG_ACPI_HOTPLUG_MEMORY)
    This option can be kernel module.

- As a related configuration, if your box has a feature of NUMA-node hotplug
  via ACPI, then this option is necessary too.
    ACPI0004,PNP0A05 and PNP0A06 Container Driver (under ACPI Support menu)
    (CONFIG_ACPI_CONTAINER).
    This option can be kernel module too.

1) When kernelcore=YYYY boot option is used,
   Size of memory not for movable pages (not for offline) is YYYY.
   Size of memory for movable pages (for offline) is TOTAL-YYYY.

2) When movablecore=ZZZZ boot option is used,
   Size of memory not for movable pages (not for offline) is TOTAL - ZZZZ.
   Size of memory for movable pages (for offline) is ZZZZ.

內核中定義了sysfs節點用來控制內存的熱插拔:

% echo online > /sys/devices/system/memory/memoryXXX/state

使能內存。

% echo online_movable > /sys/devices/system/memory/memoryXXX/state

切換內存管理區為ZONE_MOVABLE。

% echo online_kernel > /sys/devices/system/memory/memoryXXX/state

切換內存管理區為ZONE_NORMAL。

如何決定ZONE_MOVABLE的大小

我們先來看下在memory zone初始化時的處理:
對於NUMA使能的系統處理是這樣的:

zone_sizes_init->free_area_init_nodes->find_zone_movable_pfns_for_nodes:
/*
 * If movable_node is specified, ignore kernelcore and movablecore
 * options.
 */
if (movable_node_is_enabled()) {
    for_each_memblock(memory, r) {
        if (!memblock_is_hotpluggable(r))
            continue;

        nid = r->nid;

        usable_startpfn = PFN_DOWN(r->base);
        zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
            min(usable_startpfn, zone_movable_pfn[nid]) :
            usable_startpfn;
    }

    goto out2;
}

當我們在dts設備樹中為內存節點配置hotpluggable屬性時,對應的memblock區域就會被設置MEMBLOCK_HOTPLUG標志:

/*
 * Walks the (base, size) cells of a flattened-device-tree node, registers
 * each non-empty range via early_init_dt_add_memory_arch() and, when the
 * node carries a "hotpluggable" property, additionally marks the range as
 * hotplug memory.
 *
 * NOTE(review): this is an abridged excerpt. The declarations and setup of
 * `reg` and `endp`, and the function's return statement, are not shown here.
 */
int __init early_init_dt_scan_memory(unsigned long node, const char *uname,
                     int depth, void *data)
{
   bool hotpluggable;
   /*
    * The lookup result is stored in a bool; presumably it is non-NULL
    * (true) when the property exists. TODO confirm against
    * of_get_flat_dt_prop().
    */
   hotpluggable = of_get_flat_dt_prop(node, "hotpluggable", NULL);
   /* Loop while at least one full (address, size) cell group remains. */
   while ((endp - reg) >= (dt_root_addr_cells + dt_root_size_cells)) {
     u64 base, size;

     base = dt_mem_next_cell(dt_root_addr_cells, &reg);
     size = dt_mem_next_cell(dt_root_size_cells, &reg);

     /* Skip empty ranges. */
     if (size == 0)
         continue;
     pr_debug(" - %llx ,  %llx\n", (unsigned long long)base,
         (unsigned long long)size);

     early_init_dt_add_memory_arch(base, size);

     /* Only ranges from a "hotpluggable" node get the hotplug mark. */
     if (!hotpluggable)
         continue;

     /* A non-zero result is treated as failure: warn and carry on. */
     if (early_init_dt_mark_hotplug_memory_arch(base, size))
         pr_warn("failed to mark hotplug range 0x%llx - 0x%llx\n",
             base, base + size);
    }

}

/*
 * Marks the range [base, base + size) as hotplug memory by delegating to
 * memblock_mark_hotplug() and returning its result. Declared __weak, so
 * another definition may replace this one at link time.
 */
int __init __weak early_init_dt_mark_hotplug_memory_arch(u64 base, u64 size)
{
    return memblock_mark_hotplug(base, size);
}

/*
 * Applies the MEMBLOCK_HOTPLUG flag to the range described by base/size via
 * memblock_setclr_flag() and returns that helper's result.
 */
int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
{
    /* NOTE(review): the literal 1 presumably selects "set" rather than "clear"; confirm against memblock_setclr_flag(). */
    return memblock_setclr_flag(base, size, 1, MEMBLOCK_HOTPLUG);
}  

from: https://blog.csdn.net/rikeyone/article/details/86498298


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM