ARM64 __create_page_tables分析


內核版本:Linux-4.17
平台:
Qemu + virt (cortex-a53)
4GB
物理內存地址空間:0x40000000~0x13fffffff
 
參考:
 
前提:
CONFIG_ARM64_PAGE_SHIFT=12
CONFIG_ARM64_VA_BITS=48
CONFIG_ARM64_PA_BITS=48
CONFIG_PGTABLE_LEVELS=4
2^32 = 4GB
2^48 = 256TB
2^47 = 128TB
 
分析
1 /*
2  * Setup the initial page tables. We only setup the barest amount which is
3  * required to get the kernel running. The following sections are required:
4  *   - identity mapping to enable the MMU (low address, TTBR0)
5  *   - first few MB of the kernel linear mapping to jump to once the MMU has
6  *     been enabled
7  */
8 __create_page_tables:
9     mov    x28, lr
從注釋看,這里會建立兩種section,分別完成identity mapping和kernel image mapping。
1     /*
2      * Invalidate the idmap and swapper page tables to avoid potential
3      * dirty cache lines being evicted.
4      */
5     adrp    x0, idmap_pg_dir
6     adrp    x1, swapper_pg_end
7     sub    x1, x1, x0
8     bl    __inval_dcache_area
這里將(idmap_pg_dir, swapper_pg_end)這段物理地址范圍對應的dcache進行invalidate。這里的idmap_pg_dir和swapper_pg_end是在vmlinux.lds.S中設置的:
 1     . = ALIGN(PAGE_SIZE);
 2     idmap_pg_dir = .;
 3     . += IDMAP_DIR_SIZE;
 4 
 5 #ifdef CONFIG_UNMAP_KERNEL_AT_EL0
 6     tramp_pg_dir = .;
 7     . += PAGE_SIZE;
 8 #endif
 9 
10     swapper_pg_dir = .;
11     . += SWAPPER_DIR_SIZE;
12     swapper_pg_end = .;

其中IDMAP_DIR_SIZE定義如下:

#define IDMAP_DIR_SIZE (IDMAP_PGTABLE_LEVELS * PAGE_SIZE)
#define IDMAP_PGTABLE_LEVELS (ARM64_HW_PGTABLE_LEVELS(PHYS_MASK_SHIFT) - 1)
#define PHYS_MASK_SHIFT (CONFIG_ARM64_PA_BITS)
這里的CONFIG_ARM64_PA_BITS配置的是48. 這里的含義是,計算采用section mapping的話,需要幾個頁來存放table。
 
上面ARM64_HW_PGTABLE_LEVELS很關鍵,根據配置的物理地址線的寬度計算需要的level個數:
#define ARM64_HW_PGTABLE_LEVELS(va_bits) (((va_bits) - 4) / (PAGE_SHIFT - 3))
乍一看可以不太好理解,從注釋可以知道,這是簡化后的形式,完整的計算公式是:
 * ((((va_bits) - PAGE_SHIFT) + (PAGE_SHIFT - 3) - 1) / (PAGE_SHIFT - 3))
結合vmlinux.lds,上面的公式就是: ((48-12)+(12-3)-1) / (12-3) = (36+9-1)/9 = 44/9 = 4
 
理解它需要仔細觀察一下ARM64上不同的granule size對應的虛擬地址的結構:
4KB:

 

16KB:

 

64KB:

 

可以發現如下規律:
每一種granule size的各個level table index占用的位數都相同,並且都比block offset少3個bit,而這里的block offset就是12。所以IDMAP_DIR_SIZE是3個page的大小,也就是12KB。
 
SWAPPER_DIR_SIZE的稍微麻煩,表示存放映射內核鏡像需要的table需要占用幾個頁,如果不開啟KASLR,並且對於section mapping的話,SWAPPER_PGTABLE_LEVELS的值是(CONFIG_PGTABLE_LEVELS - 1),也就是3.
 
#define SWAPPER_DIR_SIZE (PAGE_SIZE * EARLY_PAGES(KIMAGE_VADDR + TEXT_OFFSET, _end)) 這里(KIMAGE_VADDR + TEXT_OFFSET)是內核的的虛擬起始地址, _end是虛擬結束地址
可以將計算過程單獨提取出來,看看計算結果:
 1 #include <stdio.h>
 2 
 3 #define CONFIG_PGTABLE_LEVELS 4
 4 #define CONFIG_ARM64_PAGE_SHIFT 12
 5 
 6 #define PAGE_SHIFT        CONFIG_ARM64_PAGE_SHIFT
 7 
 8 #define ARM64_HW_PGTABLE_LEVEL_SHIFT(n)    ((PAGE_SHIFT - 3) * (4 - (n)) + 3)
 9 
10 #define PGDIR_SHIFT        ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - CONFIG_PGTABLE_LEVELS)
11 
12 #define EARLY_ENTRIES(vstart, vend, shift) (((vend) >> (shift)) \
13                     - ((vstart) >> (shift)) + 1)
14 
15 #define EARLY_PGDS(vstart, vend) (EARLY_ENTRIES(vstart, vend, PGDIR_SHIFT))
16 
17 #define PUD_SHIFT        ARM64_HW_PGTABLE_LEVEL_SHIFT(1)
18 
19 #define SWAPPER_TABLE_SHIFT    PUD_SHIFT
20 
21 #define EARLY_PMDS(vstart, vend) (EARLY_ENTRIES(vstart, vend, SWAPPER_TABLE_SHIFT))
22 
23 int main(int argc, const char *argv[])
24 {
25     unsigned long long a;
26 
27     unsigned long long start = 0xffff000008080000;
28     unsigned long long end = 0xffff000009536000;
29 
30     a = 1 + EARLY_PGDS(start, end) + EARLY_PMDS(start, end);
31 
32     printf("a: %llu\n", a);
33     return 0;
34 }

運行結果是3,所以SWAPPER_DIR_SIZE也是12KB,分別存放PGD、PUD和PMD表項,這個計算方法也容易理解,其中1表示存放level0 table需要1頁,EARLY_PGDS(start, end)計算映射(start, end)占用了level0 table中幾個表項,而每一個level0表項將來都會指向一個level1 table的物理首地址,每個level1 table占一頁,所以可以得到存放level1 table一共需要幾頁,EARLY_PMDS(start, end)用於計算映射(start, end)需要占用的level1 table的表項的總和,因為level1 table的每個表項都會指向一個level2 table的物理首地址,而每個level2 table也占一頁,所以可以得到存放level2 table一共需要幾頁

 
接着分析__create_page_tables:
    /*
     * Clear the idmap and swapper page tables.
     */
    adrp    x0, idmap_pg_dir
    adrp    x1, swapper_pg_end
    sub    x1, x1, x0
1:    stp    xzr, xzr, [x0], #16
    stp    xzr, xzr, [x0], #16
    stp    xzr, xzr, [x0], #16
    stp    xzr, xzr, [x0], #16
    subs    x1, x1, #64
    b.ne    1b

將存放轉換表的內存清空。

 
下面開始創建identity mapping:
 1     mov    x7, SWAPPER_MM_MMUFLAGS   // level2的block entry會用到
 2 
 3     adrp    x0, idmap_pg_dir
 4     adrp    x3, __idmap_text_start        // __pa(__idmap_text_start)
 5     adrp    x5, __idmap_text_end
 6     clz    x5, x5
 7     cmp    x5, TCR_T0SZ(VA_BITS)    // default T0SZ small enough?
 8     b.ge    1f            // .. then skip VA range extension
 9 
10     adr_l    x6, idmap_t0sz
11     str    x5, [x6]
12     dmb    sy
13     dc    ivac, x6        // Invalidate potentially stale cache line
14 
15     mov    x4, #1 << (PHYS_MASK_SHIFT - PGDIR_SHIFT)
16     str_l    x4, idmap_ptrs_per_pgd, x5
17 
18 1:
19     ldr_l    x4, idmap_ptrs_per_pgd
20     mov    x5, x3                // __pa(__idmap_text_start)
21     adr_l    x6, __idmap_text_end        // __pa(__idmap_text_end)
22 
23     map_memory x0, x1, x3, x6, x7, x3, x4, x10, x11, x12, x13, x14

第23行的宏map_memory實現: 將虛擬地址[x3, x6]映射到(__idmap_text_start當前在物理內存中的地址)~(__idmap_text_end當前在物理內存中的地址),table從idmap_pg_dir當前所在的物理地址處開始存放。結合System.map,可以看到在這個范圍內包含下面的符號,目的是保證在開啟MMU的后,程序還可以正常運行:

ffff000008bdf000 T __idmap_text_start
ffff000008bdf000 T kimage_vaddr
ffff000008bdf008 T el2_setup
ffff000008bdf054 t set_hcr
ffff000008bdf128 t install_el2_stub
ffff000008bdf17c t set_cpu_boot_mode_flag
ffff000008bdf1a0 T secondary_holding_pen
ffff000008bdf1c4 t pen
ffff000008bdf1d8 T secondary_entry
ffff000008bdf1e4 t secondary_startup
ffff000008bdf1f4 t __secondary_switched
ffff000008bdf228 T __enable_mmu
ffff000008bdf284 t __no_granule_support
ffff000008bdf2a8 t __primary_switch
ffff000008bdf2c8 T cpu_resume
ffff000008bdf2e8 T __cpu_soft_restart
ffff000008bdf328 T cpu_do_resume
ffff000008bdf39c T idmap_cpu_replace_ttbr1
ffff000008bdf3d4 t __idmap_kpti_flag
ffff000008bdf3d8 T idmap_kpti_install_ng_mappings
ffff000008bdf414 t do_pgd
ffff000008bdf42c t next_pgd
ffff000008bdf438 t skip_pgd
ffff000008bdf46c t walk_puds
ffff000008bdf474 t do_pud
ffff000008bdf48c t next_pud
ffff000008bdf498 t skip_pud
ffff000008bdf4a8 t walk_pmds
ffff000008bdf4b0 t do_pmd
ffff000008bdf4c8 t next_pmd
ffff000008bdf4d4 t skip_pmd
ffff000008bdf4e4 t walk_ptes
ffff000008bdf4ec t do_pte
ffff000008bdf50c t skip_pte
ffff000008bdf51c t __idmap_kpti_secondary
ffff000008bdf564 T __cpu_setup
ffff000008bdf5f8 T __idmap_text_end

 

接下來是進行kernel  section mapping:
    adrp    x0, swapper_pg_dir
    mov_q    x5, KIMAGE_VADDR + TEXT_OFFSET    // compile time __va(_text)
    add    x5, x5, x23            // add KASLR displacement
    mov    x4, PTRS_PER_PGD
    adrp    x6, _end            // runtime __pa(_end)
    adrp    x3, _text            // runtime __pa(_text)
    sub    x6, x6, x3            // _end - _text
    add    x6, x6, x5            // runtime __va(_end)

    map_memory x0, x1, x5, x6, x7, x3, x4, x10, x11, x12, x13, x14

上面完成的工作是: 將kernel鏡像占用的虛擬地址空間[_text, _end]映射到當前kernel鏡像當前所在的物理內存地址空間上,table存放到swapper_pg_dir當前所在的物理內存地址處。

結合System.map可以看到,上面把kernel鏡像占用的內存全部映射了, 大約20MB左右
ffff000008080000 t _head
ffff000008080000 T _text
ffff000008080040 t pe_header
ffff000008080044 t coff_header
ffff000008080058 t optional_header
ffff000008080070 t extra_header_fields
ffff0000080800f8 t section_table
ffff000008081000 T __exception_text_start
ffff000008081000 T _stext
... ...
ffff000009536000 B _end
ffff000009536000 B swapper_pg_end    

到這里,可以得到如下映射關系:

 

下面結合kernel img的映射分析一下map_memory是如何做到的:
    adrp    x0, swapper_pg_dir
    mov_q    x5, KIMAGE_VADDR + TEXT_OFFSET    // compile time __va(_text)
    add    x5, x5, x23            // add KASLR displacement, 如果不支持內核鏡像加載地址隨機化,x23為0
    mov    x4, PTRS_PER_PGD   // 每個level0 table有一個表項,為1<<9
    adrp    x6, _end            // runtime __pa(_end)
    adrp    x3, _text            // runtime __pa(_text)
    sub    x6, x6, x3            // _end - _text
    add    x6, x6, x5            // runtime __va(_end)

    map_memory x0, x1, x5, x6, x7, x3, x4, x10, x11, x12, x13, x14
結合注釋,x5和x6分別存放需要映射的虛擬地址的起始和結束地址,x7表示表項的flags,x3存放需要被映射的物理起始地址,x4存放一個level0 table包含的表項的個數(1<<9)。
由於后面kernel會自己重新再建立頁表,所以這里采用的映射比較粗糙,在level2 table里使用的是Block descriptor,每個block descriptor可以映射2MB物理地址,這樣只需要3個頁來就可以放下用於映射kernel鏡像的table(level0、level1和level2),如下圖:

 上面的map_memory就負責建立上圖中level0到level2的數據結構關系,沒有用到level3

ARM64提供了四種不同的descriptor type:
這里用到了Table descriptor和Block entry兩種。
 
下面是map_memory的實現:
/*
 * Map memory for specified virtual address range. Each level of page table needed supports
 * multiple entries. If a level requires n entries the next page table level is assumed to be
 * formed from n pages.
 *
 *    tbl:    location of page table
 *    rtbl:    address to be used for first level page table entry (typically tbl + PAGE_SIZE)
 *    vstart:    start address to map
 *    vend:    end address to map - we map [vstart, vend]
 *    flags:    flags to use to map last level entries
 *    phys:    physical address corresponding to vstart - physical memory is contiguous
 *    pgds:    the number of pgd entries
 *
 * Temporaries:    istart, iend, tmp, count, sv - these need to be different registers
 * Preserves:    vstart, vend, flags
 * Corrupts:    tbl, rtbl, istart, iend, tmp, count, sv
 */
    .macro map_memory, tbl, rtbl, vstart, vend, flags, phys, pgds, istart, iend, tmp, count, sv
    add \rtbl, \tbl, #PAGE_SIZE
    mov \sv, \rtbl
    mov \count, #0
    compute_indices \vstart, \vend, #PGDIR_SHIFT, \pgds, \istart, \iend, \count
    populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp
    mov \tbl, \sv
    mov \sv, \rtbl

    compute_indices \vstart, \vend, #SWAPPER_TABLE_SHIFT, #PTRS_PER_PMD, \istart, \iend, \count
    populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp
    mov \tbl, \sv

    compute_indices \vstart, \vend, #SWAPPER_BLOCK_SHIFT, #PTRS_PER_PTE, \istart, \iend, \count
    bic \count, \phys, #SWAPPER_BLOCK_SIZE - 1
    populate_entries \tbl, \count, \istart, \iend, \flags, #SWAPPER_BLOCK_SIZE, \tmp
    .endm

其中涉及到兩個宏compute_indices和populate_entries,前者計算需要占用某個level的表項的索引范圍,后者用於填充被占用的那些表項。

 
下面是compute_indices實現:
/*
 * Compute indices of table entries from virtual address range. If multiple entries
 * were needed in the previous page table level then the next page table level is assumed
 * to be composed of multiple pages. (This effectively scales the end index).
 *
 *    vstart:    virtual address of start of range
 *    vend:    virtual address of end of range
 *    shift:    shift used to transform virtual address into index
 *    ptrs:    number of entries in page table
 *    istart:    index in table corresponding to vstart
 *    iend:    index in table corresponding to vend
 *    count:    On entry: how many extra entries were required in previous level, scales
 *              our end index.
 *        On exit: returns how many extra entries required for next page table level
 *
 * Preserves:    vstart, vend, shift, ptrs
 * Returns:    istart, iend, count
 */
    .macro compute_indices, vstart, vend, shift, ptrs, istart, iend, count
    lsr    \iend, \vend, \shift
    mov    \istart, \ptrs
    sub    \istart, \istart, #1
    and    \iend, \iend, \istart    // iend = (vend >> shift) & (ptrs - 1)
    mov    \istart, \ptrs
    mul    \istart, \istart, \count
    add    \iend, \iend, \istart    // iend += (count - 1) * ptrs
                    // our entries span multiple tables

    lsr    \istart, \vstart, \shift
    mov    \count, \ptrs
    sub    \count, \count, #1
    and    \istart, \istart, \count

    sub    \count, \iend, \istart
    .endm

 

下面是populate_entries的實現:

/*
 * Macro to populate page table entries, these entries can be pointers to the next level
 * or last level entries pointing to physical memory.
 *
 *    tbl:    page table address
 *    rtbl:    pointer to page table or physical memory
 *    index:    start index to write
 *    eindex:    end index to write - [index, eindex] written to
 *    flags:    flags for pagetable entry to or in
 *    inc:    increment to rtbl between each entry
 *    tmp1:    temporary variable
 *
 * Preserves:    tbl, eindex, flags, inc
 * Corrupts:    index, tmp1
 * Returns:    rtbl
 */
    .macro populate_entries, tbl, rtbl, index, eindex, flags, inc, tmp1
.Lpe\@:    phys_to_pte \tmp1, \rtbl
    orr    \tmp1, \tmp1, \flags    // tmp1 = table entry
    str    \tmp1, [\tbl, \index, lsl #3]
    add    \rtbl, \rtbl, \inc    // rtbl = pa next level
    add    \index, \index, #1
    cmp    \index, \eindex
    b.ls    .Lpe\@
    .endm

 

如果將上面的操作轉換成C語言,就容易理解了:
void populate_entries(char *tbl, char **rtbl, int index, int eindex,
    int flags, int inc, char *tmp1)
{
    while (index <= eindex) {
        tmp1 = *rtbl;
        tmp1 = tmp1 | flags;
        *(tbl + index*8) = tmp1;

        *rtbl = *rtbl + inc;
        index++;
    }
}

void compute_indices (uint64_t vstart, uint64_t vend, int shift, int ptrs,
    int *istart, int *iend, int *count)
{
    *iend = vend >> shift;
    *istart = ptrs;
    *istart = *istart - 1;
    *iend = *iend & *istart; // 計算end index
    
    *istart = ptrs;
    *istart = (*istart) * (*count);
    *iend = *iend + *istart; // 由於*count是0,這里end index沒變變化

    *istart = vstart >> shift;
    *count = ptrs;
    *count = *count - 1;
    *istart = *istart & *count;  // 計算start index

    *count = *iend - *istart; // 計算需要的index的數量
}

void map_memory(char *tbl, char *rtbl, uint64_t vstart, uint64_t vend, int flags,
    uint64_t phys, int pgds, int istart, int iend, int tmp, int count, char *sv)
{
#define PAGE_SIZE (1 << 12)

    tbl = (char *)malloc(PAGE_SIZE * 3); // 用於存放level0~level2的table的緩沖區

    rtbl = tbl + PAGE_SIZE; // rtbl指向下一個level的table
    sv = rtbl;
    count = 0;

#define PGDIR_SHIFT (39)
#define PMD_TYPE_TABLE (0x3 << 0)  // 表示table descriptor
#define PGDS (1 << 9)

    compute_indices(vstart, vend, PGDIR_SHIFT, PGDS, &istart, &iend, &count);
    populate_entries(tbl, &rtbl, istart, iend, PMD_TYPE_TABLE, PAGE_SIZE, tmp);

    tbl = sv;
    sv = rtbl;

#define SWAPPER_TABLE_SHIFT (30)
#define PTRS_PER_PMD (1<<9)

    compute_indices(vstart, vend, SWAPPER_TABLE_SHIFT, PTRS_PER_PMD, &istart, &iend, &count);
    populate_entries(tbl, &rtbl, istart, iend, PMD_TYPE_TABLE, PAGE_SIZE, tmp); //table descriptor

    tbl = sv;

#define SWAPPER_BLOCK_SHIFT (21)
#define PTRS_PER_PTE (1 << 9)      //512
#define SWAPPER_BLOCK_SIZE (1<<21) //2MB
// 這里的flags是SWAPPER_MM_MMUFLAGS,為((4<<2) | ((1<<0) | (1<<10) | (3<<8))), 類型Block entry


    compute_indices(vstart, vend, SWAPPER_BLOCK_SHIFT, PTRS_PER_PTE, &istart, &iend, &count);
    count = phys ^ (SWAPPER_BLOCK_SIZE - 1);
    populate_entries(tbl, &count, istart, iend, flags, SWAPPER_BLOCK_SIZE, tmp);
}

由於我們編譯出來的kernel大概有20.7MB左右,所以用level0 table需要一項(512G),level1 table需要一項(1GB),level2 block需要11個(22MB)。

 

 

 
完。


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM