我們很容易從一些Linux內核的書籍中知道X86架構使用2級( 10-10-12 )頁表,X86-64架構使用4級( 9-9-9-9-12 )頁表甚至是5級(在pgd_t與pud_t中間加了一層p4d_t),但是一些隱藏的問題卻往往被忽略,如每一個進程的頁表存儲在內核空間嗎?為什么內核中頁表所在頁框物理地址轉化為虛擬地址只需要加個偏置 PAGE_OFFSET?CR3寄存器內容和task_struct->mm->pgd都是全局頁表的物理地址嗎?一些頁表操作函數如pud_offset為什么使用的是經過__va()的地址以及為什么有了MMU還需要這些函數?
首先,如果你忘了多級頁表內存尋址的細節,下面這張圖可以很快讓你回憶起來。
以上以X86-64架構為例描述了一個4級頁表,需要注意的是Linux下邏輯地址與虛擬地址是一致的(各個段描述符Base均為0),p**_index()用於計算虛擬地址中每一級相對於頁目錄/表基址的索引或者偏置,而基址存儲在CR3寄存器或者上一級頁目錄項或者頁表項的物理地址字段中。
以下描述了內存尋址的特性:
- 內核給每一個進程分配頁表,頁表存儲在內核空間,當發生進程切換或者其它特定時機時,CR3寄存器裝載當前活動進程的頁全局目錄的物理基址。所以后續對虛擬地址的尋址使用的就是當前進程的頁表。
- 向CR3寄存器寫入值時(未使用PCID的情況下)會自動刷新TLB(轉換后援緩沖器)中的非全局(non-global)表項;帶有全局(G)標志的表項(如內核映射)不會被刷新。
- CR3寄存器存儲的是進程頁全局目錄的物理基址,然而 task_struct->mm->pgd存儲的是進程頁全局目錄的虛擬地址。
- 每一個頁目錄項或者頁表項中有一個40Bit(視內核版本不同稍有差異)的字段用於存儲下一級頁表(或最終頁框)的物理地址,然而如果在內核中要遍歷頁表,在開啟了MMU后,CPU發出的訪存地址都會被當作虛擬地址,不能再直接使用物理地址,需要先使用 "__va(x)" 將物理地址轉化為內核直接映射區的虛擬地址(即加上 PAGE_OFFSET,這一步由軟件完成),訪問該虛擬地址時再由MMU將其翻譯回物理地址。
- 虛擬地址的最低12位(4KB的頁大小)和物理地址的最低12位相同。虛擬地址的4個頁表段page_index可以看做在頁表中的索引。
- 上圖中的頁操作函數可供我們遍歷頁表,如通過 "current" 指針就可以得到進程描述符,然后得到內存描述符下的 "pgd" 指針,再逐級調用 p**_offset() 向下查找,從而可以得到該虛擬地址對應的物理地址(即上圖中最后一層就是 page number + offset 得到物理地址),將物理地址右移 PAGE_SHIFT(12)位即得到頁框號 page number,通過它可以得到該物理地址所在頁的頁描述符,因為所有頁框的頁描述符是數組 mem_map[] 中的元素,數組的線性特性使得通過 page number 得到頁描述符變成可能。
- 我們可以用一個簡單的例子做一次頁表遍歷(page table walk):
1 static unsigned long vaddr2paddr(unsigned long vaddr) 2 { 3 pgd_t *pgd; 4 p4d_t *p4d; 5 pud_t *pud; 6 pmd_t *pmd; 7 pte_t *pte; 8 unsigned long paddr = 0; 9 unsigned long page_addr = 0; 10 unsigned long page_offset = 0; 11 12 pgd = pgd_offset(current->mm, vaddr); 13 if (!pgtable_l5_enabled()) 14 printk("pgtable_l5 is not enabled\n"); 15 p4d = p4d_offset(pgd, vaddr); 16 pud = pud_offset(p4d, vaddr); 17 pmd = pmd_offset(pud, vaddr); 18 pte = pte_offset_kernel(pmd, vaddr); 19 page_addr = pte_val(*pte) & PAGE_MASK; 20 page_offset = vaddr & ~PAGE_MASK; 21 paddr = page_addr | page_offset; 22 23 return paddr; 24 }

1 #include <linux/module.h> 2 #include <linux/init.h> 3 #include <linux/kernel.h> 4 #include <asm/pgtable.h> 5 #include <asm/page.h> 6 #include <linux/sched.h> 7 8 unsigned long vaddr = 0; 9 10 MODULE_LICENSE("GPL"); 11 MODULE_AUTHOR("ShieldQiQi"); 12 MODULE_DESCRIPTION("Test page table walk"); 13 14 static void get_pgtable_macro(void) 15 { 16 printk("PAGE_OFFSET = 0x%lx\n", PAGE_OFFSET); 17 printk("PGDIR_SHIFT = %d\n", PGDIR_SHIFT); 18 printk("P4D_SHIFT = %d\n", P4D_SHIFT); 19 printk("PUD_SHIFT = %d\n", PUD_SHIFT); 20 printk("PMD_SHIFT = %d\n", PMD_SHIFT); 21 printk("PAGE_SHIFT = %d\n", PAGE_SHIFT); 22 23 printk("PTRS_PER_PGD = %d\n", PTRS_PER_PGD); 24 printk("PTRS_PER_P4D = %d\n", PTRS_PER_P4D); 25 printk("PTRS_PER_PUD = %d\n", PTRS_PER_PUD); 26 printk("PTRS_PER_PMD = %d\n", PTRS_PER_PMD); 27 printk("PTRS_PER_PTE = %d\n", PTRS_PER_PTE); 28 29 printk("PAGE_MASK = 0x%lx\n", PAGE_MASK); 30 } 31 32 static unsigned long vaddr2paddr(unsigned long vaddr) 33 { 34 pgd_t *pgd; 35 p4d_t *p4d; 36 pud_t *pud; 37 pmd_t *pmd; 38 pte_t *pte; 39 unsigned long paddr = 0; 40 unsigned long page_addr = 0; 41 unsigned long page_offset = 0; 42 43 pgd = pgd_offset(current->mm, vaddr); 44 printk("current->mm->pgd = 0x%lx\n", (unsigned long)current->mm->pgd); 45 printk("pgd = 0x%lx\n", (unsigned long)pgd); 46 printk("pgd_val = 0x%lx\n", pgd_val(*pgd)); 47 printk("pgd_index = %lu\n", pgd_index(vaddr)); 48 if (pgd_none(*pgd)) { 49 printk("not mapped in pgd\n"); 50 return -1; 51 } 52 53 if (!pgtable_l5_enabled()) 54 printk("pgtable_l5 is not enabled\n"); 55 56 p4d = p4d_offset(pgd, vaddr); 57 printk("p4d_val = 0x%lx\n", p4d_val(*p4d)); 58 printk("p4d_index = %lu\n", p4d_index(vaddr)); 59 if (p4d_none(*p4d)) { 60 printk("not mapped in p4d\n"); 61 return -1; 62 } 63 64 pud = pud_offset(p4d, vaddr); 65 printk("p4d_pfn_mask = 0x%lx\n", p4d_pfn_mask(*p4d)); 66 printk("p4d_page_vaddr = 0x%lx\n", p4d_page_vaddr(*p4d)); 67 printk("pud_index = 0x%lx\n", pud_index(vaddr)); 68 printk("pud = 0x%lx\n", 
(unsigned long)pud); 69 70 printk("pud_val = 0x%lx\n", pud_val(*pud)); 71 if (pud_none(*pud)) { 72 printk("not mapped in pud\n"); 73 return -1; 74 } 75 76 pmd = pmd_offset(pud, vaddr); 77 printk("pmd_val = 0x%lx\n", pmd_val(*pmd)); 78 printk("pmd_index = %lu\n", pmd_index(vaddr)); 79 printk("pmd = 0x%lx\n", (unsigned long)pmd); 80 if (pmd_none(*pmd)) { 81 printk("not mapped in pmd\n"); 82 return -1; 83 } 84 85 pte = pte_offset_kernel(pmd, vaddr); 86 printk("pte = 0x%lx\n", (unsigned long)pte); 87 printk("pte_val = 0x%lx\n", pte_val(*pte)); 88 printk("pte_index = %lu\n", pte_index(vaddr)); 89 if (pte_none(*pte)) { 90 printk("not mapped in pte\n"); 91 return -1; 92 } 93 94 /* Page frame physical address mechanism | offset */ 95 page_addr = pte_val(*pte) & PAGE_MASK; 96 page_offset = vaddr & ~PAGE_MASK; 97 paddr = page_addr | page_offset; 98 printk("page_addr = %lx, page_offset = %lx\n", page_addr, page_offset); 99 printk("vaddr = %lx, paddr = %lx\n", vaddr, paddr); 100 101 return paddr; 102 } 103 104 static int __init v2p_init(void) 105 { 106 107 printk("vaddr to paddr module is running..\n"); 108 get_pgtable_macro(); 109 printk("\n"); 110 111 vaddr = (unsigned long)vmalloc(1000 * sizeof(char)); 112 if (vaddr == 0) { 113 printk("vmalloc failed..\n"); 114 return 0; 115 } 116 printk("vmalloc_vaddr=0x%lx\n", vaddr); 117 vaddr2paddr(vaddr); 118 vfree((void *)vaddr); 119 120 printk("\n\n"); 121 vaddr = __get_free_page(GFP_KERNEL); 122 if (vaddr == 0) { 123 printk("__get_free_page failed..\n"); 124 return 0; 125 } 126 printk("get_page_vaddr=0x%lx\n", vaddr); 127 vaddr2paddr(vaddr); 128 free_page(vaddr); 129 130 return 0; 131 } 132 133 static void __exit v2p_exit(void) 134 { 135 printk("vaddr to paddr module is leaving..\n"); 136 } 137 138 module_init(v2p_init); 139 module_exit(v2p_exit);
- 如果你深入的看 "p**_offset" 是如何實現的就會發現,它由一個當前頁表所在頁框的虛擬地址加上 "p**_index" 得到,這里有一個誤區,在Linux內核5.4.0中, "p**_offset" 實現如下:
1 static inline unsigned long pud_page_vaddr(pud_t pud) 2 { 3 return (unsigned long)__va(pud_val(pud) & pud_pfn_mask(pud)); 4 } 5 6 /* 7 * Currently stuck as a macro due to indirect forward reference to 8 * linux/mmzone.h's __section_mem_map_addr() definition: 9 */ 10 #define pud_page(pud) pfn_to_page(pud_pfn(pud)) 11 12 /* Find an entry in the second-level page table.. */ 13 static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) 14 { 15 return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(address); 16 }
- 可以看到 "pmd_offset" 最終返回的是__va()之后的虛擬地址,所以函數的入參 "pud_t *pud" 其實也是虛擬地址。"*pud" 使用的是解引用符號 " * ",用於取出頁上級目錄項存儲的內容,函數 "pud_page_vaddr" 再從中取出下一級頁表所在頁框的物理地址並通過__va()轉化為虛擬地址。看到這里有些小伙伴可能會問為什么已經知道的物理地址,還要使用經過__va()得到的虛擬地址 "pud" 呢,甚至可能會說我們現在做的不就是虛擬地址轉化為物理地址嗎,那么怎么可以直接使用虛擬地址呢?其實,當我們用到這些宏的時候,系統早已正常工作,cpu所處理的一切地址都應該是虛擬地址,轉化的事情就交給MMU好了,之所以使用虛擬地址,是因為只能使用虛擬地址。
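- 上述代碼通過
page_addr = pte_val(*pte) & PAGE_MASK
- 得到的物理地址其實是不對的,因為頁表項中只有中間的40位(bit 12 ~ bit 51)才是物理地址 page number,PAGE_MASK 只清除了最低12位,高位的標志位(如 bit 63 的 NX 位,見下面實際輸出中 paddr 最高位的 8)仍然保留在結果中,應該使用 PTE_PFN_MASK 來取出頁框的物理地址。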
以上例子的實際輸出如下:

[598341.980621] vaddr to paddr module is running.. [598341.980622] PAGE_OFFSET = 0xffff90a240000000 [598341.980623] PGDIR_SHIFT = 39 [598341.980623] P4D_SHIFT = 39 [598341.980623] PUD_SHIFT = 30 [598341.980624] PMD_SHIFT = 21 [598341.980624] PAGE_SHIFT = 12 [598341.980624] PTRS_PER_PGD = 512 [598341.980624] PTRS_PER_P4D = 1 [598341.980625] PTRS_PER_PUD = 512 [598341.980625] PTRS_PER_PMD = 512 [598341.980625] PTRS_PER_PTE = 512 [598341.980626] PAGE_MASK = 0xfffffffffffff000 [598341.980628] vmalloc_vaddr=0xffffbaddc02cb000 [598341.980628] current->mm->pgd = 0xffff90a3c83e4000 [598341.980629] pgd = 0xffff90a3c83e4ba8 [598341.980629] pgd_val = 0x2b5155067 [598341.980629] pgd_index = 373 [598341.980630] pgtable_l5 is not enabled [598341.980630] p4d_val = 0x2b5155067 [598341.980630] p4d_index = 0 [598341.980631] p4d_pfn_mask = 0xffffffffff000 [598341.980631] p4d_page_vaddr = 0xffff90a4f5155000 [598341.980631] pud_index = 0x177 [598341.980631] pud = 0xffff90a4f5155bb8 [598341.980632] pud_val = 0x2b5158067 [598341.980632] pmd_val = 0x2b4a9e067 [598341.980632] pmd_index = 1 [598341.980633] pmd = 0xffff90a4f5158008 [598341.980633] pte = 0xffff90a4f4a9e658 [598341.980633] pte_val = 0x8000000204610063 [598341.980634] pte_index = 203 [598341.980634] page_addr = 8000000204610000, page_offset = 0 [598341.980634] vaddr = ffffbaddc02cb000, paddr = 8000000204610000 [598341.980635] [598341.980635] get_page_vaddr=0xffff90a444610000 [598341.980636] current->mm->pgd = 0xffff90a3c83e4000 [598341.980636] pgd = 0xffff90a3c83e4908 [598341.980636] pgd_val = 0x1cfe01067 [598341.980636] pgd_index = 289 [598341.980637] pgtable_l5 is not enabled [598341.980637] p4d_val = 0x1cfe01067 [598341.980637] p4d_index = 0 [598341.980638] p4d_pfn_mask = 0xffffffffff000 [598341.980638] p4d_page_vaddr = 0xffff90a40fe01000 [598341.980638] pud_index = 0x91 [598341.980638] pud = 0xffff90a40fe01488 [598341.980639] pud_val = 0x24baa9063 [598341.980639] pmd_val = 0x204680063 [598341.980639] pmd_index = 35 
[598341.980640] pmd = 0xffff90a48baa9118 [598341.980640] pte = 0xffff90a444680080 [598341.980640] pte_val = 0x8000000204610063 [598341.980640] pte_index = 16 [598341.980641] page_addr = 8000000204610000, page_offset = 0 [598341.980641] vaddr = ffff90a444610000, paddr = 8000000204610000 [598346.531714] vaddr to paddr module is leaving..
再更...如有錯誤請指出改正,感謝!