專題:Linux內存管理專題
關鍵詞:mm、vaddr、VMA、page、pfn、pte、paddr、pg_data、zone、mem_map[]。
1. 內存管理數據結構的關係圖
在大部分Linux系統中,內存設備的初始化一般是在BIOS或bootloader中完成,然後把DDR的大小傳遞給Linux內核。因此從Linux內核角度來看DDR,其實就是一段物理內存空間。
1.1 由mm數據結構和虛擬地址vaddr找到對應的VMA
/* VMA lookup entry points; only declared in this excerpt. */
extern struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr);
extern struct vm_area_struct *find_vma_prev(struct mm_struct *mm, unsigned long addr,
					    struct vm_area_struct **pprev);

/*
 * Return the VMA that find_vma(mm, start_addr) yields, provided it begins
 * below end_addr; otherwise return NULL.
 */
static inline struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
		unsigned long start_addr, unsigned long end_addr)
{
	struct vm_area_struct *hit = find_vma(mm, start_addr);

	if (!hit)
		return NULL;

	/* A VMA starting at or beyond end_addr does not overlap the range. */
	return (end_addr <= hit->vm_start) ? NULL : hit;
}
由VMA得出mm數據結構,struct vm_area_struct數據結構有一個指針指向struct mm_struct。
/*
 * Abridged excerpt of the per-address-space descriptor. "..." marks members
 * left out by the article; it is not C syntax.
 */
struct mm_struct {
	struct vm_area_struct *mmap;		/* list of VMAs */
	struct rb_root mm_rb;			/* rb-tree root; VMAs link in through vm_rb */
	u32 vmacache_seqnum;			/* per-thread vmacache */
	...
};

/*
 * Abridged excerpt of the descriptor for one virtual memory area.
 * vm_mm is the back pointer from a VMA to its owning mm_struct.
 */
struct vm_area_struct {
	/* The first cache line has the info for VMA tree walking. */
	unsigned long vm_start;			/* Our start address within vm_mm. */
	unsigned long vm_end;			/* The first byte after our end address within vm_mm. */
	...
	struct rb_node vm_rb;			/* node embedded in an rb-tree */
	...
	struct mm_struct *vm_mm;		/* The address space we belong to. */
	...
}
find_vma()根據mm_struct和addr找到vma。
find_vma()首先在當前進程current->vmacache[]中查找addr對應的vma;如果未找到,則從mm->mm_rb的根節點開始沿紅黑樹向下查找,通過rb_node找到對應的vma,然後判斷是否和addr吻合。
struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) { struct rb_node *rb_node; struct vm_area_struct *vma; /* Check the cache first. */ vma = vmacache_find(mm, addr);--------------------------------在task_struct->vmacache[]中查找,看能否命中。 if (likely(vma)) return vma; rb_node = mm->mm_rb.rb_node;----------------------------------找到當前mm的第一個rb_node節點。 vma = NULL; while (rb_node) {---------------------------------------------遍歷當前mm空間的所有rb_node。 struct vm_area_struct *tmp; tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);----通過rb_node找到對應的vma if (tmp->vm_end > addr) { vma = tmp; if (tmp->vm_start <= addr) break;--------------------------------------------找到合適的vma,退出。 rb_node = rb_node->rb_left; } else rb_node = rb_node->rb_right; } if (vma) vmacache_update(addr, vma);-------------------------------將當前vma放到vmacache[]中。 return vma; }
1.2 由page和VMA找到虛擬地址vaddr
vma_address()只針對匿名頁面:
inline unsigned long vma_address(struct page *page, struct vm_area_struct *vma) { unsigned long address = __vma_address(page, vma); /* page should be within @vma mapping range */ VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); return address; } static inline unsigned long __vma_address(struct page *page, struct vm_area_struct *vma) { pgoff_t pgoff = page_to_pgoff(page); return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);-------------根據page->index計算當前vma的偏移地址。 } static inline pgoff_t page_to_pgoff(struct page *page) { if (unlikely(PageHeadHuge(page))) return page->index << compound_order(page); else return page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);------------------page->index表示在一個vma中page的index。 }
1.3 由page找到所有映射的VMA
通過反向映射rmap系統rmap_walk()來實現,對於匿名頁面來說是rmap_walk_anon()。
static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) { struct anon_vma *anon_vma; pgoff_t pgoff; struct anon_vma_chain *avc; int ret = SWAP_AGAIN; anon_vma = rmap_walk_anon_lock(page, rwc);-----------------------------------由page->mapping找到anon_vma數據結構。 if (!anon_vma) return ret; pgoff = page_to_pgoff(page); anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {-----遍歷anon_vma->rb_root紅黑樹,取出avc數據結構。 struct vm_area_struct *vma = avc->vma;----------------------------------每個avc數據結構指向每個映射的vma unsigned long address = vma_address(page, vma); if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; ret = rwc->rmap_one(page, vma, address, rwc->arg); if (ret != SWAP_AGAIN) break; if (rwc->done && rwc->done(page)) break; } anon_vma_unlock_read(anon_vma); return ret; }
由vma和虛擬地址vaddr找出相應的page數據結構,可以通過follow_page()。
/*
 * Signature of follow_page(); only the prototype is shown in this excerpt,
 * so it is written as a declaration. Per the surrounding text it resolves
 * (vma, address) to the corresponding struct page.
 */
static inline struct page *follow_page(struct vm_area_struct *vma,
		unsigned long address, unsigned int foll_flags);
follow_page()由虛擬地址vaddr通過查詢頁表找出pte。
由pte找出頁幀號pfn,然後在mem_map[]中找到相應的struct page結構。
1.4 page和pfn之間的互換
page_to_pfn()和pfn_to_page()定義在include/asm-generic/memory_model.h中。
具體的實現方式跟memory model有關,這裡以定義了CONFIG_FLATMEM的情況為例。
/* Public names map onto the memory-model specific implementations below. */
#define page_to_pfn __page_to_pfn
#define pfn_to_page __pfn_to_page

/*
 * pfn -> struct page: subtracting ARCH_PFN_OFFSET from the pfn gives the
 * index of the page within the mem_map array.
 */
#define __pfn_to_page(pfn)	(mem_map + ((pfn) - ARCH_PFN_OFFSET))

/*
 * struct page -> pfn: the index of the page within mem_map plus
 * ARCH_PFN_OFFSET gives the pfn.
 */
#define __page_to_pfn(page)	((unsigned long)((page) - mem_map) + \
				 ARCH_PFN_OFFSET)
在linux內核中,所有的物理內存都用struct page結構來描述,這些對象以數組形式存放,而這個數組的地址就是mem_map。
內核以節點node為單位,每個node下的物理內存統一管理,也就是說在表示內存node的描述類型struct pglist_data中,有node_mem_map這個成員。
也就是說,每個內存節點node下,成員node_mem_map是此node下所有內存以struct page描述後,所有這些對象的基地址,這些對象以數組形式存放。
1.5 pfn/page和paddr之間的互換
物理地址paddr和pfn的互換通過位移PAGE_SHIFT可以簡單的得到。
page和paddr的互換可以通過pfn作為中間量來實現。
/*
 * Convert a physical address to a Page Frame Number and back
 *
 * Both directions are a plain shift by PAGE_SHIFT; the pfn result is cast to
 * unsigned long and the physical-address result to phys_addr_t.
 */
#define	__phys_to_pfn(paddr)	((unsigned long)((paddr) >> PAGE_SHIFT))
#define	__pfn_to_phys(pfn)	((phys_addr_t)(pfn) << PAGE_SHIFT)

/*
 * Convert a page to/from a physical address
 *
 * Both directions go through the pfn as the intermediate value.
 */
#define page_to_phys(page)	(__pfn_to_phys(page_to_pfn(page)))
#define phys_to_page(phys)	(pfn_to_page(__phys_to_pfn(phys)))
1.6 page和pte之間的互換
先由page到pfn,然後由pfn到pte,可以實現page到pte的轉換。
/* Build a pte value: the physical address of @pfn OR-ed with the bits of @prot. */
#define pfn_pte(pfn,prot)	__pte(__pfn_to_phys(pfn) | pgprot_val(prot))

/* struct page -> pfn: index of the page within mem_map plus ARCH_PFN_OFFSET. */
#define __page_to_pfn(page)	((unsigned long)((page) - mem_map) + \
				 ARCH_PFN_OFFSET)
由pte到page,通過pte_pfn()找到對應的pfn號,再由pfn號找到對應的page。
/* pte -> struct page, going through the pfn. */
#define pte_page(pte)		pfn_to_page(pte_pfn(pte))

/* pte -> pfn: mask the pte value with PHYS_MASK, then shift down by PAGE_SHIFT. */
#define pte_pfn(pte)		((pte_val(pte) & PHYS_MASK) >> PAGE_SHIFT)
1.7 zone和page之間的互換
由zone到page的轉換:zone數據結構有zone->start_pfn指向zone的起始頁面,然後由pfn找到page的數據結構。
由page到zone的轉換:page_zone()函數返回page所屬的zone,通過page->flags布局實現。
1.8 zone和pg_data之間的互換
由pg_data到zone:pg_data_t->node_zones。
由zone到pg_data:zone->zone_pgdat。
2. 內存管理中常用API
內存管理錯綜複雜,不僅要從用戶態的相關API來窺探和理解Linux內核內存是如何運作,還要總結Linux內核中常用的內存管理相關的API。
2.1 頁表相關
頁表相關的API可以概括為如下4類:頁表查詢、判斷頁表項的狀態位、修改頁表、page和pfn的關系。
// Page-table lookup
/* pgd entry for @addr in init_mm. */
#define pgd_offset_k(addr)	pgd_offset(&init_mm, addr)
/* Index into the pgd: @addr shifted down by PGDIR_SHIFT. */
#define pgd_index(addr)		((addr) >> PGDIR_SHIFT)
/* pgd entry for @addr in @mm: mm->pgd plus the pgd index. */
#define pgd_offset(mm, addr)	((mm)->pgd + pgd_index(addr))
/* Index into a pte table: page number of @addr masked to PTRS_PER_PTE entries. */
#define pte_index(addr)		(((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
#define pte_offset_kernel(pmd,addr)	(pmd_page_vaddr(*(pmd)) + pte_index(addr))
#define pte_offset_map(pmd,addr)	(__pte_map(pmd) + pte_index(addr))
#define pte_unmap(pte)			__pte_unmap(pte)

// Testing the status bits of a page-table entry
/* True when the whole pte value is zero. */
#define pte_none(pte)		(!pte_val(pte))
#define pte_present(pte)	(pte_isset((pte), L_PTE_PRESENT))
#define pte_valid(pte)		(pte_isset((pte), L_PTE_VALID))
/* Uses pte_present() while mm_tlb_flush_pending(mm) is true, pte_valid() otherwise. */
#define pte_accessible(mm, pte)	(mm_tlb_flush_pending(mm) ? pte_present(pte) : pte_valid(pte))
/* Writable means the L_PTE_RDONLY bit is clear. */
#define pte_write(pte)		(pte_isclear((pte), L_PTE_RDONLY))
#define pte_dirty(pte)		(pte_isset((pte), L_PTE_DIRTY))
#define pte_young(pte)		(pte_isset((pte), L_PTE_YOUNG))
/* Executable means the L_PTE_XN bit is clear. */
#define pte_exec(pte)		(pte_isclear((pte), L_PTE_XN))

// Modifying page-table entries
/* Build a pte from a struct page and protection bits, via the page's pfn. */
#define mk_pte(page,prot)	pfn_pte(page_to_pfn(page), prot)

/* Each helper below returns a copy of @pte with one bit set or cleared. */
static inline pte_t pte_wrprotect(pte_t pte)
{
	return set_pte_bit(pte, __pgprot(L_PTE_RDONLY));
}

static inline pte_t pte_mkwrite(pte_t pte)
{
	return clear_pte_bit(pte, __pgprot(L_PTE_RDONLY));
}

static inline pte_t pte_mkclean(pte_t pte)
{
	return clear_pte_bit(pte, __pgprot(L_PTE_DIRTY));
}

static inline pte_t pte_mkdirty(pte_t pte)
{
	return set_pte_bit(pte, __pgprot(L_PTE_DIRTY));
}

static inline pte_t pte_mkold(pte_t pte)
{
	return clear_pte_bit(pte, __pgprot(L_PTE_YOUNG));
}

static inline pte_t pte_mkyoung(pte_t pte)
{
	return set_pte_bit(pte, __pgprot(L_PTE_YOUNG));
}

static inline pte_t pte_mkexec(pte_t pte)
{
	return clear_pte_bit(pte, __pgprot(L_PTE_XN));
}

static inline pte_t pte_mknexec(pte_t pte)
{
	return set_pte_bit(pte, __pgprot(L_PTE_XN));
}

/*
 * Install @pteval at @ptep through set_pte_ext().
 * For an address below TASK_SIZE whose pte passes pte_valid_user(), the
 * caches are synchronised first (unless the pte is special) and PTE_EXT_NG
 * is added to the extension bits.
 */
static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
			      pte_t *ptep, pte_t pteval)
{
	unsigned long ext = 0;

	if (addr < TASK_SIZE && pte_valid_user(pteval)) {
		if (!pte_special(pteval))
			__sync_icache_dcache(pteval);
		ext |= PTE_EXT_NG;
	}

	set_pte_ext(ptep, pteval, ext);
}

/* Return @pte with every bit of @prot cleared. */
static inline pte_t clear_pte_bit(pte_t pte, pgprot_t prot)
{
	pte_val(pte) &= ~pgprot_val(prot);
	return pte;
}

/* Return @pte with every bit of @prot set. */
static inline pte_t set_pte_bit(pte_t pte, pgprot_t prot)
{
	pte_val(pte) |= pgprot_val(prot);
	return pte;
}

/*
 * Write @entry to @ptep only if it differs from the current entry, then call
 * flush_tlb_fix_spurious_fault(). Returns non-zero when the entry changed.
 * The @dirty argument is not used in this excerpt.
 */
int ptep_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pte_t *ptep,
			  pte_t entry, int dirty)
{
	int changed = !pte_same(*ptep, entry);
	if (changed) {
		set_pte_at(vma->vm_mm, address, ptep, entry);
		flush_tlb_fix_spurious_fault(vma, address);
	}
	return changed;
}

// Relationship between page and pfn
/* pte -> pfn: mask with PHYS_MASK, shift down by PAGE_SHIFT. */
#define pte_pfn(pte)		((pte_val(pte) & PHYS_MASK) >> PAGE_SHIFT)
/* pfn + protection bits -> pte. */
#define pfn_pte(pfn,prot)	__pte(__pfn_to_phys(pfn) | pgprot_val(prot))
2.2 內存分配
內核中常用的內存分配API如下。
分配和釋放頁面:
#define alloc_pages(gfp_mask, order) \ alloc_pages_node(numa_node_id(), gfp_mask, order) static inline struct page * __alloc_pages(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist) { return __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL); } unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) { struct page *page; /* * __get_free_pages() returns a 32-bit address, which cannot represent * a highmem page */ VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); page = alloc_pages(gfp_mask, order); if (!page) return 0; return (unsigned long) page_address(page); } void free_pages(unsigned long addr, unsigned int order) { if (addr != 0) { VM_BUG_ON(!virt_addr_valid((void *)addr)); __free_pages(virt_to_page((void *)addr), order); } } void __free_pages(struct page *page, unsigned int order) { if (put_page_testzero(page)) { if (order == 0) free_hot_cold_page(page, false); else __free_pages_ok(page, order); } }
slab分配器:
/*
 * Slab allocator API listed by the article. Only signatures appear in this
 * excerpt, so every entry is written as a terminated declaration.
 */
struct kmem_cache *kmem_cache_create(const char *, size_t, size_t,
				     unsigned long, void (*)(void *));
void kmem_cache_destroy(struct kmem_cache *);
void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags);
void kmem_cache_free(struct kmem_cache *, void *);

static __always_inline void *kmalloc(size_t size, gfp_t flags);
static inline void kfree(void *p);
vmalloc相關:
/*
 * vmalloc-family API listed by the article; declarations only, the
 * definitions are not part of this excerpt.
 */
extern void *vmalloc(unsigned long size);
extern void *vzalloc(unsigned long size);
extern void *vmalloc_user(unsigned long size);
extern void vfree(const void *addr);

/* Takes an array of @count page pointers, plus flags and protection bits. */
extern void *vmap(struct page **pages, unsigned int count,
			unsigned long flags, pgprot_t prot);
extern void vunmap(const void *addr);
2.3 VMA操作相關
extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, struct vm_area_struct **pprev); static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr) { struct vm_area_struct * vma = find_vma(mm,start_addr); if (vma && end_addr <= vma->vm_start) vma = NULL; return vma; } static inline unsigned long vma_pages(struct vm_area_struct *vma) { return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; }
2.4 頁面相關
內存管理的複雜之處是和頁面相關的操作,內核中常用的API函數歸納如下:PG_XXX標志位操作、page引用計數操作、匿名頁面和KSM頁面、頁面操作、頁面映射、缺頁中斷、LRU和頁面回收。
PG_XXX標志位操作:
PageXXX() SetPageXXX() ClearPageXXX() TestSetPageXXX() TestClearPageXXX() static inline void lock_page(struct page *page) { might_sleep(); if (!trylock_page(page)) __lock_page(page); } static inline int trylock_page(struct page *page) { return (likely(!test_and_set_bit_lock(PG_locked, &page->flags))); } void __lock_page(struct page *page) { DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); __wait_on_bit_lock(page_waitqueue(page), &wait, bit_wait_io, TASK_UNINTERRUPTIBLE); } void wait_on_page_bit(struct page *page, int bit_nr); void wake_up_page(struct page *page, int bit) static inline void wait_on_page_locked(struct page *page) static inline void wait_on_page_writeback(struct page *page)
page引用計數操作:
static inline void get_page(struct page *page) void put_page(struct page *page); #define page_cache_get(page) get_page(page) #define page_cache_release(page) put_page(page) static inline int page_count(struct page *page) { return atomic_read(&compound_head(page)->_count); } static inline int page_mapcount(struct page *page) { VM_BUG_ON_PAGE(PageSlab(page), page); return atomic_read(&page->_mapcount) + 1; } static inline int page_mapped(struct page *page) { return atomic_read(&(page)->_mapcount) >= 0; } static inline int put_page_testzero(struct page *page) { VM_BUG_ON_PAGE(atomic_read(&page->_count) == 0, page); return atomic_dec_and_test(&page->_count); }
匿名頁面和KSM頁面:
/* Non-zero when the PAGE_MAPPING_ANON bit is set in page->mapping. */
static inline int PageAnon(struct page *page)
{
	unsigned long bits = (unsigned long)page->mapping;

	return (bits & PAGE_MAPPING_ANON) != 0;
}

/*
 * Non-zero when the flag bits of page->mapping are exactly
 * PAGE_MAPPING_ANON | PAGE_MAPPING_KSM.
 */
static inline int PageKsm(struct page *page)
{
	unsigned long flag_bits = (unsigned long)page->mapping & PAGE_MAPPING_FLAGS;

	return flag_bits == (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
}

struct address_space *page_mapping(struct page *page);

/* Return page->mapping with the PAGE_MAPPING_FLAGS bits stripped off. */
static inline void *page_rmapping(struct page *page)
{
	unsigned long raw = (unsigned long)page->mapping;

	return (void *)(raw & ~PAGE_MAPPING_FLAGS);
}

/* Reverse-mapping bookkeeping entry points; declared only in this excerpt. */
void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
void page_add_file_rmap(struct page *);
void page_remove_rmap(struct page *);
頁面操作:
/*
 * Page lookup / pinning API listed by the article. Only signatures appear in
 * this excerpt, so each is written as a terminated declaration, and
 * vm_normal_page(), which the original list names twice, is declared once.
 */
static inline struct page *follow_page(struct vm_area_struct *vma,
		unsigned long address, unsigned int foll_flags);

struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
		pte_t pte);

long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
		    unsigned long start, unsigned long nr_pages,
		    int write, int force, struct page **pages,
		    struct vm_area_struct **vmas);

long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm,
		    unsigned long start, unsigned long nr_pages,
		    int write, int force, struct page **pages,
		    int *locked);
頁面映射:
/*
 * Mapping API listed by the article; declarations only, the definitions are
 * not part of this excerpt.
 */
unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
			unsigned long len, unsigned long prot,
			unsigned long flags, unsigned long pgoff,
			unsigned long *populate);

int do_munmap(struct mm_struct *, unsigned long, size_t);

/* Takes a VMA, a start address, a pfn, a size and protection bits. */
int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
			unsigned long pfn, unsigned long size, pgprot_t);
缺頁中斷:
static int __kprobes do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) static int handle_pte_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *pte, pmd_t *pmd, unsigned int flags) static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, unsigned int flags) static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte)
LRU和頁面回收:
/*
 * LRU and reclaim API listed by the article; declarations only, the
 * definitions are not part of this excerpt.
 */
void lru_cache_add(struct page *);

/* The struct page whose lru list node is (_head)->prev. */
#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))

/* Watermark checks for a zone at a given order; both return bool. */
bool zone_watermark_ok(struct zone *z, unsigned int order,
		unsigned long mark, int classzone_idx, int alloc_flags);
bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
		unsigned long mark, int classzone_idx, int alloc_flags);