brk系統調用主要實現在mm/mmap.c文件中。
[mm/mmap.c]
/*
 * brk() system call: move the end of the calling process's heap
 * (mm->brk) to the address requested in @brk.
 *
 * Returns @brk on success. On any failure mm->brk is left untouched and
 * its current value is returned instead; no error code is returned from
 * this function.
 *
 * mm->mmap_sem is held for writing from entry until just before either
 * return path finishes.
 */
SYSCALL_DEFINE1(brk, unsigned long, brk)
{
unsigned long retval;
unsigned long newbrk, oldbrk;
struct mm_struct *mm = current->mm;
unsigned long min_brk;
bool populate;
down_write(&mm->mmap_sem);
#ifdef CONFIG_COMPAT_BRK
/*
* CONFIG_COMPAT_BRK can still be overridden by setting
* randomize_va_space to 2, which will still cause mm->start_brk
* to be arbitrarily shifted
*/
if (current->brk_randomized)
min_brk = mm->start_brk;
else
min_brk = mm->end_data;
#else
min_brk = mm->start_brk;
#endif
/* Refuse to move the break below the lowest permitted address. */
if (brk < min_brk)
goto out;
/*
* Check against rlimit here. If this check is done later after the test
* of oldbrk with newbrk then it can escape the test and let the data
* segment grow beyond its set limit in the case where the limit is
* not page aligned -Ram Gupta
*/
if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk,
mm->end_data, mm->start_data))
goto out;
/* From here on the old and new breaks are compared at page granularity. */
newbrk = PAGE_ALIGN(brk);
oldbrk = PAGE_ALIGN(mm->brk);
/* Same page boundary: nothing to map or unmap, just record the new break. */
if (oldbrk == newbrk)
goto set_brk;
/* Always allow shrinking brk. */
if (brk <= mm->brk) {
/* A zero return from do_munmap() is treated as success. */
if (!do_munmap(mm, newbrk, oldbrk-newbrk))
goto set_brk;
goto out;
}
/*
 * Check against existing mmap mappings.
 * NOTE(review): the checked range ends one page past newbrk, presumably
 * to keep a one-page gap to the next mapping — confirm.
 */
if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE))
goto out;
/*
 * Ok, looks good - let it rip.
 * Any return value other than the requested start address (oldbrk) is
 * treated as a failure.
 */
if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
goto out;
set_brk:
mm->brk = brk;
/*
 * Populate the new range only when the heap actually grew and the mm's
 * default VMA flags include VM_LOCKED.
 */
populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
up_write(&mm->mmap_sem);
/* mm_populate() is called only after mmap_sem has been released. */
if (populate)
mm_populate(oldbrk, newbrk - oldbrk);
return brk;
out:
/* Failure: report the unchanged break. Read it before dropping the lock. */
retval = mm->brk;
up_write(&mm->mmap_sem);
return retval;
}
在32位Linux內核中,每個用戶進程擁有3GB的虛擬空間。內核如何為用戶空間劃分這3GB的虛擬空間呢?用戶進程的可執行文件由代碼段和數據段組成,數據段包括所有靜態分配的數據空間,例如全局變量和靜態局部變量等。在可執行文件裝載時,內核就為這些數據分配好空間,包括虛擬地址和物理頁面,並建立好兩者的映射關係。如圖2.15所示,用戶進程的用戶棧從3GB虛擬空間的頂部開始,由頂向下延伸,而brk分配的空間是從數據段的頂部end_data到用戶棧的底部。所以動態分配空間是從進程的end_data開始,每次分配一塊空間,就把這個邊界往上推進一段,同時內核和進程都會記錄當前邊界的位置。
第9行代碼,用戶進程的struct mm_struct數據結構有一個變量存放數據段的結束地址,如果brk請求的邊界小於這個地址,那麼請求無效。mm->brk記錄動態分配區的當前邊界(即堆的頂部),參數brk表示所要求的新邊界,即用戶進程要分配的內存大小與動態分配區當前邊界相加的結果。
如果新邊界小於老邊界,那么表示釋放空間,調用do_munmap()來釋放這一部分空間的內存。
find_vma_intersection()函數以老邊界oldbrk地址去查找系統中有沒有一塊已經存在的VMA,它通過find_vma()來查找當前用戶進程中是否有一個VMA和start_addr地址有重疊。
如果find_vma_intersection()找到一塊包含start_addr的VMA,說明老邊界開始的地址空間已經在使用了,就不需要再尋找了。
第34行代碼中do_brk()函數是這里的核心函數。
/*
 * this is really a simplified "do_mmap". it only handles
 * anonymous maps. eventually we may be able to do some
 * brk-specific accounting here.
 *
 * Maps @len bytes (rounded up to whole pages) at the fixed address @addr,
 * either by extending a neighbouring VMA through vma_merge() or by
 * allocating and linking a new VMA.
 *
 * Any existing mappings that overlap [addr, addr + len) are unmapped first.
 *
 * The caller is expected to hold mm->mmap_sem for writing (see the
 * verify_mm_writelocked() call below).
 *
 * Returns @addr on success (including when the page-aligned length is 0),
 * or a negative errno value converted to unsigned long on failure.
 */
static unsigned long do_brk(unsigned long addr, unsigned long len)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma, *prev;
	unsigned long flags;
	unsigned long request = len;
	struct rb_node **rb_link, *rb_parent;
	pgoff_t pgoff = addr >> PAGE_SHIFT;
	int error;

	len = PAGE_ALIGN(request);
	/*
	 * PAGE_ALIGN() wraps to a smaller value when the request is within
	 * one page of ULONG_MAX. Without this check such a request would be
	 * rounded to 0 and reported as a success by the test below.
	 */
	if (len < request)
		return -ENOMEM;
	if (!len)
		return addr;

	flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;

	/* With MAP_FIXED a result that is not page aligned is an error code. */
	error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
	if (error & ~PAGE_MASK)
		return error;

	error = mlock_future_check(mm, mm->def_flags, len);
	if (error)
		return error;

	/*
	 * mm->mmap_sem is required to protect against another thread
	 * changing the mappings in case we sleep.
	 */
	verify_mm_writelocked(mm);

	/*
	 * Clear old maps. this also does some error checking for us.
	 * A non-zero return from find_vma_links() is handled by unmapping
	 * the range and retrying until the lookup succeeds.
	 */
munmap_back:
	if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
		if (do_munmap(mm, addr, len))
			return -ENOMEM;
		goto munmap_back;
	}

	/* Check against address space limits *after* clearing old maps... */
	if (!may_expand_vm(mm, len >> PAGE_SHIFT))
		return -ENOMEM;

	if (mm->map_count > sysctl_max_map_count)
		return -ENOMEM;

	if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
		return -ENOMEM;

	/* Can we just expand an old private anonymous mapping? */
	vma = vma_merge(mm, prev, addr, addr + len, flags,
			NULL, NULL, pgoff, NULL);
	if (vma)
		goto out;

	/*
	 * create a vma struct for an anonymous mapping
	 */
	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
	if (!vma) {
		/*
		 * Presumably undoes the charge taken by
		 * security_vm_enough_memory_mm() above.
		 */
		vm_unacct_memory(len >> PAGE_SHIFT);
		return -ENOMEM;
	}

	INIT_LIST_HEAD(&vma->anon_vma_chain);
	vma->vm_mm = mm;
	vma->vm_start = addr;
	vma->vm_end = addr + len;
	vma->vm_pgoff = pgoff;
	vma->vm_flags = flags;
	vma->vm_page_prot = vm_get_page_prot(flags);
	vma_link(mm, vma, prev, rb_link, rb_parent);
out:
	/* Common tail for both the merged and the newly created VMA. */
	perf_event_mmap(vma);
	mm->total_vm += len >> PAGE_SHIFT;
	if (flags & VM_LOCKED)
		mm->locked_vm += (len >> PAGE_SHIFT);
	vma->vm_flags |= VM_SOFTDIRTY;
	return addr;
}
在do_brk()函數中,申請分配內存大小要以頁面大小對齊。
第12行代碼,get_unmapped_area()函數用來判斷虛擬內存空間是否有足夠的空間,返回一段沒有映射過的空間的起始地址,這個函數會調用到具體的體系結構中實現。
unsigned long
arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
const unsigned long len, const unsigned long pgoff,
const unsigned long flags)
{
struct vm_area_struct *vma;
struct mm_struct *mm = current->mm;
unsigned long addr = addr0;
int do_align = 0;
int aliasing = cache_is_vipt_aliasing();
struct vm_unmapped_area_info info;
/*
* We only need to do colour alignment if either the I or D
* caches alias.
*/
if (aliasing)
do_align = filp || (flags & MAP_SHARED);
/* requested length too big for entire address space */
if (len > TASK_SIZE)
return -ENOMEM;
if (flags & MAP_FIXED) {
if (aliasing && flags & MAP_SHARED &&
(addr - (pgoff << PAGE_SHIFT)) & (SHMLBA - 1))
return -EINVAL;
return addr;
}
/* requesting a specific address */
if (addr) {
if (do_align)
addr = COLOUR_ALIGN(addr, pgoff);
else
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
if (TASK_SIZE - len >= addr &&
(!vma || addr + len <= vma->vm_start))
return addr;
}
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.length = len;
info.low_limit = FIRST_USER_ADDRESS;
info.high_limit = mm->mmap_base;
info.align_mask = do_align ? (PAGE_MASK & (SHMLBA - 1)) : 0;
info.align_offset = pgoff << PAGE_SHIFT;
addr = vm_unmapped_area(&info);
/*
* A failed mmap() very likely causes application failure,
* so fall back to the bottom-up function here. This scenario
* can happen with large stack limits and large mmap()
* allocations.
*/
if (addr & ~PAGE_MASK) {
VM_BUG_ON(addr != -ENOMEM);
info.flags = 0;
info.low_limit = mm->mmap_base;
info.high_limit = TASK_SIZE;
addr = vm_unmapped_area(&info);
}
return addr;
}
arch_get_unmapped_area_topdown()是ARM架構里get_unmapped_area()函數的實現,該函數留給讀者自行閱讀。
第20行代碼中的find_vma_links()函數之前已經閱讀過了,它循環遍歷用戶進程紅黑樹中的VMAs,然後根據addr來查找最合適的插入紅黑樹的節點,最終rb_link指針指向最合適節點的rb_left或rb_right指針本身的地址。返回0表示找到了最合適的插入節點,返回-ENOMEM表示和現有的VMA重疊,這時會調用do_munmap()函數來釋放這段重疊的空間。
do_brk()函數中第37行,vma_merge()函數去找有沒有可能合併addr附近的VMA。如果沒辦法合併,那麼只能創建一個新的VMA,VMA的地址空間就是[addr, addr+len)(左閉右開,vm_end不包含在內)。
第53行代碼,新創建的VMA需要加入到mm->mmap鏈表和紅黑樹中,vma_link()函數實現這個功能,該函數之前已經閱讀過了。
回到brk函數中,第39行代碼,這裡判斷mm->def_flags是否置位VM_LOCKED,這個VM_LOCKED通常由mlockall系統調用設置而來。如果置位,那麼調用mm_populate()馬上分配物理內存並建立映射。通常用戶程序很少使用VM_LOCKED分配掩碼,所以brk不會為這個用戶進程立即分配物理頁面,而是將分配物理頁面的工作一直推遲到用戶進程需要訪問這些虛擬頁面、發生缺頁中斷時,才分配物理內存並和虛擬地址建立映射關係。