昨晚腦子抽了,突然想扒扒 mmap
映射文件
剛剛開始我是嘗試 mmap 一個文件,實現對內存的讀寫達到讀寫文件內容的目的,踩了幾個坑
#include<stdio.h>
#include<unistd.h>
#include<sys/mman.h>
#include<sys/stat.h>
#include<sys/types.h>
#include<fcntl.h>
#include<string.h>
int main(int argc, char *argv[])
{
struct stat st;
void *ptr;
int filesize;
int fd = open("bin", O_RDWR);
fstat(fd, &st);
filesize = st.st_size;
printf("%d\n", filesize);
ptr = mmap(NULL, filesize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
close(fd);
printf("%p\n", ptr);
memset(ptr, 0x46, 10);
msync(ptr, filesize, MS_SYNC);
munmap(ptr, filesize);
return 0;
}
-
我想的是讀寫,然后我看文章的時候我看到的是:用 open 得到一個 文件描述符,demo 上寫的是
int fd = open("xxxx", O_RDONLY);
我信了然后我也照寫,后面我發現我怎么都不能映射成功,后面我才意識到 open 時我用的只讀,然后 mmap 時用讀寫
ptr = mmap(NULL, filesize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
-
還有就是同步的問題,原本我的理解是對 ptr 指向的進行內存讀寫,系統會自動同步到文件,然而不是。一般來說,進程在映射空間中對共享內容的修改並不會直接寫回到磁盤文件中,往往在調用munmap()之后才會同步輸出到磁盤文件中.那么,在程序運行過 程中,在調用munmap()之前,可以通過調用msync()來實現磁盤上文件內容與共享內存區中的內容與一致,或者是把對共享內存區的修改同步輸出到 磁盤文件中。
代碼注入
完成后我還試了開一個 PROT_EXEC 權限的內存塊去做代碼注入
這里我又踩到坑,我在寫 shellcode 的時候,可能是我太 sb 了,我是這樣寫的:
section .data
msg db "hello, world!\r\n"
section .text
global _start
_start:
mov rax, 1
mov rdi, 1
mov rsi, msg
mov rdx, 13
syscall
mov rax, 60
mov rdi, 0
syscall
編譯
nasm -g -f elf64 shellcode.asm
ld shellcode.o -o shellocde
確實可以運行,然后我用 objdump 摳 shellcode 的時候發現,我沒法摳下 msg ,因為它在 data 段,我只能摳 text 段的,我又想到把 msg 放在 text 段,但是我怎么去拿到它的地址?
后面我才知道有個技巧
via:https://stackoverflow.com/questions/15593214/linux-shellcode-hello-world
然后我把代碼改成:
section .text
global _start
_start:
call _exec
msg db "hello, world!\r\n"
_exec:
mov rax, 1
mov rdi, 1
pop rsi
mov rdx, 13
syscall
mov rax, 60
mov rdi, 0
syscall
其實這個,利用了 call 指令把 msg 的地址壓棧
還是挺好理解的,就是 call 的時候會把下一條指令的地址當成 返回地址 壓棧,這樣在使用 pop 的時候就能得到這個地址
編譯,摳 代碼 :
echo "\xe8\x11\x00\x00\x00\x68\x65\x6c\x6c\x6f\x2c\x20\x77\x6f\x72\x6c\x64\x21\x5c\x72\x5c\x6e\xb8\x01\x00\x00\x00\xbf\x01\x00\x00\x00\x5e\xba\x0d\x00\x00\x00\x0f\x05\xb8\x3c\x00\x00\x00\xbf\x00\x00\x00\x00\x0f\x05" > elf64
c source:
#include<stdio.h>
#include<unistd.h>
#include<sys/mman.h>
#include<sys/stat.h>
#include<sys/types.h>
#include<fcntl.h>
#include<string.h>
int main(int argc, char *argv[])
{
typedef void (*shellcode_t)();
struct stat st;
void *ptr;
int filesize;
int fp = open("elf64", O_RDWR);
fstat(fp, &st);
filesize = st.st_size;
printf("%d\n", filesize);
ptr = mmap(NULL, filesize, PROT_EXEC | PROT_READ | PROT_WRITE, MAP_SHARED, fp, 0);
close(fp);
printf("%p\n", ptr);
shellcode_t f = (shellcode_t)ptr;
f();
munmap(ptr, filesize);
return 0;
}
父子進程通訊
這里我用的是 mmap 匿名映射,所謂匿名映射,表示不存在 fd 這么個真實的文件,不需要映射真實文件
還有一種方法就是映射 /dev/zero
/dev/zero 是一個特殊的文件,當你讀它的時候,它會提供無限的空字符(NULL, ASCII NUL, 0x00)
一個作用是用它作為源,產生一個特定大小的空白文件
#include<stdio.h>
#include<unistd.h>
#include<sys/mman.h>
#include<sys/stat.h>
#include<sys/types.h>
int main(int argc, char *argv[])
{
char *ptr;
ptr = (char *)mmap(NULL, 100, PROT_READ | PROT_WRITE, MAP_SHARED|MAP_ANON, -1, 0);
printf("%p\n", ptr);
if(fork() == 0) {
sleep(1);
printf("chlid got a message: %s\n", ptr);
sprintf(ptr, "%s", "I am your son");
munmap(ptr, 100);
}
sprintf(ptr, "%s" ,"I am your father");
sleep(3);
printf("father got a message: %s\n", ptr);
return 0;
}
這個原理就是,fork 是子進程會 復制(當然是寫時復制 COW,這個不展開) 父進程的東西,但是 mmap 得到的這塊內存是父子進程共同維護的,對這塊內存進行讀寫就能實現進程通信
源碼分析
用法大概試了個遍,我直接追源碼
寫了個 demo 配合自己編譯的 libc
#include<stdio.h>
#include<sys/mman.h>
#include<unistd.h>
int main()
{
void *p = mmap(NULL, 100, PROT_READ | PROT_EXEC | PROT_WRITE, MAP_ANON | MAP_SHARED, -1, 0);
printf("%p\n", p);
return 0;
}
gdb 調試
其實在用戶層沒有什么東西,看 diasm 窗口最后就是使用一個系統調用:
0x7ffff7b0e8fa <mmap64+42> mov r9, rbx
0x7ffff7b0e8fd <mmap64+45> mov r8d, r15d
0x7ffff7b0e900 <mmap64+48> mov r10d, r14d
0x7ffff7b0e903 <mmap64+51> mov edx, r12d
0x7ffff7b0e906 <mmap64+54> mov rsi, r13
0x7ffff7b0e909 <mmap64+57> mov rdi, rbp
0x7ffff7b0e90c <mmap64+60> mov eax, 9
0x7ffff7b0e911 <mmap64+65> syscall
C source code:
位於 glibc 源碼: sysdeps/unix/sysv/linux/mmap64.c
via:https://code.woboq.org/userspace/glibc/sysdeps/unix/sysv/linux/mmap64.c.html#40
/* To avoid silent truncation of offset when using mmap2, do not accept
offset larger than 1 << (page_shift + off_t bits). For archictures with
32 bits off_t and page size of 4096 it would be 1^44. */
#define MMAP_OFF_HIGH_MASK \
((-(MMAP2_PAGE_UNIT << 1) << (8 * sizeof (off_t) - 1)))
#define MMAP_OFF_MASK (MMAP_OFF_HIGH_MASK | MMAP_OFF_LOW_MASK)
/* An architecture may override this. */
#ifndef MMAP_PREPARE
# define MMAP_PREPARE(addr, len, prot, flags, fd, offset)
#endif
void *
__mmap64 (void *addr, size_t len, int prot, int flags, int fd, off64_t offset)
{
MMAP_CHECK_PAGE_UNIT ();
if (offset & MMAP_OFF_MASK)
return (void *) INLINE_SYSCALL_ERROR_RETURN_VALUE (EINVAL);
MMAP_PREPARE (addr, len, prot, flags, fd, offset);
#ifdef __NR_mmap2
return (void *) MMAP_CALL (mmap2, addr, len, prot, flags, fd,
(off_t) (offset / MMAP2_PAGE_UNIT));
#else
return (void *) MMAP_CALL (mmap, addr, len, prot, flags, fd, offset);
#endif
}
weak_alias (__mmap64, mmap64)
libc_hidden_def (__mmap64)
在用戶層就是那么多東西
就是一個 9 號系統調用
cat /usr/include/asm/unistd_64.h
查到
#define __NR_mmap 9
現在追進內核
9 號系統調用有 6 個參數(其實可以利用這個去查找因為是 6 個參數所以 搜索 SYSCALL_DEFINE6(mmap 這里的 6 就是描述參數 )
然后找到:
SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
unsigned long, prot, unsigned long, flags,
unsigned long, fd, unsigned long, pgoff)
{
return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff);
}
這個方法用 vscode 的人比較方便操作
因為我是用 vim + gtags
在內核里面找系統調用
我用的是 5.4-rc4 的 kernel
我查看 arch/x86/entry/syscalls/syscall_64.tbl
9 common mmap __x64_sys_mmap
看到是 sys_mmap
直接跳到 sys_mmap 函數
arch/ia64/kernel/sys_ia64.c
via: https://elixir.bootlin.com/linux/v5.4-rc4/source/arch/ia64/kernel/sys_ia64.c#L149
asmlinkage unsigned long
sys_mmap2 (unsigned long addr, unsigned long len, int prot, int flags, int fd, long pgoff)
{
addr = ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff);
if (!IS_ERR((void *) addr))
force_successful_syscall_return();
return addr;
}
asmlinkage unsigned long
sys_mmap (unsigned long addr, unsigned long len, int prot, int flags, int fd, long off)
{
if (offset_in_page(off) != 0)
return -EINVAL;
addr = ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
if (!IS_ERR((void *) addr))
force_successful_syscall_return();
return addr;
}
跟下去:
mm/mmap.c
via:https://elixir.bootlin.com/linux/v5.4-rc4/source/mm/mmap.c#L1585
unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
unsigned long prot, unsigned long flags,
unsigned long fd, unsigned long pgoff)
{
struct file *file = NULL;
unsigned long retval;
addr = untagged_addr(addr);
//判斷是不是匿名映射
if (!(flags & MAP_ANONYMOUS)) {
audit_mmap_fd(fd, flags);
file = fget(fd);
if (!file)
return -EBADF;
if (is_file_hugepages(file))
len = ALIGN(len, huge_page_size(hstate_file(file)));
retval = -EINVAL;
if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file)))
goto out_fput;
} else if (flags & MAP_HUGETLB) {
struct user_struct *user = NULL;
struct hstate *hs;
hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
if (!hs)
return -EINVAL;
len = ALIGN(len, huge_page_size(hs));
/*
* VM_NORESERVE is used because the reservations will be
* taken when vm_ops->mmap() is called
* A dummy user value is used because we are not locking
* memory so no accounting is necessary
*/
file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
VM_NORESERVE,
&user, HUGETLB_ANONHUGE_INODE,
(flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
if (IS_ERR(file))
return PTR_ERR(file);
}
flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
out_fput:
if (file)
fput(file);
return retval;
}
SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
unsigned long, prot, unsigned long, flags,
unsigned long, fd, unsigned long, pgoff)
{
return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff);
}
mm/util.c
via:https://elixir.bootlin.com/linux/v5.4-rc4/source/mm/util.c#L483
unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flag, unsigned long pgoff)
{
unsigned long ret;
//獲得 描述 當前進程 內存 的 mm_struct
struct mm_struct *mm = current->mm;
unsigned long populate;
LIST_HEAD(uf);
ret = security_mmap_file(file, prot, flag);
if (!ret) {
if (down_write_killable(&mm->mmap_sem))
return -EINTR;
ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
&populate, &uf);
up_write(&mm->mmap_sem);
userfaultfd_unmap_complete(mm, &uf);
if (populate)
mm_populate(ret, populate);
}
return ret;
}
security_mmap_file
via:https://elixir.bootlin.com/linux/v5.4-rc4/source/security/security.c#L1445
int security_mmap_file(struct file *file, unsigned long prot,
unsigned long flags)
{
int ret;
//這個相當調用了 mmap_file 函數,這里的 call_int_hook 是這樣定義的:
/*
#define call_int_hook(FUNC, IRC, ...) ({ \
int RC = IRC; \
do { \
struct security_hook_list *P; \
\
hlist_for_each_entry(P, &security_hook_heads.FUNC, list) { \
RC = P->hook.FUNC(__VA_ARGS__); \
if (RC != 0) \
break; \
} \
} while (0); \
RC; \
})
首先是一個 security_hook_list 結構體
在里面找到了 security_list_options 結構體
在 security_list_options 里面有 mmap_file 函數指針
hlist_for_each_entry 哈希表
security_hook_heads 里面有 struct hlist_head mmap_file;
看的我煩死,反正就是他媽的調用了mmap_file,__VA_ARGS__,把后面的參數(...)傳給 mmap_file
看代碼比看我描述簡單
*/
ret = call_int_hook(mmap_file, 0, file, prot,
mmap_prot(file, prot), flags);
if (ret)
return ret;
return ima_file_mmap(file, prot);
}
mmap_prot (用來處理 prot 參數)
via:https://elixir.bootlin.com/linux/v5.4-rc4/source/security/security.c#L1412
static inline unsigned long mmap_prot(struct file *file, unsigned long prot)
{
/*
* Does we have PROT_READ and does the application expect
* it to imply PROT_EXEC? If not, nothing to talk about...
*/
//自己看上面的英文注釋
if ((prot & (PROT_READ | PROT_EXEC)) != PROT_READ)
return prot;
if (!(current->personality & READ_IMPLIES_EXEC))
return prot;
/*
* if that's an anonymous mapping, let it.
*/
if (!file)
return prot | PROT_EXEC;
/*
* ditto if it's not on noexec mount, except that on !MMU we need
* NOMMU_MAP_EXEC (== VM_MAYEXEC) in this case
*/
if (!path_noexec(&file->f_path)) {
#ifndef CONFIG_MMU
if (file->f_op->mmap_capabilities) {
unsigned caps = file->f_op->mmap_capabilities(file);
if (!(caps & NOMMU_MAP_EXEC))
return prot;
}
#endif
return prot | PROT_EXEC;
}
/* anything on noexec mount won't get PROT_EXEC */
return prot;
}
mmap_file
via:https://elixir.bootlin.com/linux/v5.4-rc4/source/scripts/recordmcount.c#L158
static void *mmap_file(char const *fname)
{
/* Avoid problems if early cleanup() */
fd_map = -1;
mmap_failed = 1;
file_map = NULL;
file_ptr = NULL;
file_updated = 0;
sb.st_size = 0;
fd_map = open(fname, O_RDONLY);
if (fd_map < 0) {
perror(fname);
return NULL;
}
if (fstat(fd_map, &sb) < 0) {
perror(fname);
goto out;
}
if (!S_ISREG(sb.st_mode)) {
fprintf(stderr, "not a regular file: %s\n", fname);
goto out;
}
file_map = mmap(0, sb.st_size, PROT_READ|PROT_WRITE, MAP_PRIVATE,
fd_map, 0);
if (file_map == MAP_FAILED) {
mmap_failed = 1;
file_map = umalloc(sb.st_size);
if (!file_map) {
perror(fname);
goto out;
}
if (read(fd_map, file_map, sb.st_size) != sb.st_size) {
perror(fname);
free(file_map);
file_map = NULL;
goto out;
}
} else
mmap_failed = 0;
out:
close(fd_map);
fd_map = -1;
file_end = file_map + sb.st_size;
return file_map;
}
主要的函數調用路線是這樣的:
vm_mmap_pgoff -> do_mmap_pgoff -> do_mmap -> mmap_region
主要就是后面兩個函數
via:https://elixir.bootlin.com/linux/v5.4-rc4/source/mm/mmap.c#L1397
unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flags, vm_flags_t vm_flags,
unsigned long pgoff, unsigned long *populate,
struct list_head *uf)
{
struct mm_struct *mm = current->mm;
int pkey = 0;
*populate = 0;
if (!len)
return -EINVAL;
/*
* Does the application expect PROT_READ to imply PROT_EXEC?
*
* (the exception is when the underlying filesystem is noexec
* mounted, in which case we dont add PROT_EXEC.)
*/
//判斷權限,看上面的注釋(懶得翻譯)
if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
if (!(file && path_noexec(&file->f_path)))
prot |= PROT_EXEC;
/* force arch specific MAP_FIXED handling in get_unmapped_area */
if (flags & MAP_FIXED_NOREPLACE)
flags |= MAP_FIXED;
if (!(flags & MAP_FIXED))
addr = round_hint_to_min(addr);
/* Careful about overflows.. */
/*判斷溢出*/
len = PAGE_ALIGN(len);
if (!len)
return -ENOMEM;
/* offset overflow? */
if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
return -EOVERFLOW;
/* Too many mappings? */
if (mm->map_count > sysctl_max_map_count)
return -ENOMEM;
/* Obtain the address to map to. we verify (or select) it and ensure
* that it represents a valid section of the address space.
*/
addr = get_unmapped_area(file, addr, len, pgoff, flags);
if (offset_in_page(addr))
return addr;
if (flags & MAP_FIXED_NOREPLACE) {
struct vm_area_struct *vma = find_vma(mm, addr);
if (vma && vma->vm_start < addr + len)
return -EEXIST;
}
if (prot == PROT_EXEC) {
pkey = execute_only_pkey(mm);
if (pkey < 0)
pkey = 0;
}
/* Do simple checking here so the lower-level routines won't have
* to. we assume access permissions have been handled by the open
* of the memory object, so we don't do any here.
*/
vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
if (flags & MAP_LOCKED)
if (!can_do_mlock())
return -EPERM;
if (mlock_future_check(mm, vm_flags, len))
return -EAGAIN;
if (file) {
struct inode *inode = file_inode(file);
unsigned long flags_mask;
if (!file_mmap_ok(file, inode, pgoff, len))
return -EOVERFLOW;
flags_mask = LEGACY_MAP_MASK | file->f_op->mmap_supported_flags;
switch (flags & MAP_TYPE) {
case MAP_SHARED:
/*
* Force use of MAP_SHARED_VALIDATE with non-legacy
* flags. E.g. MAP_SYNC is dangerous to use with
* MAP_SHARED as you don't know which consistency model
* you will get. We silently ignore unsupported flags
* with MAP_SHARED to preserve backward compatibility.
*/
flags &= LEGACY_MAP_MASK;
/* fall through */
case MAP_SHARED_VALIDATE:
if (flags & ~flags_mask)
return -EOPNOTSUPP;
if (prot & PROT_WRITE) {
if (!(file->f_mode & FMODE_WRITE))
return -EACCES;
if (IS_SWAPFILE(file->f_mapping->host))
return -ETXTBSY;
}
/*
* Make sure we don't allow writing to an append-only
* file..
*/
if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
return -EACCES;
/*
* Make sure there are no mandatory locks on the file.
*/
if (locks_verify_locked(file))
return -EAGAIN;
vm_flags |= VM_SHARED | VM_MAYSHARE;
if (!(file->f_mode & FMODE_WRITE))
vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
/* fall through */
case MAP_PRIVATE:
if (!(file->f_mode & FMODE_READ))
return -EACCES;
if (path_noexec(&file->f_path)) {
if (vm_flags & VM_EXEC)
return -EPERM;
vm_flags &= ~VM_MAYEXEC;
}
if (!file->f_op->mmap)
return -ENODEV;
if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
return -EINVAL;
break;
default:
return -EINVAL;
}
} else {
switch (flags & MAP_TYPE) {
case MAP_SHARED:
if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
return -EINVAL;
/*
* Ignore pgoff.
*/
pgoff = 0;
vm_flags |= VM_SHARED | VM_MAYSHARE;
break;
case MAP_PRIVATE:
/*
* Set pgoff according to addr for anon_vma.
*/
pgoff = addr >> PAGE_SHIFT;
break;
default:
return -EINVAL;
}
}
/*
* Set 'VM_NORESERVE' if we should not account for the
* memory use of this mapping.
*/
if (flags & MAP_NORESERVE) {
/* We honor MAP_NORESERVE if allowed to overcommit */
if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
vm_flags |= VM_NORESERVE;
/* hugetlb applies strict overcommit unless MAP_NORESERVE */
if (file && is_file_hugepages(file))
vm_flags |= VM_NORESERVE;
}
addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
if (!IS_ERR_VALUE(addr) &&
((vm_flags & VM_LOCKED) ||
(flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
*populate = len;
return addr;
}
mmap_region
via:https://elixir.bootlin.com/linux/v5.4-rc4/source/mm/mmap.c#L1721
unsigned long mmap_region(struct file *file, unsigned long addr,
unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
struct list_head *uf)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *prev;
int error;
struct rb_node **rb_link, *rb_parent;
unsigned long charged = 0;
/* Check against address space limit. */
if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
unsigned long nr_pages;
/*
* MAP_FIXED may remove pages of mappings that intersects with
* requested mapping. Account for the pages it would unmap.
*/
nr_pages = count_vma_pages_range(mm, addr, addr + len);
if (!may_expand_vm(mm, vm_flags,
(len >> PAGE_SHIFT) - nr_pages))
return -ENOMEM;
}
/* Clear old maps */
while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
&rb_parent)) {
if (do_munmap(mm, addr, len, uf))
return -ENOMEM;
}
/*
* Private writable mapping: check memory availability
*/
if (accountable_mapping(file, vm_flags)) {
charged = len >> PAGE_SHIFT;
if (security_vm_enough_memory_mm(mm, charged))
return -ENOMEM;
vm_flags |= VM_ACCOUNT;
}
/*
* Can we just expand an old mapping?
*/
vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX);
if (vma)
goto out;
/*
* Determine the object being mapped and call the appropriate
* specific mapper. the address has already been validated, but
* not unmapped, but the maps are removed from the list.
*/
vma = vm_area_alloc(mm);
if (!vma) {
error = -ENOMEM;
goto unacct_error;
}
vma->vm_start = addr;
vma->vm_end = addr + len;
vma->vm_flags = vm_flags;
vma->vm_page_prot = vm_get_page_prot(vm_flags);
vma->vm_pgoff = pgoff;
if (file) {
if (vm_flags & VM_DENYWRITE) {
error = deny_write_access(file);
if (error)
goto free_vma;
}
if (vm_flags & VM_SHARED) {
error = mapping_map_writable(file->f_mapping);
if (error)
goto allow_write_and_free_vma;
}
/* ->mmap() can change vma->vm_file, but must guarantee that
* vma_link() below can deny write-access if VM_DENYWRITE is set
* and map writably if VM_SHARED is set. This usually means the
* new file must not have been exposed to user-space, yet.
*/
vma->vm_file = get_file(file);
error = call_mmap(file, vma);
if (error)
goto unmap_and_free_vma;
/* Can addr have changed??
*
* Answer: Yes, several device drivers can do it in their
* f_op->mmap method. -DaveM
* Bug: If addr is changed, prev, rb_link, rb_parent should
* be updated for vma_link()
*/
WARN_ON_ONCE(addr != vma->vm_start);
addr = vma->vm_start;
vm_flags = vma->vm_flags;
} else if (vm_flags & VM_SHARED) {
error = shmem_zero_setup(vma);
if (error)
goto free_vma;
} else {
vma_set_anonymous(vma);
}
vma_link(mm, vma, prev, rb_link, rb_parent);
/* Once vma denies write, undo our temporary denial count */
if (file) {
if (vm_flags & VM_SHARED)
mapping_unmap_writable(file->f_mapping);
if (vm_flags & VM_DENYWRITE)
allow_write_access(file);
}
file = vma->vm_file;
out:
perf_event_mmap(vma);
vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
if (vm_flags & VM_LOCKED) {
if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
is_vm_hugetlb_page(vma) ||
vma == get_gate_vma(current->mm))
vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
else
mm->locked_vm += (len >> PAGE_SHIFT);
}
if (file)
uprobe_mmap(vma);
/*
* New (or expanded) vma always get soft dirty status.
* Otherwise user-space soft-dirty page tracker won't
* be able to distinguish situation when vma area unmapped,
* then new mapped in-place (which must be aimed as
* a completely new data area).
*/
vma->vm_flags |= VM_SOFTDIRTY;
vma_set_page_prot(vma);
return addr;
unmap_and_free_vma:
vma->vm_file = NULL;
fput(file);
/* Undo any partial mapping done by a device driver. */
unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
charged = 0;
if (vm_flags & VM_SHARED)
mapping_unmap_writable(file->f_mapping);
allow_write_and_free_vma:
if (vm_flags & VM_DENYWRITE)
allow_write_access(file);
free_vma:
vm_area_free(vma);
unacct_error:
if (charged)
vm_unacct_memory(charged);
return error;
}
這里涉及的面有點龐大,全部追蹤下去的話還會涉及 文件 的部分。我就簡單描述一下,其實 mmap 的最終 映射的內存是由 vm_area_struct 來描述,里面有兩個指針:vm_start, vm_end,描述映射區的起始地址和結束地址
vma = vm_area_alloc(mm);
if (!vma) {
error = -ENOMEM;
goto unacct_error;
}
vma->vm_start = addr;
vma->vm_end = addr + len;
vma->vm_flags = vm_flags;
vma->vm_page_prot = vm_get_page_prot(vm_flags);
vma->vm_pgoff = pgoff;
里面提到 current- > mm 這個 current 就是描述當先進程信息的 task_struct