VMA概述
进程低3G用作进程的用户空间,一般的进程用不了这么大的空间,在程序运行过程中会通过malloc、mmap等函数动态申请虚拟内存空间,函数会返回一个虚拟地址给用户。因此内核必然要对每次申请的虚拟地址块进行管理,以跟踪虚拟内存的使用情况,并在访问虚拟地址时将其映射到物理内存。
/*
 * This struct describes a virtual memory area. There is one of these
 * per VM-area/task. A VM area is any part of the process virtual memory
 * space that has a special rule for the page-fault handlers (ie a shared
 * library, the executable area etc).
 */
struct vm_area_struct {
	/* The first cache line has the info for VMA tree walking. */

	unsigned long vm_start;		/* Our start address within vm_mm. */
	unsigned long vm_end;		/* The first byte after our end address
					   within vm_mm. */

	/* linked list of VM areas per task, sorted by address */
	struct vm_area_struct *vm_next, *vm_prev;

	struct rb_node vm_rb;		/* node in mm_struct's VMA rbtree */

	/*
	 * Largest free memory gap in bytes to the left of this VMA.
	 * Either between this VMA and vma->vm_prev, or between one of the
	 * VMAs below us in the VMA rbtree and its ->vm_prev. This helps
	 * get_unmapped_area find a free area of the right size.
	 */
	unsigned long rb_subtree_gap;

	/* Second cache line starts here. */

	struct mm_struct *vm_mm;	/* The address space we belong to. */

	/*
	 * Access permissions of this VMA.
	 * See vmf_insert_mixed_prot() for discussion.
	 */
	pgprot_t vm_page_prot;
	unsigned long vm_flags;		/* Flags, see mm.h. */

	/*
	 * For areas with an address space and backing store,
	 * linkage into the address_space->i_mmap interval tree.
	 *
	 * For private anonymous mappings, a pointer to a null terminated string
	 * containing the name given to the vma, or NULL if unnamed.
	 */
	union {
		struct {
			struct rb_node rb;
			unsigned long rb_subtree_last;
		} shared;
		/*
		 * Serialized by mmap_sem. Never use directly because it is
		 * valid only when vm_file is NULL. Use anon_vma_name instead.
		 */
		struct anon_vma_name *anon_name;
	};

	/*
	 * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
	 * list, after a COW of one of the file pages. A MAP_SHARED vma
	 * can only be in the i_mmap tree. An anonymous MAP_PRIVATE, stack
	 * or brk vma (with NULL file) can only be in an anon_vma list.
	 */
	struct list_head anon_vma_chain; /* Serialized by mmap_lock &
					  * page_table_lock */
	struct anon_vma *anon_vma;	/* Serialized by page_table_lock */

	/* Function pointers to deal with this struct. */
	const struct vm_operations_struct *vm_ops;

	/* Information about our backing store: */
	unsigned long vm_pgoff;		/* Offset (within vm_file) in PAGE_SIZE
					   units */
	struct file *vm_file;		/* File we map to (can be NULL). */
	void *vm_private_data;		/* was vm_pte (shared mem) */

#ifdef CONFIG_SWAP
	atomic_long_t swap_readahead_info;
#endif
#ifndef CONFIG_MMU
	struct vm_region *vm_region;	/* NOMMU mapping region */
#endif
#ifdef CONFIG_NUMA
	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
#endif
	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
};	/* NOTE: the original excerpt was missing this terminating ';' */
/*
 * vm_flags bits for vm_area_struct::vm_flags (see include/linux/mm.h).
 * The low r/w/x bits mirror the MAY* bits shifted right by 4, which
 * mprotect() relies on.
 */
#define VM_NONE		0x00000000	/* no flags */
#define VM_READ		0x00000001	/* readable */
#define VM_WRITE	0x00000002	/* writable */
#define VM_EXEC		0x00000004	/* executable */
#define VM_SHARED	0x00000008	/* may be shared between tasks */
/* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */
#define VM_MAYREAD	0x00000010	/* VM_READ may be set */
#define VM_MAYWRITE	0x00000020	/* VM_WRITE may be set */
#define VM_MAYEXEC	0x00000040	/* VM_EXEC may be set */
#define VM_MAYSHARE	0x00000080	/* VM_SHARED may be set */
#define VM_GROWSDOWN	0x00000100	/* area grows toward lower addresses */
#define VM_UFFD_MISSING	0x00000200
#define VM_PFNMAP	0x00000400
#define VM_DENYWRITE	0x00000800	/* writes to the backing file denied */
#define VM_UFFD_WP	0x00001000
#define VM_LOCKED	0x00002000	/* pages locked, never swapped out */
#define VM_IO		0x00004000	/* memory-mapped I/O area */
/*
 * BUG FIX: was 0x0000800 (a 7-digit literal equal to VM_DENYWRITE);
 * the upstream kernel value is 0x00008000.
 */
#define VM_SEQ_READ	0x00008000	/* app will access data sequentially */
#define VM_RAND_READ	0x00010000	/* app will access data randomly */
#define VM_DONTCOPY	0x00020000	/* do not copy this VMA on fork */
#define VM_DONTEXPAND	0x00040000
#define VM_LOCKONFAULT	0x00080000	/* lock pages covered when faulted in */
#define VM_ACCOUNT	0x00100000
#define VM_NORESERVE	0x00200000	/* do not reserve swap space */
#define VM_HUGETLB	0x00400000	/* huge TLB page VM */
#define VM_SYNC		0x00800000	/* synchronous page faults */
#define VM_ARCH_1	0x01000000
#define VM_WIPEONFORK	0x02000000	/* child gets no page tables from parent */
#define VM_DONTDUMP	0x04000000	/* exclude VMA from core dumps */
#ifdef CONFIG_MEM_SOFT_DIRTY
# define VM_SOFTDIRTY	0x08000000
#else
# define VM_SOFTDIRTY	0
#endif
#define VM_MIXEDMAP	0x10000000
#define VM_HUGEPAGE	0x20000000
#define VM_NOHUGEPAGE	0x40000000
#define VM_MERGEABLE	0x80000000
- vm_start:虚拟地址区域的起始地址
- vm_end:虚拟地址区域的结束地址
- vm_next、vm_prev:VMA链表指针
- vm_flags:VMA状态标志
- vm_page_prot:VMA访问权限,代码段可读可执行但不可写,数据段可写可读不可执行
- vm_mm:指向所属进程的mm_struct结构
- anon_vma_chain、anon_vma:匿名页反向映射会用到,在内核中匿名页会构成一个基数树
- vm_ops:虚拟地址空间操作函数列表,通常用于文件映射
- vm_file:如果是通过mmap进行的文件映射,这个字段指向struct file结构
- vm_pgoff:此内存区域在文件中的页面偏移,以页为单位。
- vm_rb:红黑树节点,每个mm_struct中都有一个红黑树根节点,当进程的VMA数量很多时可以遍历这个红黑树快速定位
- shared.rb:对于文件映射,链入address_space->i_mmap树,反向映射会用到
内核中使用vm_area_struct(简写为VMA)结构对虚拟地址空间进行管理,每个进程的task_struct结构中有一个指向mm_struct结构的字段mm,mm_struct中的mmap字段指向进程中的VMA链表,同时还有一个红黑树来加快VMA的查找速度。不仅进程通过malloc、mmap申请的内存块会用VMA管理,进程本身的代码段、data段、栈也在其管理之下,malloc申请的虚拟地址空间在堆中,mmap使用的地址空间在mapping中。
子进程在fork后通过copy_mm函数复制父进程的VMA相关结构,当子进程只读VMA时和父进程共用相同的物理内存,需要写的时候通过COW机制为子进程重新分配内存页面,将VMA空间映射到新页面。
VMA管理
查找VMA
给定一个虚拟地址addr,通过搜索VMA链表查找这个地址落在哪个VMA地址空间中是内核中常见的操作,内核通过find_vma函数来实现。find_vma()函数根据给定地址addr查找满足如下条件之一的 VMA。
- 线性地址满足条件vma->vm_start <= addr < vma->vm_end,落在一个VMA地址空间
- 距离addr最近并且VMA的结束地址大于addr的一个VMA
/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
	struct rb_node *node;
	struct vm_area_struct *found;

	mmap_assert_locked(mm);

	/* Fast path: per-task cache of recently used VMAs. */
	found = vmacache_find(mm, addr);
	if (likely(found))
		return found;

	/*
	 * Slow path: walk the mm rbtree. Track the lowest VMA whose
	 * vm_end lies above addr; stop early if it also contains addr.
	 */
	for (node = mm->mm_rb.rb_node; node; ) {
		struct vm_area_struct *cur;

		cur = rb_entry(node, struct vm_area_struct, vm_rb);
		if (cur->vm_end > addr) {
			found = cur;
			if (cur->vm_start <= addr)
				break;		/* addr is inside cur */
			node = node->rb_left;
		} else {
			node = node->rb_right;
		}
	}

	if (found)
		vmacache_update(addr, found);	/* remember for next lookup */
	return found;
}
查找是先通过vmacache_find函数在VMA缓存中查找,VMA缓存会记录最近访问的VMA指针,每次查到新的VMA将更新缓存,这样可以加快查找速度。缓存中查不到时获取mm_struct中VMA红黑树的根节点,在红黑树中遍历查找VMA。
内核中还有另外一个查找函数find_vma_intersection函数,这个函数可以查找和给定地址区域有重合的VMA。find_vma_prev在返回满足地址VMA的同时,还返回前一个VMA。
/*
 * Find the first VMA overlapping [start_addr, end_addr), or NULL if the
 * range falls entirely in a gap.
 */
struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
					     unsigned long start_addr,
					     unsigned long end_addr)
{
	struct vm_area_struct *vma = find_vma(mm, start_addr);

	if (!vma)
		return NULL;
	/* find_vma() may return a VMA lying wholly above the range. */
	return end_addr > vma->vm_start ? vma : NULL;
}
/*
* Same as find_vma, but also return a pointer to the previous VMA in *pprev.
*/
/*
 * Same as find_vma, but also return a pointer to the previous VMA in *pprev.
 */
struct vm_area_struct *
find_vma_prev(struct mm_struct *mm, unsigned long addr,
	      struct vm_area_struct **pprev)
{
	struct vm_area_struct *vma = find_vma(mm, addr);
	struct rb_node *last;

	if (vma) {
		*pprev = vma->vm_prev;
		return vma;
	}

	/* No VMA at or above addr: the "previous" one is the highest VMA. */
	last = rb_last(&mm->mm_rb);
	*pprev = last ? rb_entry(last, struct vm_area_struct, vm_rb) : NULL;
	return NULL;
}
插入VMA
按地址大小插入VMA链表是通过insert_vm_struct函数实现的。
/* Insert vm structure into process list sorted by address
* and into the inode's i_mmap tree. If vm_file is non-NULL
* then i_mmap_rwsem is taken here.
*/
/* Insert vm structure into process list sorted by address
 * and into the inode's i_mmap tree. If vm_file is non-NULL
 * then i_mmap_rwsem is taken here.
 */
int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
{
	struct vm_area_struct *prev_vma;
	struct rb_node **link;
	struct rb_node *parent;

	/* Locate the insertion point; fail if the range overlaps a VMA. */
	if (find_vma_links(mm, vma->vm_start, vma->vm_end,
			   &prev_vma, &link, &parent))
		return -ENOMEM;

	/* Accounted mappings must be charged against the commit limit. */
	if ((vma->vm_flags & VM_ACCOUNT) &&
	    security_vm_enough_memory_mm(mm, vma_pages(vma)))
		return -ENOMEM;

	/*
	 * The vm_pgoff of a purely anonymous vma should be irrelevant
	 * until its first write fault, when page's anon_vma and index
	 * are set. But now set the vm_pgoff it will almost certainly
	 * end up with (unless mremap moves it elsewhere before that
	 * first wfault), so /proc/pid/maps tells a consistent story.
	 *
	 * By setting it to reflect the virtual start address of the
	 * vma, merges and splits can happen in a seamless way, just
	 * using the existing file pgoff checks and manipulations.
	 * Similarly in do_mmap and in do_brk_flags.
	 */
	if (vma_is_anonymous(vma)) {
		BUG_ON(vma->anon_vma);
		vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
	}

	vma_link(mm, vma, prev_vma, link, parent);
	return 0;
}
- find_vma_links主要工作是从小地址开始查找一个足够容纳新vma结构的地址空间,如果发现现存的vma和新的地址重合则返回错误,否则返回新地址空间前一个vma的链表信息和红黑树信息。
- 设置了VM_ACCOUNT标志后调用security_vm_enough_memory_mm函数查看进程是否有足够的内存可用,大多数用户空间映射都设置了VM_ACCOUNT标志。
- 如果是匿名映射将vm_pgoff指向区域首地址从0开始的偏移页
- vma_link将新VMA插入链表和红黑树,如果是文件映射则还要将VMA加入到address_space中的i_mmap区间树(interval tree)中。
合并VMA
在新的 VMA 被加入到进程的地址空间时,内核会检查它是否可以与一个或多个现存的 VMA 进行合并。vma_merge()函数实现将一个新的 VMA 和附近的 VMA 合并功能。合并要满足很多条件,因此代码里做了大量的校验工作。
/*
 * Try to expand an existing VMA (prev and/or the one following it) to
 * absorb the new range [addr, end) instead of creating a fresh VMA.
 * Returns the resulting merged VMA, or NULL if no merge was possible
 * (the caller then inserts a new vm_area_struct).
 * "cases 1..8" in the comments refer to the merge-case diagram that
 * accompanies this function in mm/mmap.c.
 */
struct vm_area_struct *vma_merge(struct mm_struct *mm,
struct vm_area_struct *prev, unsigned long addr,
unsigned long end, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file,
pgoff_t pgoff, struct mempolicy *policy,
struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
struct anon_vma_name *anon_name)
{
pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
struct vm_area_struct *area, *next;
int err;
/*
 * We later require that vma->vm_flags == vm_flags,
 * so this tests vma->vm_flags & VM_SPECIAL, too.
 */
if (vm_flags & VM_SPECIAL)
return NULL;
/* 'next' is the first VMA after prev; 'area' tracks the merge result. */
next = vma_next(mm, prev);
area = next;
if (area && area->vm_end == end) /* cases 6, 7, 8 */
next = next->vm_next;
/* verify some invariant that must be enforced by the caller */
VM_WARN_ON(prev && addr <= prev->vm_start);
VM_WARN_ON(area && end > area->vm_end);
VM_WARN_ON(addr >= end);
/*
 * Can it merge with the predecessor?
 */
if (prev && prev->vm_end == addr &&
mpol_equal(vma_policy(prev), policy) &&
can_vma_merge_after(prev, vm_flags,
anon_vma, file, pgoff,
vm_userfaultfd_ctx, anon_name)) {
/*
 * OK, it can. Can we now merge in the successor as well?
 */
if (next && end == next->vm_start &&
mpol_equal(policy, vma_policy(next)) &&
can_vma_merge_before(next, vm_flags,
anon_vma, file,
pgoff+pglen,
vm_userfaultfd_ctx, anon_name) &&
is_mergeable_anon_vma(prev->anon_vma,
next->anon_vma, NULL)) {
/* cases 1, 6 */
/* prev swallows both the new range and next */
err = __vma_adjust(prev, prev->vm_start,
next->vm_end, prev->vm_pgoff, NULL,
prev);
} else /* cases 2, 5, 7 */
/* prev is only extended forward to 'end' */
err = __vma_adjust(prev, prev->vm_start,
end, prev->vm_pgoff, NULL, prev);
if (err)
return NULL;
khugepaged_enter_vma_merge(prev, vm_flags);
return prev;
}
/*
 * Can this new request be merged in front of next?
 */
if (next && end == next->vm_start &&
mpol_equal(policy, vma_policy(next)) &&
can_vma_merge_before(next, vm_flags,
anon_vma, file, pgoff+pglen,
vm_userfaultfd_ctx, anon_name)) {
if (prev && addr < prev->vm_end) /* case 4 */
/* shrink prev back to 'addr'; next absorbs the rest */
err = __vma_adjust(prev, prev->vm_start,
addr, prev->vm_pgoff, NULL, next);
else { /* cases 3, 8 */
/* extend next backwards to cover [addr, ...) */
err = __vma_adjust(area, addr, next->vm_end,
next->vm_pgoff - pglen, NULL, next);
/*
 * In case 3 area is already equal to next and
 * this is a noop, but in case 8 "area" has
 * been removed and next was expanded over it.
 */
area = next;
}
if (err)
return NULL;
khugepaged_enter_vma_merge(area, vm_flags);
return area;
}
/* No merge was possible: caller must insert a brand-new VMA. */
return NULL;
}
本文中部分图片参考来自互联网,在此表示感谢!