VMA概述
进程低3G用作进程的用户空间,一般的进程用不了这么大的空间,在程序运行过程中会通过malloc、mmap等函数动态申请虚拟内存空间,函数会返回一个虚拟地址给用户。因此内核必然要对每次申请的虚拟地址块进行管理,以跟踪虚拟内存的使用情况,并在访问虚拟地址时将其映射到物理内存。
/*
 * This struct describes a virtual memory area. There is one of these
 * per VM-area/task. A VM area is any part of the process virtual memory
 * space that has a special rule for the page-fault handlers (ie a shared
 * library, the executable area etc).
 */
struct vm_area_struct {
	/* The first cache line has the info for VMA tree walking. */

	unsigned long vm_start;		/* Our start address within vm_mm. */
	unsigned long vm_end;		/* The first byte after our end address
					   within vm_mm. */

	/* linked list of VM areas per task, sorted by address */
	struct vm_area_struct *vm_next, *vm_prev;

	struct rb_node vm_rb;		/* node in mm_struct's VMA rbtree */

	/*
	 * Largest free memory gap in bytes to the left of this VMA.
	 * Either between this VMA and vma->vm_prev, or between one of the
	 * VMAs below us in the VMA rbtree and its ->vm_prev. This helps
	 * get_unmapped_area find a free area of the right size.
	 */
	unsigned long rb_subtree_gap;

	/* Second cache line starts here. */

	struct mm_struct *vm_mm;	/* The address space we belong to. */

	/*
	 * Access permissions of this VMA.
	 * See vmf_insert_mixed_prot() for discussion.
	 */
	pgprot_t vm_page_prot;
	unsigned long vm_flags;		/* Flags, see mm.h. */

	/*
	 * For areas with an address space and backing store,
	 * linkage into the address_space->i_mmap interval tree.
	 *
	 * For private anonymous mappings, a pointer to a null terminated string
	 * containing the name given to the vma, or NULL if unnamed.
	 */
	union {
		struct {
			struct rb_node rb;
			unsigned long rb_subtree_last;
		} shared;
		/*
		 * Serialized by mmap_sem. Never use directly because it is
		 * valid only when vm_file is NULL. Use anon_vma_name instead.
		 */
		struct anon_vma_name *anon_name;
	};

	/*
	 * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
	 * list, after a COW of one of the file pages. A MAP_SHARED vma
	 * can only be in the i_mmap tree. An anonymous MAP_PRIVATE, stack
	 * or brk vma (with NULL file) can only be in an anon_vma list.
	 */
	struct list_head anon_vma_chain; /* Serialized by mmap_lock &
					  * page_table_lock */
	struct anon_vma *anon_vma;	/* Serialized by page_table_lock */

	/* Function pointers to deal with this struct. */
	const struct vm_operations_struct *vm_ops;

	/* Information about our backing store: */
	unsigned long vm_pgoff;		/* Offset (within vm_file) in PAGE_SIZE
					   units */
	struct file *vm_file;		/* File we map to (can be NULL). */
	void *vm_private_data;		/* was vm_pte (shared mem) */

#ifdef CONFIG_SWAP
	atomic_long_t swap_readahead_info;
#endif
#ifndef CONFIG_MMU
	struct vm_region *vm_region;	/* NOMMU mapping region */
#endif
#ifdef CONFIG_NUMA
	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
#endif
	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
};	/* NOTE: the original excerpt was missing this terminating ';' */
/*
 * vm_flags bits for vm_area_struct::vm_flags (see include/linux/mm.h).
 * The low r/w/x bits mirror the MAY* bits shifted right by 4, which
 * mprotect() relies on.
 */
#define VM_NONE		0x00000000	/* no flags */
#define VM_READ		0x00000001	/* readable */
#define VM_WRITE	0x00000002	/* writable */
#define VM_EXEC		0x00000004	/* executable */
#define VM_SHARED	0x00000008	/* may be shared between tasks */
/* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */
#define VM_MAYREAD	0x00000010	/* VM_READ may be set */
#define VM_MAYWRITE	0x00000020	/* VM_WRITE may be set */
#define VM_MAYEXEC	0x00000040	/* VM_EXEC may be set */
#define VM_MAYSHARE	0x00000080	/* VM_SHARED may be set */
#define VM_GROWSDOWN	0x00000100	/* area grows toward lower addresses */
#define VM_UFFD_MISSING	0x00000200
#define VM_PFNMAP	0x00000400
#define VM_DENYWRITE	0x00000800	/* writes to the backing file denied */
#define VM_UFFD_WP	0x00001000
#define VM_LOCKED	0x00002000	/* pages locked, never swapped out */
#define VM_IO		0x00004000	/* memory-mapped I/O area */
/*
 * BUG FIX: was 0x0000800 (a 7-digit literal equal to VM_DENYWRITE);
 * the upstream kernel value is 0x00008000.
 */
#define VM_SEQ_READ	0x00008000	/* app will access data sequentially */
#define VM_RAND_READ	0x00010000	/* app will access data randomly */
#define VM_DONTCOPY	0x00020000	/* do not copy this VMA on fork */
#define VM_DONTEXPAND	0x00040000
#define VM_LOCKONFAULT	0x00080000	/* lock pages covered when faulted in */
#define VM_ACCOUNT	0x00100000
#define VM_NORESERVE	0x00200000	/* do not reserve swap space */
#define VM_HUGETLB	0x00400000	/* huge TLB page VM */
#define VM_SYNC		0x00800000	/* synchronous page faults */
#define VM_ARCH_1	0x01000000
#define VM_WIPEONFORK	0x02000000	/* child gets no page tables from parent */
#define VM_DONTDUMP	0x04000000	/* exclude VMA from core dumps */
#ifdef CONFIG_MEM_SOFT_DIRTY
# define VM_SOFTDIRTY	0x08000000
#else
# define VM_SOFTDIRTY	0
#endif
#define VM_MIXEDMAP	0x10000000
#define VM_HUGEPAGE	0x20000000
#define VM_NOHUGEPAGE	0x40000000
#define VM_MERGEABLE	0x80000000
- vm_start:虚拟地址区域的起始地址
- vm_end:虚拟地址区域的结束地址
- vm_next、vm_prev:VMA链表指针
- vm_flags:VMA状态标志
- vm_page_prot:VMA访问权限,代码段可读可执行但不可写,数据段可写可读不可执行
- vm_mm:指向所属进程的mm_struct结构
- anon_vma_chain、anon_vma:匿名页反向映射会用到,在内核中匿名页会构成一个基数树
- vm_ops:虚拟地址空间操作函数列表,通常用于文件映射
- vm_file:如果是通过mmap进行的文件映射,这个字段指向struct file结构
- vm_pgoff:此内存区域在文件中的页面偏移,以页为单位。
- vm_rb:红黑树节点,每个mm_struct中都有一个红黑树根节点,当进程的VMA数量很多时可以遍历这个红黑树快速定位
- shared.rb:对于文件映射,链入address_space->i_mmap树,反向映射会用到
内核中使用vm_area_struct(简写为VMA)结构对虚拟地址空间进行管理,每个进程的task_struct结构中有一个指向mm_struct结构的字段mm,mm_struct中的mmap字段指向进程中的VMA链表,同时还有一个红黑树来加快VMA的查找速度。不仅进程通过malloc、mmap申请的内存块会用VMA管理,进程本身的代码段、data段、栈也在其管理之下,malloc申请的虚拟地址空间在堆中,mmap使用的地址空间在mapping中。
子进程在fork后通过copy_mm函数复制父进程的VMA相关结构,当子进程只读VMA时和父进程共用相同的物理内存,需要写的时候通过COW机制为子进程重新分配内存页面,将VMA空间映射到新页面。
VMA管理
查找VMA
给定一个虚拟地址addr,通过搜索VMA链表查找这个地址落在哪个VMA地址空间中是内核中常见的操作,内核通过find_vma函数来实现。find_vma()函数根据给定地址addr查找满足如下条件之一的 VMA。
- 线性地址满足条件vma->vm_start <= addr < vma->vm_end,落在一个VMA地址空间
- 距离addr最近并且VMA的结束地址大于addr的一个VMA
/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
	struct rb_node *node;
	struct vm_area_struct *found;

	mmap_assert_locked(mm);

	/* Fast path: per-task cache of recently used VMAs. */
	found = vmacache_find(mm, addr);
	if (likely(found))
		return found;

	/*
	 * Slow path: walk the mm rbtree. Track the lowest VMA whose
	 * vm_end lies above addr; stop early if it also contains addr.
	 */
	for (node = mm->mm_rb.rb_node; node; ) {
		struct vm_area_struct *cur;

		cur = rb_entry(node, struct vm_area_struct, vm_rb);
		if (cur->vm_end > addr) {
			found = cur;
			if (cur->vm_start <= addr)
				break;		/* addr is inside cur */
			node = node->rb_left;
		} else {
			node = node->rb_right;
		}
	}

	if (found)
		vmacache_update(addr, found);	/* remember for next lookup */
	return found;
}
查找是先通过vmacache_find函数在VMA缓存中查找,VMA缓存会记录最近访问的VMA指针,每次查到新的VMA将更新缓存,这样可以加快查找速度。缓存中查不到时获取mm_struct中VMA红黑树的根节点,在红黑树中遍历查找VMA。
内核中还有另外一个查找函数find_vma_intersection函数,这个函数可以查找和给定地址区域有重合的VMA。find_vma_prev在返回满足地址VMA的同时,还返回前一个VMA。
/*
 * Find the first VMA overlapping [start_addr, end_addr), or NULL if the
 * range falls entirely in a gap.
 */
struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
					     unsigned long start_addr,
					     unsigned long end_addr)
{
	struct vm_area_struct *vma = find_vma(mm, start_addr);

	if (!vma)
		return NULL;
	/* find_vma() may return a VMA lying wholly above the range. */
	return end_addr > vma->vm_start ? vma : NULL;
}
/*
* Same as find_vma, but also return a pointer to the previous VMA in *pprev.
*/
/*
 * Same as find_vma, but also return a pointer to the previous VMA in *pprev.
 */
struct vm_area_struct *
find_vma_prev(struct mm_struct *mm, unsigned long addr,
	      struct vm_area_struct **pprev)
{
	struct vm_area_struct *vma = find_vma(mm, addr);
	struct rb_node *last;

	if (vma) {
		*pprev = vma->vm_prev;
		return vma;
	}

	/* No VMA at or above addr: the "previous" one is the highest VMA. */
	last = rb_last(&mm->mm_rb);
	*pprev = last ? rb_entry(last, struct vm_area_struct, vm_rb) : NULL;
	return NULL;
}
插入VMA
按地址大小插入VMA链表是通过insert_vm_struct函数实现的。
/* Insert vm structure into process list sorted by address
* and into the inode's i_mmap tree. If vm_file is non-NULL
* then i_mmap_rwsem is taken here.
*/
/* Insert vm structure into process list sorted by address
 * and into the inode's i_mmap tree. If vm_file is non-NULL
 * then i_mmap_rwsem is taken here.
 */
int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
{
	struct vm_area_struct *prev_vma;
	struct rb_node **link;
	struct rb_node *parent;

	/* Locate the insertion point; fail if the range overlaps a VMA. */
	if (find_vma_links(mm, vma->vm_start, vma->vm_end,
			   &prev_vma, &link, &parent))
		return -ENOMEM;

	/* Accounted mappings must be charged against the commit limit. */
	if ((vma->vm_flags & VM_ACCOUNT) &&
	    security_vm_enough_memory_mm(mm, vma_pages(vma)))
		return -ENOMEM;

	/*
	 * The vm_pgoff of a purely anonymous vma should be irrelevant
	 * until its first write fault, when page's anon_vma and index
	 * are set. But now set the vm_pgoff it will almost certainly
	 * end up with (unless mremap moves it elsewhere before that
	 * first wfault), so /proc/pid/maps tells a consistent story.
	 *
	 * By setting it to reflect the virtual start address of the
	 * vma, merges and splits can happen in a seamless way, just
	 * using the existing file pgoff checks and manipulations.
	 * Similarly in do_mmap and in do_brk_flags.
	 */
	if (vma_is_anonymous(vma)) {
		BUG_ON(vma->anon_vma);
		vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
	}

	vma_link(mm, vma, prev_vma, link, parent);
	return 0;
}
- find_vma_links主要工作是从小地址开始查找一个足够容纳新vma结构的地址空间,如果发现现存的vma和新的地址重合则返回错误,否则返回新地址空间前一个vma的链表信息和红黑树信息。
- 设置了VM_ACCOUNT标志后调用security_vm_enough_memory_mm函数查看进程是否有足够的内存可用,大多数用户空间映射都设置了VM_ACCOUNT标志。
- 如果是匿名映射将vm_pgoff指向区域首地址从0开始的偏移页
- vma_link将新VMA插入链表和红黑树,如果是文件映射则还要将VMA加入到address_space中的i_mmap区间树(interval tree)中。
合并VMA
在新的 VMA 被加入到进程的地址空间时,内核会检查它是否可以与一个或多个现存的 VMA 进行合并。vma_merge()函数实现将一个新的 VMA 和附近的 VMA 合并功能。合并要满足很多条件,因此代码里做了大量的校验工作。
/*
 * Try to expand an existing VMA (prev and/or the one following it) to
 * absorb the new range [addr, end) instead of creating a fresh VMA.
 * Returns the resulting merged VMA, or NULL if no merge was possible
 * (the caller then inserts a new vm_area_struct).
 * "cases 1..8" in the comments refer to the merge-case diagram that
 * accompanies this function in mm/mmap.c.
 */
struct vm_area_struct *vma_merge(struct mm_struct *mm,
struct vm_area_struct *prev, unsigned long addr,
unsigned long end, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file,
pgoff_t pgoff, struct mempolicy *policy,
struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
struct anon_vma_name *anon_name)
{
pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
struct vm_area_struct *area, *next;
int err;
/*
 * We later require that vma->vm_flags == vm_flags,
 * so this tests vma->vm_flags & VM_SPECIAL, too.
 */
if (vm_flags & VM_SPECIAL)
return NULL;
/* 'next' is the first VMA after prev; 'area' tracks the merge result. */
next = vma_next(mm, prev);
area = next;
if (area && area->vm_end == end) /* cases 6, 7, 8 */
next = next->vm_next;
/* verify some invariant that must be enforced by the caller */
VM_WARN_ON(prev && addr <= prev->vm_start);
VM_WARN_ON(area && end > area->vm_end);
VM_WARN_ON(addr >= end);
/*
 * Can it merge with the predecessor?
 */
if (prev && prev->vm_end == addr &&
mpol_equal(vma_policy(prev), policy) &&
can_vma_merge_after(prev, vm_flags,
anon_vma, file, pgoff,
vm_userfaultfd_ctx, anon_name)) {
/*
 * OK, it can. Can we now merge in the successor as well?
 */
if (next && end == next->vm_start &&
mpol_equal(policy, vma_policy(next)) &&
can_vma_merge_before(next, vm_flags,
anon_vma, file,
pgoff+pglen,
vm_userfaultfd_ctx, anon_name) &&
is_mergeable_anon_vma(prev->anon_vma,
next->anon_vma, NULL)) {
/* cases 1, 6 */
/* prev swallows both the new range and next */
err = __vma_adjust(prev, prev->vm_start,
next->vm_end, prev->vm_pgoff, NULL,
prev);
} else /* cases 2, 5, 7 */
/* prev is only extended forward to 'end' */
err = __vma_adjust(prev, prev->vm_start,
end, prev->vm_pgoff, NULL, prev);
if (err)
return NULL;
khugepaged_enter_vma_merge(prev, vm_flags);
return prev;
}
/*
 * Can this new request be merged in front of next?
 */
if (next && end == next->vm_start &&
mpol_equal(policy, vma_policy(next)) &&
can_vma_merge_before(next, vm_flags,
anon_vma, file, pgoff+pglen,
vm_userfaultfd_ctx, anon_name)) {
if (prev && addr < prev->vm_end) /* case 4 */
/* shrink prev back to 'addr'; next absorbs the rest */
err = __vma_adjust(prev, prev->vm_start,
addr, prev->vm_pgoff, NULL, next);
else { /* cases 3, 8 */
/* extend next backwards to cover [addr, ...) */
err = __vma_adjust(area, addr, next->vm_end,
next->vm_pgoff - pglen, NULL, next);
/*
 * In case 3 area is already equal to next and
 * this is a noop, but in case 8 "area" has
 * been removed and next was expanded over it.
 */
area = next;
}
if (err)
return NULL;
khugepaged_enter_vma_merge(area, vm_flags);
return area;
}
/* No merge was possible: caller must insert a brand-new VMA. */
return NULL;
}
本文中部分图片参考来自互联网,在此表示感谢!