Purpose: vmalloc is used when the kernel needs a region that is contiguous in virtual address space; the backing physical pages do not have to be contiguous.
void *vmalloc(unsigned long size)
{
    return __vmalloc_node_flags(size, NUMA_NO_NODE,
                                GFP_KERNEL);
}
Call chain: **__vmalloc_node_flags => __vmalloc_node =>
__vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, gfp_mask, prot, 0, node, caller)**
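As a reminder of the API itself, here is a minimal, hypothetical kernel-module usage sketch (example_init/example_exit and buf are made-up names; vmalloc/vfree are the real interfaces):

#include <linux/vmalloc.h>
#include <linux/string.h>
#include <linux/sizes.h>

static void *buf;

static int example_init(void)
{
    buf = vmalloc(SZ_1M);      /* 1 MB, virtually contiguous */
    if (!buf)
        return -ENOMEM;        /* vmalloc returns NULL on failure */
    memset(buf, 0, SZ_1M);     /* usable like any kernel pointer */
    return 0;
}

static void example_exit(void)
{
    vfree(buf);                /* unmaps the range and frees the backing pages */
}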
The vmalloc address range
It starts where the kernel module region ends:
/kernel/msm-4.19/arch/arm64/include/asm/pgtable.h
#define VMALLOC_START (MODULES_END)
#define VMALLOC_END (PAGE_OFFSET - PUD_SIZE - VMEMMAP_SIZE - SZ_64K)
On ARM64 the region runs from 0xFFFF000010000000 to 0xFFFF7DFFBFFF0000, about 129022 GB (~126 TB) in total.
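That figure is easy to verify with a standalone userspace computation (the two constants are the boundary values quoted above):

#include <stdio.h>

int main(void)
{
    unsigned long long start = 0xFFFF000010000000ULL; /* VMALLOC_START */
    unsigned long long end   = 0xFFFF7DFFBFFF0000ULL; /* VMALLOC_END */

    /* prints: size = 138537122856960 bytes = 129022 GB */
    printf("size = %llu bytes = %llu GB\n",
           end - start, (end - start) >> 30);
    return 0;
}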
1. The core entry point: __vmalloc_node_range
/**
 * __vmalloc_node_range - allocate virtually contiguous memory
 * @size: allocation size
 * @align: desired alignment
 * @start: vm area range start (beginning of the vmalloc region)
 * @end: vm area range end (end of the vmalloc region)
 * @gfp_mask: flags for the page level allocator, e.g. GFP_KERNEL
 * @prot: protection mask for the allocated pages, e.g. PAGE_KERNEL
 * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD)
 * @node: node to use for allocation or NUMA_NO_NODE
 * @caller: caller's return address
 *
 * Allocate enough pages to cover @size from the page level
 * allocator with @gfp_mask flags. Map them into contiguous
 * kernel virtual space, using a pagetable protection of @prot.
 */
void *__vmalloc_node_range(unsigned long size, unsigned long align,
            unsigned long start, unsigned long end, gfp_t gfp_mask,
            pgprot_t prot, unsigned long vm_flags, int node,
            const void *caller)
{
    struct vm_struct *area;            /* describes the vmalloc area */
    void *addr;
    unsigned long real_size = size;    /* keep the original size for the error message */

    size = PAGE_ALIGN(size);           /* round up to a page; even a 10-byte request takes one full page */
    if (!size || (size >> PAGE_SHIFT) > totalram_pages)  /* zero, or larger than all of RAM: fail */
        goto fail;

    /* find a free hole in [start, end) and set up a vm_struct for it */
    area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
                vm_flags, start, end, node, gfp_mask, caller);
    if (!area)
        goto fail;

    /* allocate the physical pages and map them into the hole */
    addr = __vmalloc_area_node(area, gfp_mask, prot, node);
    if (!addr)
        return NULL;

    /*
     * First make sure the mappings are removed from all page-tables
     * before they are freed.
     */
    vmalloc_sync_unmappings();

    /*
     * In this function, newly allocated vm_struct has VM_UNINITIALIZED
     * flag. It means that vm_struct is not fully initialized.
     * Now, it is fully initialized, so remove this flag here.
     */
    clear_vm_uninitialized_flag(area);

    kmemleak_vmalloc(area, size, gfp_mask);

    return addr;

fail:
    warn_alloc(gfp_mask, NULL,
            "vmalloc: allocation failure: %lu bytes", real_size);
    return NULL;
}
2. Allocating a vm_struct: __get_vm_area_node
static struct vm_struct *__get_vm_area_node(unsigned long size,
        unsigned long align, unsigned long flags, unsigned long start,
        unsigned long end, int node, gfp_t gfp_mask, const void *caller)
{
    struct vmap_area *va;
    struct vm_struct *area;    /* the vm_struct that will describe this vmalloc area */

    /* this path can sleep, so it must never run in interrupt context */
    BUG_ON(in_interrupt());
    size = PAGE_ALIGN(size);   /* align again: callers other than vmalloc() also reach here */
    if (unlikely(!size))
        return NULL;

    /* for ioremap areas, raise the alignment to the size's order,
     * clamped to [PAGE_SHIFT, IOREMAP_MAX_ORDER] */
    if (flags & VM_IOREMAP)
        align = 1ul << clamp_t(int, get_count_order_long(size),
                    PAGE_SHIFT, IOREMAP_MAX_ORDER);

    /* allocate the descriptor itself, zero-filled */
    area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
    if (unlikely(!area))
        return NULL;

    /* unless VM_NO_GUARD is set, add one extra page as a guard page at the end */
    if (!(flags & VM_NO_GUARD))
        size += PAGE_SIZE;

    /* carve a virtual range out of [start, end) */
    va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
    if (IS_ERR(va)) {          /* no range available: free the descriptor and bail out */
        kfree(area);
        return NULL;
    }

    /* fill in the vm_struct from the vmap_area we just got */
    setup_vmalloc_vm(area, va, flags, caller);

    return area;
}
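The guard page added above is taken back out when the number of data pages is computed in step 4. For reference, get_vm_area_size() in include/linux/vmalloc.h is essentially:

static inline size_t get_vm_area_size(const struct vm_struct *area)
{
    /* undo the extra guard page added in __get_vm_area_node() */
    if (!(area->flags & VM_NO_GUARD))
        return area->size - PAGE_SIZE;
    else
        return area->size;
}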
3. alloc_vmap_area: reserving a virtual range
static struct vmap_area *alloc_vmap_area(unsigned long size,
                unsigned long align,
                unsigned long vstart, unsigned long vend,
                int node, gfp_t gfp_mask)
{
    struct vmap_area *va;    /* describes one vmalloc virtual range */
    unsigned long addr;
    int purged = 0;

    might_sleep();

    /* allocate the vmap_area descriptor from its slab cache */
    va = kmem_cache_alloc_node(vmap_area_cachep,
            gfp_mask & GFP_RECLAIM_MASK, node);
    if (unlikely(!va))
        return ERR_PTR(-ENOMEM);

    /*
     * Only scan the relevant parts containing pointers to other objects
     * to avoid false negatives.
     */
    kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK);

retry:
    /* take the spinlock: only one CPU at a time may carve up the vmalloc space */
    spin_lock(&vmap_area_lock);

    /*
     * If an allocation fails, the "vend" address is
     * returned. Therefore trigger the overflow path.
     * On success this returns the start address of the new range;
     * a return value of vend means the space overflowed.
     */
    addr = __alloc_vmap_area(size, align, vstart, vend, node);
    if (unlikely(addr == vend))
        goto overflow;

    /* record the range: va_start = addr, va_end = addr + size */
    va->va_start = addr;
    va->va_end = addr + size;
    va->flags = 0;    /* zeroed here; the struct definition gives no hint what it is for */

    /* insert the new vmap_area into the rbtree and the sorted list */
    insert_vmap_area(va, &vmap_area_root, &vmap_area_list);

    spin_unlock(&vmap_area_lock);

    return va;

overflow:    /* out of address space */
    spin_unlock(&vmap_area_lock);
    if (!purged) {    /* first failure: flush lazily freed areas and retry once */
        purge_vmap_area_lazy();
        purged = 1;
        goto retry;
    }

    if (gfpflags_allow_blocking(gfp_mask)) {
        unsigned long freed = 0;
        blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);
        if (freed > 0) {
            purged = 0;
            goto retry;
        }
    }

    if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
        pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
            size);

    kmem_cache_free(vmap_area_cachep, va);
    return ERR_PTR(-EBUSY);
}
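To make the overflow path concrete, here is a minimal userspace model of the purge-then-retry control flow; try_alloc() and purge_lazy() are hypothetical stand-ins for __alloc_vmap_area() and purge_vmap_area_lazy():

#include <stdbool.h>
#include <stdio.h>

static bool try_alloc(int attempt) { return attempt > 0; } /* fails on the first try */
static void purge_lazy(void)       { puts("purging lazily freed areas"); }

int main(void)
{
    bool purged = false;
    int attempt = 0;

retry:
    if (!try_alloc(attempt++)) {
        if (!purged) {        /* first failure: reclaim lazily freed ranges, retry once */
            purge_lazy();
            purged = true;
            goto retry;
        }
        puts("allocation failed");
        return 1;
    }
    puts("allocation succeeded");
    return 0;
}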
3.1. How does __alloc_vmap_area carve out a hole of the requested size?
It calls va = find_vmap_lowest_match(size, align, vstart); to locate a free block, then trims it:
/*
 * Returns a start address of the newly allocated area, if success.
 * Otherwise a vend is returned that indicates failure.
 */
static __always_inline unsigned long
__alloc_vmap_area(unsigned long size, unsigned long align,
        unsigned long vstart, unsigned long vend, int node)
{
    unsigned long nva_start_addr;    /* start address of the new allocation */
    struct vmap_area *va;            /* the free block we will allocate from */
    enum fit_type type;
    /*
     * How the request sits inside the chosen free block:
     *
     * enum fit_type {
     *     NOTHING_FIT = 0,   -- the request does not fit in the block at all
     *     FL_FIT_TYPE = 1,   -- full fit: the request consumes the whole block
     *     LE_FIT_TYPE = 2,   -- left edge fit: starts at the block's start, ends before its end
     *     RE_FIT_TYPE = 3,   -- right edge fit: starts after the block's start, ends at its end
     *     NE_FIT_TYPE = 4    -- no edge fit: the block is split in two (the common case)
     * };
     */
    int ret;

    /* the heart of the allocator: find the lowest-addressed free block
     * at or above vstart that can hold the request */
    va = find_vmap_lowest_match(size, align, vstart);
    if (unlikely(!va))
        return vend;    /* no fit found: signal failure */

    /* if the free block starts below vstart, the allocation must begin at
     * vstart; otherwise align the block's own start address */
    if (va->va_start > vstart)
        nva_start_addr = ALIGN(va->va_start, align);
    else
        nva_start_addr = ALIGN(vstart, align);

    /* Check the "vend" restriction: the aligned range must still end below vend. */
    if (nva_start_addr + size > vend)
        return vend;

    /* Classify how the request fits inside the free block. */
    type = classify_va_fit_type(va, nva_start_addr, size);
    if (WARN_ON_ONCE(type == NOTHING_FIT))
        return vend;

    /* Shrink, trim or split the free vmap_area according to the fit type. */
    ret = adjust_va_to_fit_type(va, nva_start_addr, size, type);
    if (ret)
        return vend;

    return nva_start_addr;
}
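The fit types are easiest to see in isolation. Below is a userspace re-implementation of the classification for illustration; the layout follows classify_va_fit_type(), but this is a sketch, not the kernel source:

#include <stdio.h>

enum fit_type { NOTHING_FIT, FL_FIT_TYPE, LE_FIT_TYPE, RE_FIT_TYPE, NE_FIT_TYPE };

/* given a free block [va_start, va_end) and a request
 * [nva_start, nva_start + size), decide how the request sits in it */
static enum fit_type classify(unsigned long va_start, unsigned long va_end,
                              unsigned long nva_start, unsigned long size)
{
    if (nva_start < va_start || nva_start + size > va_end)
        return NOTHING_FIT;

    if (va_start == nva_start) {
        if (va_end == nva_start + size)
            return FL_FIT_TYPE;  /* request consumes the whole block */
        return LE_FIT_TYPE;      /* flush with the left edge */
    } else if (va_end == nva_start + size) {
        return RE_FIT_TYPE;      /* flush with the right edge */
    }
    return NE_FIT_TYPE;          /* block must be split in two */
}

int main(void)
{
    /* a 16-page free block at 0x1000, request of 4 pages at 0x3000 */
    printf("%d\n", classify(0x1000, 0x11000, 0x3000, 0x4000)); /* 4 = NE_FIT_TYPE */
    return 0;
}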
3.2. insert_vmap_area: linking the new hole into the rbtree and the sorted list
TODO: to be revisited later; the insertion and rebalancing details are dense.
4. Allocating the physical memory: __vmalloc_area_node
__get_vm_area_node has finally returned a vm_struct; now it can be used to request the actual physical memory.
Question: what is the relationship between vm_struct and vmap_area?
struct vmap_area describes a range of virtual addresses, as its va_start/va_end fields show. It hangs on the red-black tree through rb_node and on the sorted list through list. Its vm field points to a struct vm_struct, which manages the mapping between the virtual addresses and the physical pages; vm_struct entries can in turn be chained through next to maintain several mappings.
struct vm_struct {
    struct vm_struct *next;     /* chain of vm_structs */
    void *addr;                 /* start of the virtual range */
    unsigned long size;         /* size in bytes, including the guard page */
    unsigned long flags;        /* VM_ALLOC, VM_IOREMAP, ... */
    struct page **pages;        /* array of pointers to the backing pages */
    unsigned int nr_pages;      /* number of backing pages */
    phys_addr_t phys_addr;      /* physical address (ioremap only) */
    const void *caller;         /* return address of the caller */
};

struct vmap_area {
    unsigned long va_start;
    unsigned long va_end;
    unsigned long flags;
    struct rb_node rb_node;         /* address sorted rbtree */
    struct list_head list;          /* address sorted list */
    struct llist_node purge_list;   /* "lazy purge" list */
    struct vm_struct *vm;
    struct rcu_head rcu_head;
};
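The link back from an address to its vm_struct goes through that rbtree. As a pointer, find_vm_area() in this tree looks roughly like the sketch below (slightly simplified; it assumes this tree still marks vm_struct-backed areas with the VM_VM_AREA flag). With that relationship clear, on to the physical allocation:

struct vm_struct *find_vm_area(const void *addr)
{
    struct vmap_area *va;

    /* rbtree lookup of the vmap_area covering addr */
    va = find_vmap_area((unsigned long)addr);
    if (va && va->flags & VM_VM_AREA)
        return va->vm;    /* the vm_struct hangs off the vmap_area */

    return NULL;
}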
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
                pgprot_t prot, int node)
{
    struct page **pages;
    unsigned int nr_pages, array_size, i;
    const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
    const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
    /* unless a DMA zone was explicitly requested, prefer highmem for the data pages */
    const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ?
                    0 :
                    __GFP_HIGHMEM;

    /* number of physical pages needed; get_vm_area_size() already subtracts
     * the guard page (unless VM_NO_GUARD), so this counts data pages only */
    nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
    /* one struct page pointer per physical page */
    array_size = (nr_pages * sizeof(struct page *));

    area->nr_pages = nr_pages;
    /* Please note that the recursion is strictly bounded.
     * If the pointer array itself is bigger than one page (more than 512
     * pages, i.e. allocations above 2 MB with 4 KB pages and 8-byte
     * pointers), allocate the array with __vmalloc_node, hence the
     * recursion; otherwise use kmalloc. The nested vmalloc works on a far
     * smaller array, so the recursion bottoms out after one level.
     */
    if (array_size > PAGE_SIZE) {
        pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask,
                PAGE_KERNEL, node, area->caller);
    } else {
        pages = kmalloc_node(array_size, nested_gfp, node);
    }
    area->pages = pages;
    if (!area->pages) {
        remove_vm_area(area->addr);
        kfree(area);
        return NULL;
    }

    /* allocate the physical pages one at a time with alloc_page();
     * they do not need to be physically contiguous */
    for (i = 0; i < area->nr_pages; i++) {
        struct page *page;

        if (node == NUMA_NO_NODE)
            page = alloc_page(alloc_mask|highmem_mask);
        else
            page = alloc_pages_node(node, alloc_mask|highmem_mask, 0);

        if (unlikely(!page)) {
            /* Successfully allocated i pages, free them in __vunmap() */
            area->nr_pages = i;
            /* account the pages we did manage to get in nr_vmalloc_pages */
            atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
            goto fail;
        }
        /* slot i of the pointer array points at the page just allocated */
        area->pages[i] = page;
        if (gfpflags_allow_blocking(gfp_mask|highmem_mask))
            cond_resched();
    }
    atomic_long_add(area->nr_pages, &nr_vmalloc_pages);

    if (map_vm_area(area, prot, pages))
        goto fail;
    return area->addr;

fail:
    warn_alloc(gfp_mask, NULL,
            "vmalloc: allocation failure, allocated %ld of %ld bytes",
            (area->nr_pages*PAGE_SIZE), area->size);
    vfree(area->addr);
    return NULL;
}
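A quick sanity check of the array_size branch above, as a standalone userspace computation (4 KB pages and 64-bit pointers assumed):

#include <stdio.h>

int main(void)
{
    unsigned long page_size = 4096;            /* PAGE_SIZE, 4 KB assumed */
    unsigned long ptr_size  = sizeof(void *);  /* 8 bytes on 64-bit */
    unsigned long threshold = page_size / ptr_size;

    /* prints: array spills past one page above 512 pages (2 MB allocation) */
    printf("array spills past one page above %lu pages (%lu MB allocation)\n",
           threshold, threshold * page_size >> 20);
    return 0;
}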
5. map_vm_area: establishing the page-table mappings
map_vm_area -> vmap_page_range(addr, end, prot, pages);
addr is the start address of the vm_struct area, and end is the end of the hole, i.e. addr + size.

    /* walk the page tables and fill in the corresponding entries */
    ret = vmap_page_range_noflush(start, end, prot, pages);
    flush_cache_vmap(start, end);

The walk then descends level by level through the page-table hierarchy:
vmap_page_range_noflush
    vmap_p4d_range
        vmap_pud_range
            vmap_pmd_range
                vmap_pte_range
/*
 * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and
 * will have pfns corresponding to the "pages" array.
 *
 * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
 */
static int vmap_page_range_noflush(unsigned long start, unsigned long end,
                pgprot_t prot, struct page **pages)
{
    pgd_t *pgd;
    unsigned long next;
    unsigned long addr = start;
    int err = 0;
    int nr = 0;

    BUG_ON(addr >= end);
    pgd = pgd_offset_k(addr);
    do {
        next = pgd_addr_end(addr, end);
        err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr);
        if (err)
            return err;
    } while (pgd++, addr = next, addr != end);

    return nr;
}
static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
        unsigned long end, pgprot_t prot, struct page **pages, int *nr)
{
    p4d_t *p4d;
    unsigned long next;

    p4d = p4d_alloc(&init_mm, pgd, addr);
    if (!p4d)
        return -ENOMEM;
    do {
        next = p4d_addr_end(addr, end);
        if (vmap_pud_range(p4d, addr, next, prot, pages, nr))
            return -ENOMEM;
    } while (p4d++, addr = next, addr != end);
    return 0;
}

static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
        unsigned long end, pgprot_t prot, struct page **pages, int *nr)
{
    pud_t *pud;
    unsigned long next;

    pud = pud_alloc(&init_mm, p4d, addr);
    if (!pud)
        return -ENOMEM;
    do {
        next = pud_addr_end(addr, end);
        if (vmap_pmd_range(pud, addr, next, prot, pages, nr))
            return -ENOMEM;
    } while (pud++, addr = next, addr != end);
    return 0;
}

static int vmap_pmd_range(pud_t *pud, unsigned long addr,
        unsigned long end, pgprot_t prot, struct page **pages, int *nr)
{
    pmd_t *pmd;
    unsigned long next;

    pmd = pmd_alloc(&init_mm, pud, addr);
    if (!pmd)
        return -ENOMEM;
    do {
        next = pmd_addr_end(addr, end);
        if (vmap_pte_range(pmd, addr, next, prot, pages, nr))
            return -ENOMEM;
    } while (pmd++, addr = next, addr != end);
    return 0;
}

static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
        unsigned long end, pgprot_t prot, struct page **pages, int *nr)
{
    pte_t *pte;

    /*
     * nr is a running index into the array which helps higher level
     * callers keep track of where we're up to.
     */
    pte = pte_alloc_kernel(pmd, addr);
    if (!pte)
        return -ENOMEM;
    do {
        struct page *page = pages[*nr];

        if (WARN_ON(!pte_none(*pte)))
            return -EBUSY;
        if (WARN_ON(!page))
            return -ENOMEM;
        set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
        (*nr)++;
    } while (pte++, addr += PAGE_SIZE, addr != end);
    return 0;
}
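The do/while pattern at every level relies on the *_addr_end() helpers, which step to the next table-entry boundary without overshooting end. A userspace model of one level (PMD_SHIFT = 21 as on arm64 with 4 KB pages; the real kernel macro also guards against address wrap-around, which is omitted here):

#include <stdio.h>

#define PMD_SHIFT 21
#define PMD_SIZE  (1UL << PMD_SHIFT)
#define PMD_MASK  (~(PMD_SIZE - 1))

/* end of the current PMD entry's range, capped at 'end' */
static unsigned long pmd_addr_end(unsigned long addr, unsigned long end)
{
    unsigned long boundary = (addr + PMD_SIZE) & PMD_MASK;
    return boundary < end ? boundary : end;
}

int main(void)
{
    unsigned long addr = 0x1ff000, end = 0x600000;

    /* walk [addr, end) in PMD-sized steps, as the vmap_*_range loops do:
     * prints [0x1ff000, 0x200000) [0x200000, 0x400000) [0x400000, 0x600000) */
    do {
        unsigned long next = pmd_addr_end(addr, end);
        printf("[%#lx, %#lx)\n", addr, next);
        addr = next;
    } while (addr != end);
    return 0;
}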