在前面我们讲解了kmalloc申请连续物理内存的操作,以及其原理和基础cache。在内核中还有另外一个接口函数,那就是vmalloc:它申请一片连续的虚拟地址空间,但不保证物理空间连续。实际上我们会想到用户空间的malloc,malloc是标准glibc封装的一个函数,最终通过系统调用brk和mmap来实现,以后再分析它的实现过程。它同样是申请连续的虚拟空间,但不保证物理内存的连续,当然用户程序也不怎么关心这个问题;之所以会关心物理内存的连续性,一般是由于设备驱动的使用,或者DMA。但是vmalloc申请效率比较低,还会造成TLB抖动,所以内核里一般常用kmalloc,除非有特殊需求,比如要获取大块内存时。一个实例就是当ko模块加载到内核运行时,即需要vmalloc。
释放函数:vfree
参考内核 3.8.13
这里是说32位的处理器,即最大寻址4G虚拟空间,(当然现在已经64位比较普及了,后续补上吧)而虚拟地址到物理地址的转化往往需要硬件的支持才能提高效率,即MMU。
当然前提是OS先建立好页表(PT)。在Linux内核中,这4G空间并不是完全给用户空间使用:从高端的0xC0000000(3G)开始的部分留给内核空间使用(x86默认配置下,0-16M为DMA区,16M-896M为Normal区,896M-1G这128M作为高端内存分配区域;当然这个区域也是可以配置的)。
kmalloc函数返回的是虚拟地址(线性地址). kmalloc特殊之处在于它分配的内存是物理上连续的,这对于要进行DMA的设备十分重要. 而用vmalloc分配的内存只是线性地址连续,物理地址不一定连续,不能直接用于DMA。我们可以参考一个图:(它是arm 32架构的内核虚拟地址分配图)
下面我们就看看vmalloc函数:(mm/vmalloc.c)
- /* vmalloc - allocate a virtually contiguous region of kernel memory. */
- /* @size: allocation size in bytes. */
- /* Delegates to __vmalloc_node_flags() with node = -1, which makes */
- /* __vmalloc_area_node() use alloc_page() instead of alloc_pages_node(), */
- /* and with the gfp flags GFP_KERNEL | __GFP_HIGHMEM. */
- /* Returns the pointer handed back by __vmalloc_node_flags(). */
-
-
-
- void *vmalloc(unsigned long size)
- {
- return __vmalloc_node_flags(size, -1, GFP_KERNEL | __GFP_HIGHMEM);
- }
这里我们只用关注size即可,而vmalloc优先从高端内存分配,并且可以睡眠.
继续:
- static inline void *__vmalloc_node_flags(unsigned long size, /* thin wrapper: forwards size, node and flags to __vmalloc_node() */
- int node, gfp_t flags)
- {
- return __vmalloc_node(size, 1, flags, PAGE_KERNEL, /* align = 1, protection = PAGE_KERNEL */
- node, __builtin_return_address(0)); /* the return address is passed on as the caller tag */
- }
重点看一下__vmalloc_node:
- /* __vmalloc_node - allocate virtually contiguous memory inside the vmalloc window. */
- /* @size:     allocation size */
- /* @align:    desired alignment */
- /* @gfp_mask: flags passed on for the page allocation */
- /* @prot:     page protection passed on for the mapping */
- /* @node:     node to allocate on, or -1 */
- /* @caller:   caller's return address (later stored in vm_struct.caller) */
-
- /* Thin wrapper: calls __vmalloc_node_range() with the address window fixed */
- /* to VMALLOC_START..VMALLOC_END and returns its result unchanged. */
-
-
-
- static void *__vmalloc_node(unsigned long size, unsigned long align,
- gfp_t gfp_mask, pgprot_t prot,
- int node, const void *caller)
- {
- return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
- gfp_mask, prot, node, caller);
- }
因为这里提到了VMALLOC_START和VMALLOC_END它们究竟是什么值呢?
这里看了arm32和mips32的(根据架构虚拟地址分配不同而不同,比如mips就比较特殊):
在arch/mips/include/asm/pgtable-32.h中
首先看mips虚拟地址分布图:
从这个图里我们知道用户空间为2G(0x0-0x7fff ffff),dma或者normal内存映射在kseg0(512M)/kseg1,而对于vmalloc申请的虚拟地址在kseg2中,当然还有其他一些特殊的映射比如io等.
- #define VMALLOC_START MAP_BASE /* vmalloc window begins at MAP_BASE (defined outside this excerpt) */
- /* PKMAP_BASE is the upper reference point used below when CONFIG_HIGHMEM is set. */
- #define PKMAP_BASE (0xfe000000UL)
- /* The window ends two pages below PKMAP_BASE (highmem) or below FIXADDR_START (no highmem). */
- #ifdef CONFIG_HIGHMEM
- # define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE)
- #else
- # define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE)
- #endif
在arch/arm/include/asm/pgtable.h
- /* VMALLOC_OFFSET is 8 MiB; it is used both as the gap size and as the alignment. */
- /* VMALLOC_START = (high_memory + 8 MiB) rounded down to an 8 MiB boundary, */
- /* so the vmalloc window begins above high_memory, at most 8 MiB past it. */
- /* NOTE(review): the gap presumably exists to catch out-of-bounds accesses */
- /* running past the directly mapped memory - confirm against upstream. */
- /* VMALLOC_END is a fixed address in this configuration. */
-
-
- #define VMALLOC_OFFSET (8*1024*1024)
- #define VMALLOC_START (((unsigned long)high_memory + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1))
- #define VMALLOC_END 0xff000000UL
再看一个图:
我们知道物理内存简单分为三个区域:ZONE_NORMAL、ZONE_DMA、ZONE_HIGHMEM
vmalloc我们看到它默认从ZONE_HIGHMEM里申请物理页;但是kmalloc和vmalloc这两个函数返回的地址性质是保持一致的,即都是占用4G地址空间中内核部分的虚拟地址。通过上面的图,我们确定了虚拟地址从哪里分配,以及对应的物理空间从哪里分配。
下面看看 vmalloc核心实现:
- /* __vmalloc_node_range - allocate virtually contiguous memory between start and end. */
- /* @size:     allocation size (rounded up to a page multiple internally) */
- /* @align:    desired alignment */
- /* @start:    lower bound of the virtual address window */
- /* @end:      upper bound of the virtual address window */
- /* @gfp_mask: flags for the page allocation */
- /* @prot:     protection used when mapping the pages */
- /* @node:     node to allocate on, or -1 */
- /* @caller:   caller's return address */
-
- /* Two steps: reserve a virtual range (__get_vm_area_node), then allocate */
- /* pages and map them into it (__vmalloc_area_node). */
- /* Returns the mapped address, or NULL on failure. */
-
-
- void *__vmalloc_node_range(unsigned long size, unsigned long align,
- unsigned long start, unsigned long end, gfp_t gfp_mask,
- pgprot_t prot, int node, const void *caller)
- {
- struct vm_struct *area;
- void *addr;
- unsigned long real_size = size; /* keep the unrounded size for reporting */
-
- size = PAGE_ALIGN(size);
- if (!size || (size >> PAGE_SHIFT) > totalram_pages) /* reject zero and requests of more pages than totalram_pages */
- goto fail;
-
- area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNLIST,
-
- start, end, node, gfp_mask, caller);
- if (!area)
- goto fail;
-
- addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller);
- /* No goto fail here: __vmalloc_area_node() releases the area on its own error paths. */
- if (!addr)
- return NULL;
-
- /* VM_UNLIST above makes __get_vm_area_node() skip insert_vmalloc_vm(); */
- /* the vm_struct is put on vmlist only now, after pages/nr_pages are set. */
-
-
- insert_vmalloc_vmlist(area);
-
- /* Register the block with kmemleak using the original (unrounded) size. */
- /* NOTE(review): the third argument 3 presumably accounts for references */
- /* held by vm_struct and vmap_area - confirm against the kmemleak API. */
-
-
- kmemleak_alloc(addr, real_size, 3, gfp_mask);
-
- return addr;
-
- fail:
- warn_alloc_failed(gfp_mask, 0,
- "vmalloc: allocation failure: %lu bytes\n",
- real_size);
- return NULL;
- }
它的基本实现思路很简单:
1. 分配虚拟地址空间
2.对虚拟地址空间进行页表映射
需要熟知 下面两个结构体:
struct vmap_area
- struct vmap_area { /* describes one reserved range of kernel virtual addresses */
- unsigned long va_start; /* first address of the range (set from addr in alloc_vmap_area) */
- unsigned long va_end; /* end of the range: va_start + size */
- unsigned long flags; /* 0 for fresh allocations; VM_VM_AREA for entries imported by vmalloc_init */
- struct rb_node rb_node; /* node in the tree rooted at vmap_area_root */
- struct list_head list; /* link in vmap_area_list, followed by alloc_vmap_area to reach the next area */
- struct list_head purge_list; /* not referenced in the code shown here; presumably used by the lazy purge path */
- struct vm_struct *vm; /* associated vm_struct (assigned in vmalloc_init) */
- struct rcu_head rcu_head; /* not referenced in the code shown here */
- };
vm_struct *area :
- struct vm_struct { /* per-allocation descriptor chained on vmlist */
- struct vm_struct *next; /* next entry on the vmlist chain */
- void *addr; /* start address of the area; __vmalloc_area_node returns this value */
- unsigned long size; /* length in bytes; __vmalloc_area_node subtracts one PAGE_SIZE when counting pages */
- unsigned long flags; /* VM_* flags, e.g. VM_VPAGES is OR-ed in by __vmalloc_area_node */
- struct page **pages; /* array of pointers to the backing pages */
- unsigned int nr_pages; /* number of entries in pages */
- phys_addr_t phys_addr; /* not referenced in the code shown here */
- const void *caller; /* return address of the requester */
- };
这里再说明一下vmalloc_init的初始化.
- /* mm_init - runs the memory-management initialisers in a fixed order. */
- /* vmalloc_init() is called last, after kmem_cache_init(); vmalloc_init() */
- /* itself calls kzalloc(), so it presumably relies on the earlier steps. */
- static void __init mm_init(void)
- {
-
-
-
-
- page_cgroup_init_flatmem();
- mem_init();
- kmem_cache_init();
- percpu_init_late();
- pgtable_cache_init();
- vmalloc_init();
- }
其实在讲slab机制的时候已经说过。
- void __init vmalloc_init(void) /* one-time setup of the vmap bookkeeping */
- {
- struct vmap_area *va;
- struct vm_struct *tmp;
- int i;
- /* Initialise the lock and free list of every possible CPU's vmap_block_queue. */
- for_each_possible_cpu(i) {
- struct vmap_block_queue *vbq;
-
- vbq = &per_cpu(vmap_block_queue, i);
- spin_lock_init(&vbq->lock);
- INIT_LIST_HEAD(&vbq->free);
- }
-
- /* Import vm_structs already on vmlist: build a vmap_area for each and insert it. */
- for (tmp = vmlist; tmp; tmp = tmp->next) {
- /* NOTE(review): the kzalloc() result is dereferenced without a NULL check. */
- va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
- va->flags = VM_VM_AREA;
- va->va_start = (unsigned long)tmp->addr;
- va->va_end = va->va_start + tmp->size;
- va->vm = tmp;
- __insert_vmap_area(va);
- }
-
- vmap_area_pcpu_hole = VMALLOC_END;
-
- vmap_initialized = true; /* set once all of the above is done */
- }
下面就说说__get_vm_area_node函数:
- static struct vm_struct *__get_vm_area_node(unsigned long size, /* allocates a vm_struct and reserves a virtual range for it; returns NULL on failure */
- unsigned long align, unsigned long flags, unsigned long start,
- unsigned long end, int node, gfp_t gfp_mask, const void *caller)
- {
- struct vmap_area *va;
- struct vm_struct *area;
-
- BUG_ON(in_interrupt());
- if (flags & VM_IOREMAP) {
- int bit = fls(size);
- /* For VM_IOREMAP the alignment is derived from fls(size), clamped to PAGE_SHIFT..IOREMAP_MAX_ORDER bits. */
- if (bit > IOREMAP_MAX_ORDER)
- bit = IOREMAP_MAX_ORDER;
- else if (bit < PAGE_SHIFT)
- bit = PAGE_SHIFT;
-
- align = 1ul << bit;
- }
-
- size = PAGE_ALIGN(size);
- if (unlikely(!size))
- return NULL;
-
- area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
- if (unlikely(!area))
- return NULL;
-
- /* Reserve one page more than requested. __vmalloc_area_node() subtracts */
- /* PAGE_SIZE again when counting pages to back, so this extra page gets */
- /* no backing page there (presumably a guard page - confirm upstream). */
- size += PAGE_SIZE;
-
-
- va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
-
- if (IS_ERR(va)) {
- kfree(area);
- return NULL;
- }
-
- /* With VM_UNLIST the vm_struct is only set up here and not inserted; */
- /* __vmalloc_node_range() passes VM_UNLIST and calls */
- /* insert_vmalloc_vmlist() itself after __vmalloc_area_node() has */
- /* filled in pages and nr_pages. */
- /* Without VM_UNLIST, insert_vmalloc_vm() is called right away. */
-
-
-
- if (flags & VM_UNLIST)
- setup_vmalloc_vm(area, va, flags, caller);
- else
- insert_vmalloc_vm(area, va, flags, caller);
-
- return area;
- }
这个函数的核心就是alloc_vmap_area,这个很有趣:之前我们讲到了vmalloc申请的虚拟地址范围,而vmalloc的调用者只传递了size而已,地址范围则由VMALLOC_START和VMALLOC_END决定,因此对于mips、x86、arm会有不同的虚拟空间.
- /* alloc_vmap_area - reserve a range of size bytes, aligned to align, */
- /* between vstart and vend, and record it in a newly allocated vmap_area. */
- /* Returns the vmap_area, ERR_PTR(-ENOMEM) if the descriptor cannot be */
- /* allocated, or ERR_PTR(-EBUSY) if no range fits even after one purge. */
- static struct vmap_area *alloc_vmap_area(unsigned long size,
- unsigned long align,
- unsigned long vstart, unsigned long vend,
- int node, gfp_t gfp_mask)
- {
- struct vmap_area *va;
- struct rb_node *n;
- unsigned long addr;
- int purged = 0;
- struct vmap_area *first;
-
- BUG_ON(!size);
- BUG_ON(size & ~PAGE_MASK);
- BUG_ON(!is_power_of_2(align));
-
- va = kmalloc_node(sizeof(struct vmap_area),
- gfp_mask & GFP_RECLAIM_MASK, node);
- if (unlikely(!va))
- return ERR_PTR(-ENOMEM);
-
- retry:
- spin_lock(&vmap_area_lock);
- /* free_vmap_cache holds the rb_node of the most recent insertion, so a */
- /* search can resume right after that area instead of walking the tree. */
- /* The cache is dropped when it is empty, or when this request is */
- /* smaller than cached_hole_size, starts below cached_vstart, or has a */
- /* smaller alignment than cached_align. */
- /* NOTE(review): presumably because such a request might fit in a hole */
- /* that lies before the cached position - confirm against upstream. */
-
-
- if (!free_vmap_cache ||
-
-
- size < cached_hole_size ||
- vstart < cached_vstart ||
- align < cached_align) {
- nocache: /* also reached by goto from the cached path below */
- cached_hole_size = 0;
- free_vmap_cache = NULL;
- }
-
- cached_vstart = vstart;
- cached_align = align;
-
- /* Choose the starting address and the first existing area to compare against. */
- if (free_vmap_cache) {
- /* Cached path: start right after the cached area. */
- first = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
- addr = ALIGN(first->va_end, align);
- if (addr < vstart)
- goto nocache;
- if (addr + size - 1 < addr) /* address arithmetic wrapped around */
- goto overflow;
-
- } else {
- addr = ALIGN(vstart, align);
- if (addr + size - 1 < addr)
- goto overflow;
- /* Uncached path: descend the tree, remembering in first the last area seen whose va_end >= addr. */
- n = vmap_area_root.rb_node;
-
- first = NULL;
-
- while (n) {
-
- struct vmap_area *tmp;
- tmp = rb_entry(n, struct vmap_area, rb_node);
-
- if (tmp->va_end >= addr) {
- first = tmp;
- if (tmp->va_start <= addr)
- break;
- n = n->rb_left;
- } else
- n = n->rb_right;
- }
-
- if (!first) /* no existing area ends at or above addr */
- goto found;
- }
-
- /* While the candidate overlaps first, move addr past it and step to the next list entry. */
- while (addr + size > first->va_start && addr + size <= vend) {
- /* Track the largest gap that was skipped over. */
- if (addr + cached_hole_size < first->va_start)
- cached_hole_size = first->va_start - addr;
- addr = ALIGN(first->va_end, align);
- if (addr + size - 1 < addr)
- goto overflow;
-
- if (list_is_last(&first->list, &vmap_area_list))
-
- goto found;
-
- first = list_entry(first->list.next,
- struct vmap_area, list);
- }
-
- found:
- if (addr + size > vend)
- goto overflow;
-
- va->va_start = addr;
- va->va_end = addr + size;
- va->flags = 0;
- __insert_vmap_area(va);
- /* Remember the new node as the starting hint for the next allocation. */
- free_vmap_cache = &va->rb_node;
-
- spin_unlock(&vmap_area_lock);
-
- BUG_ON(va->va_start & (align-1));
- BUG_ON(va->va_start < vstart);
- BUG_ON(va->va_end > vend);
-
- return va;
- /* No room: call purge_vmap_area_lazy() once and retry, then warn and give up. */
- overflow:
- spin_unlock(&vmap_area_lock);
- if (!purged) {
- purge_vmap_area_lazy();
- purged = 1;
- goto retry;
- }
- if (printk_ratelimit())
- printk(KERN_WARNING
- "vmap allocation for size %lu failed: "
- "use vmalloc= to increase size.\n", size);
- kfree(va);
- return ERR_PTR(-EBUSY);
- }
既然我们已经开辟了虚拟地址空间,那么还需要做的当然是和页面一一映射起来.
看函数__vmalloc_area_node:
- static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, /* allocates the backing pages for area and maps them; returns area->addr or NULL */
- pgprot_t prot, int node, const void *caller)
- {
- const int order = 0;
- struct page **pages;
- unsigned int nr_pages, array_size, i;
- gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
- /* One PAGE_SIZE of area->size is excluded from the page count (see size += PAGE_SIZE in __get_vm_area_node). */
- nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT;
- array_size = (nr_pages * sizeof(struct page *));
-
- area->nr_pages = nr_pages;
-
- if (array_size > PAGE_SIZE) {
- /* Pointer array larger than one page: obtain it through __vmalloc_node() and mark that with VM_VPAGES. */
- pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM,
- PAGE_KERNEL, node, caller);
- area->flags |= VM_VPAGES;
- } else {
- pages = kmalloc_node(array_size, nested_gfp, node);
-
- }
- area->pages = pages;
- area->caller = caller;
- if (!area->pages) {
- remove_vm_area(area->addr);
- kfree(area);
- return NULL;
- }
- /* Allocate the pages one at a time (order 0). */
- for (i = 0; i < area->nr_pages; i++) {
-
- struct page *page;
- gfp_t tmp_mask = gfp_mask | __GFP_NOWARN;
-
- if (node < 0)
- page = alloc_page(tmp_mask);
- else
- page = alloc_pages_node(node, tmp_mask, order);
-
- if (unlikely(!page)) {
- /* Shrink nr_pages to the number actually allocated before bailing out. */
- area->nr_pages = i;
- goto fail;
- }
- area->pages[i] = page;
- }
-
- if (map_vm_area(area, prot, &pages))
-
- goto fail;
- return area->addr;
- /* Failure: report how much was allocated, then release everything via vfree(). */
- fail:
- warn_alloc_failed(gfp_mask, order,
- "vmalloc: allocation failure, allocated %ld of %ld bytes\n",
- (area->nr_pages*PAGE_SIZE), area->size);
- vfree(area->addr);
- return NULL;
- }
而insert_vmalloc_vmlist很明显把vm_struct插入到vmlist。
那么就完成了整个过程,没有想象的复杂,当然对内存有了更多的认识,这里还需要说一下,一般情况下有高端内存会比没有的好些,防止了vmalloc申请的时候造成的TLB抖动等问题,更少的破坏normal空间。
可以通过proc来查看vmalloc的一些信息:
- cat /proc/vmallocinfo
- 0xc0002000-0xc0045000 274432 jffs2_zlib_init+0x24/0xa4 pages=66 vmalloc
- 0xc0045000-0xc0051000 49152 jffs2_zlib_init+0x40/0xa4 pages=11 vmalloc
- 0xc0051000-0xc0053000 8192 brcmnand_create_cet+0x244/0x788 pages=1 vmalloc
- 0xc0053000-0xc0055000 8192 ebt_register_table+0x98/0x39c pages=1 vmalloc
还有:
- # cat /proc/vmstat
- #cat /proc/meminfo
(linuxDOS) |