linux内存管理之vmalloc

TAG: 内存管理内存

在前面我们讲解了kmalloc申请连续物理内存的操作，以及它的原理和基础cache。在内核中还有另外一个接口函数，那就是vmalloc：它申请一片连续的虚拟地址空间，但不保证物理空间连续。实际上我们会想到用户空间的malloc，它是标准glibc封装的一个函数，最终通过系统调用brk和mmap来实现，以后再分析它的实现过程。malloc同样是申请连续的虚拟空间，但是不保证物理内存的连续，当然用户程序也不怎么关心这个问题；之所以会关心物理内存的连续性，一般是由于设备驱动的使用，或者DMA。但是vmalloc申请效率比较低，还会造成TLB抖动，所以内核里一般常用kmalloc，除非有特殊需求，比如要获取大块内存时才使用vmalloc。一个实例就是ko模块加载到内核运行时，即需要vmalloc。
释放函数：vfree
参考内核 3.8.13
这里是说32位的处理器，即最大寻址4G虚拟空间，（当然现在已经64位比较普及了，后续补上吧）而虚拟地址到物理地址的转化往往需要硬件的支持才能提高效率，即MMU。
当然前提是需要OS先建立页表（PT）。在linux内核中，这4G空间并不是完全给用户空间使用：从高端0xC0000000（3G）开始的部分留给内核空间使用（x86默认配置：0-16M为DMA，16M-896M为Normal，896M-1G（128M）作为高端内存分配区域；当然这个划分也是可以配置的）。
kmalloc函数返回的是虚拟地址(线性地址). kmalloc特殊之处在于它分配的内存是物理上连续的,这对于要进行DMA的设备十分重要. 而用vmalloc分配的内存只是线性地址连续,物理地址不一定连续,不能直接用于DMA。我们可以参考一个图：（它是arm 32架构的内核虚拟地址分配图）

下面我们就看看vmalloc函数：(mm/vmalloc.c)

/** 
 *    vmalloc - allocate virtually contiguous memory 
 *    @size:        allocation size 
 *    Allocate enough pages to cover @size from the page level 
 *    allocator and map them into contiguous kernel virtual space. 
 * 
 *    For tight control over page level allocator and protection flags 
 *    use __vmalloc() instead. 
 *
 *    Returns whatever __vmalloc_node_flags() returns: the address of the
 *    mapped area, or NULL when the allocation fails.
 */ 
void *vmalloc(unsigned long size) 
{ 
    /*
     * node == -1: no particular node is requested. The gfp mask is
     * GFP_KERNEL | __GFP_HIGHMEM.
     */
    return __vmalloc_node_flags(size, -1, GFP_KERNEL | __GFP_HIGHMEM); 
}

这里我们只用关注size即可，而vmalloc优先从高端内存分配，并且可以睡眠.
继续：

/*
 * Thin wrapper around __vmalloc_node(): forwards @size, @node and @flags,
 * fixing the alignment to 1 and the page protection to PAGE_KERNEL, and
 * passing __builtin_return_address(0) as the caller argument.
 */
static inline void *__vmalloc_node_flags(unsigned long size, 
                    int node, gfp_t flags) 
{ 
    return __vmalloc_node(size, 1, flags, PAGE_KERNEL, 
                    node, __builtin_return_address(0)); 
}

重点看一下__vmalloc_node:

/** 
 *    __vmalloc_node - allocate virtually contiguous memory 
 *    @size:        allocation size 
 *    @align:        desired alignment 
 *    @gfp_mask:    flags for the page level allocator 
 *    @prot:        protection mask for the allocated pages 
 *    @node:        node to use for allocation or -1 
 *    @caller:    caller's return address 
 * 
 *    Allocate enough pages to cover @size from the page level 
 *    allocator with @gfp_mask flags. Map them into contiguous 
 *    kernel virtual space, using a pagetable protection of @prot. 
 *
 *    Returns the result of __vmalloc_node_range().
 */ 
static void *__vmalloc_node(unsigned long size, unsigned long align, 
             gfp_t gfp_mask, pgprot_t prot, 
             int node, const void *caller) 
{ 
    /* Use VMALLOC_START..VMALLOC_END as the address range to search. */
    return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, 
                gfp_mask, prot, node, caller); 
}

因为这里提到了VMALLOC_START和VMALLOC_END，它们究竟是什么值呢？
这里看了arm32和mips32的（根据架构虚拟地址分配不同而不同，比如mips就比较特殊）：
在arch/mips/include/asm/pgtable-32.h中
首先看mips虚拟地址分布图：

从这个图里我们知道用户空间为2G（0x0-0x7fff ffff）,dma或者normal内存映射在kseg0（512M）/kseg1,而对于vmalloc申请的虚拟地址在kseg2中，当然还有其他一些特殊的映射比如io等.

/* The vmalloc range starts at MAP_BASE. */
#define VMALLOC_START MAP_BASE 
 
#define PKMAP_BASE        (0xfe000000UL) 
 
/*
 * The vmalloc range ends two pages below PKMAP_BASE when CONFIG_HIGHMEM
 * is enabled, and two pages below FIXADDR_START otherwise.
 */
#ifdef CONFIG_HIGHMEM 
# define VMALLOC_END    (PKMAP_BASE-2*PAGE_SIZE) 
#else 
# define VMALLOC_END    (FIXADDR_START-2*PAGE_SIZE) 
#endif

在arch/arm/include/asm/pgtable.h

/* 
 * Just any arbitrary offset to the start of the vmalloc VM area: the 
 * current 8MB value just means that there will be a 8MB "hole" after the 
 * physical memory until the kernel virtual memory starts. That means that 
 * any out-of-bounds memory accesses will hopefully be caught. 
 * The vmalloc() routines leaves a hole of 4kB between each vmalloced 
 * area for the same reason. ;) 
 */ 
#define VMALLOC_OFFSET        (8*1024*1024) 
/*
 * VMALLOC_START is high_memory plus VMALLOC_OFFSET, rounded down to a
 * multiple of VMALLOC_OFFSET. VMALLOC_END is a fixed address.
 */
#define VMALLOC_START        (((unsigned long)high_memory + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1)) 
#define VMALLOC_END        0xff000000UL

再看一个图：

我们知道物理内存简单分为三个区域：ZONE_NORMAL、ZONE_DMA、ZONE_HIGHMEM
我们看到vmalloc默认是从ZONE_HIGHMEM里申请物理页面，但是这两个函数（kmalloc和vmalloc）返回的虚拟地址是保持一致的，即都占用了4G地址空间中的内核虚拟地址。通过上面的图，我们确定了虚拟地址从哪里分配，以及对应的物理空间从哪里分配。
下面看看 vmalloc核心实现：

/** 
 *    __vmalloc_node_range - allocate virtually contiguous memory 
 *    @size:        allocation size 
 *    @align:        desired alignment 
 *    @start:        vm area range start 
 *    @end:        vm area range end 
 *    @gfp_mask:    flags for the page level allocator 
 *    @prot:        protection mask for the allocated pages 
 *    @node:        node to use for allocation or -1 
 *    @caller:    caller's return address 
 * 
 *    Allocate enough pages to cover @size from the page level 
 *    allocator with @gfp_mask flags. Map them into contiguous 
 *    kernel virtual space, using a pagetable protection of @prot. 
 *
 *    Returns the address of the mapped area, or NULL on failure.
 */ 
void *__vmalloc_node_range(unsigned long size, unsigned long align, 
            unsigned long start, unsigned long end, gfp_t gfp_mask, 
            pgprot_t prot, int node, const void *caller) 
{ 
    struct vm_struct *area; 
    void *addr; 
    unsigned long real_size = size; 
 
    /* Reject empty requests and requests for more pages than totalram_pages. */
    size = PAGE_ALIGN(size); 
    if (!size || (size >> PAGE_SHIFT) > totalram_pages) 
        goto fail; 
 
    /*
     * Step 1: reserve the virtual address range and associate the
     * vm_struct with its vmap_area (vmap_areas are tracked in a
     * red-black tree).
     */
    area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNLIST,
                 start, end, node, gfp_mask, caller); 
    if (!area) 
        goto fail; 
 
    /*
     * Step 2: work out how many pages are needed, allocate them, then
     * update the page tables to complete the mapping.
     */
    addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller);
    /* No cleanup here: __vmalloc_area_node() releases the area on failure. */
    if (!addr) 
        return NULL; 
 
    /* 
     * In this function, newly allocated vm_struct is not added 
     * to vmlist at __get_vm_area_node(). so, it is added here. 
     */ 
    insert_vmalloc_vmlist(area);     /* add the vm_struct to the global vmlist */
 
    /* 
     * A ref_count = 3 is needed because the vm_struct and vmap_area 
     * structures allocated in the __get_vm_area_node() function contain 
     * references to the virtual address of the vmalloc'ed block. 
     */ 
    kmemleak_alloc(addr, real_size, 3, gfp_mask);    /* memory-leak tracking */
 
    return addr; 
 
fail: 
    warn_alloc_failed(gfp_mask, 0, 
             "vmalloc: allocation failure: %lu bytes\n", 
             real_size); 
    return NULL; 
}

它的基本实现思路很简单：
1. 分配虚拟地址空间
2.对虚拟地址空间进行页表映射

需要熟知下面两个结构体：
struct vmap_area

/*
 * Describes one reserved range of kernel virtual address space.
 */
struct vmap_area { 
    unsigned long va_start;        /* start address of the range */
    unsigned long va_end;        /* va_start + size of the range */
    unsigned long flags;        /* 0 when freshly allocated; VM_VM_AREA for imported vmlist entries */
    struct rb_node rb_node;        /* address sorted rbtree */ 
    struct list_head list;        /* address sorted list */ 
    struct list_head purge_list;    /* "lazy purge" list */ 
    struct vm_struct *vm;        /* the vm_struct associated with this range */
    struct rcu_head rcu_head;    /* NOTE(review): presumably for RCU-deferred freeing - confirm */
};

vm_struct *area ：

/*
 * Describes one vmalloc-style area: its virtual address, its size and the
 * pages that back it.
 */
struct vm_struct { 
    struct vm_struct    *next;        /* next entry in the vmlist chain */
    void            *addr;        /* start virtual address of the area */
    unsigned long        size;        /* size in bytes; __vmalloc_area_node() treats the last PAGE_SIZE as unbacked */
    unsigned long        flags;        /* VM_* flags, e.g. VM_ALLOC, VM_VPAGES */
    struct page        **pages;    /* array of pointers to the backing pages */
    unsigned int        nr_pages;    /* number of entries in pages */
    phys_addr_t        phys_addr;    /* NOTE(review): presumably used for ioremap-style areas - confirm */
    const void        *caller;    /* return address of whoever requested the area */
};

这里再说明一下vmalloc_init的初始化.

/* 
 * Set up kernel memory allocators 
 *
 * vmalloc_init() is called last, after mem_init() and kmem_cache_init().
 * NOTE(review): presumably because vmalloc_init() allocates with kzalloc(),
 * which needs the slab allocator - confirm.
 */ 
static void __init mm_init(void) 
{ 
    /* 
     * page_cgroup requires contiguous pages, 
     * bigger than MAX_ORDER unless SPARSEMEM. 
     */ 
    page_cgroup_init_flatmem(); 
    mem_init(); 
    kmem_cache_init(); 
    percpu_init_late(); 
    pgtable_cache_init(); 
    vmalloc_init(); 
}

其实在讲slab机制的时候已经说过。

/*
 * Boot-time initialisation of the vmalloc subsystem: set up the per-CPU
 * vmap block queues, import any areas already present on vmlist as
 * vmap_areas, and mark the subsystem as initialised.
 */
void __init vmalloc_init(void) 
{ 
    struct vmap_area *va; 
    struct vm_struct *tmp; 
    int i; 
 
    /* Initialise the lock and free list of every possible CPU's queue. */
    for_each_possible_cpu(i) { 
        struct vmap_block_queue *vbq; 
 
        vbq = &per_cpu(vmap_block_queue, i); 
        spin_lock_init(&vbq->lock); 
        INIT_LIST_HEAD(&vbq->free); 
    } 
 
    /* Import existing vmlist entries. */ 
    for (tmp = vmlist; tmp; tmp = tmp->next) { 
        /*
         * NOTE(review): per the article author, vmlist is empty this early
         * in boot, so this loop normally does nothing - confirm per arch.
         * NOTE(review): the kzalloc() result is used without a NULL check.
         */
        va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT); 
        va->flags = VM_VM_AREA; 
        va->va_start = (unsigned long)tmp->addr; 
        va->va_end = va->va_start + tmp->size; 
        va->vm = tmp; 
        __insert_vmap_area(va); 
    } 
 
    vmap_area_pcpu_hole = VMALLOC_END; 
 
    vmap_initialized = true; 
}

下面就说说__get_vm_area_node函数：

/*
 * Allocate a vm_struct and reserve a virtual address range of @size bytes
 * (plus one guard page) within [@start, @end) for it.
 *
 * Returns the new vm_struct, or NULL if @size rounds to zero or either
 * allocation fails. Must not be called from interrupt context (BUG_ON).
 */
static struct vm_struct *__get_vm_area_node(unsigned long size, 
        unsigned long align, unsigned long flags, unsigned long start, 
        unsigned long end, int node, gfp_t gfp_mask, const void *caller) 
{ 
    struct vmap_area *va; 
    struct vm_struct *area; 
 
    BUG_ON(in_interrupt()); 
    if (flags & VM_IOREMAP) { /* ioremap flag: the mapping is for device memory */
        /*
         * Derive the alignment from the size: 1 << fls(size), with the
         * shift clamped to [PAGE_SHIFT, IOREMAP_MAX_ORDER].
         */
        int bit = fls(size); 
 
        if (bit > IOREMAP_MAX_ORDER) 
            bit = IOREMAP_MAX_ORDER; 
        else if (bit < PAGE_SHIFT) 
            bit = PAGE_SHIFT; 
 
        align = 1ul << bit; 
    } 
 
    size = PAGE_ALIGN(size); 
    if (unlikely(!size)) 
        return NULL; 
 
    area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); 
    if (unlikely(!area)) 
        return NULL; 
 
    /* 
     * We always allocate a guard page. 
     *
     * The extra page guards against out-of-bounds accesses: it is never
     * mapped, so touching it raises a fault.
     */ 
    size += PAGE_SIZE;
 
    /* Reserve the virtual address range (vmap_area). */
    va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
    if (IS_ERR(va)) { 
        kfree(area); 
        return NULL; 
    } 
 
    /* 
     * When this function is called from __vmalloc_node_range, 
     * we do not add vm_struct to vmlist here to avoid 
     * accessing uninitialized members of vm_struct such as 
     * pages and nr_pages fields. They will be set later. 
     * To distinguish it from others, we use a VM_UNLIST flag. 
     */ 
    if (flags & VM_UNLIST)   /* always taken on the vmalloc() path, which passes VM_UNLIST */
        setup_vmalloc_vm(area, va, flags, caller);  /* associate the vm_struct with the vmap_area */
    else 
        insert_vmalloc_vm(area, va, flags, caller); 
 
    return area; 
}

这个函数核心就是alloc_vmap_area，这个很有趣的，之前我们讲到了vmalloc申请的虚拟地址范围，而它只传递了size而已，对于mips，x86，arm会有不同的虚拟空间.

/* 
 * Allocate a region of KVA of the specified size and alignment, within the 
 * vstart and vend. 
 *
 * Returns the new vmap_area on success, ERR_PTR(-ENOMEM) if the vmap_area
 * itself cannot be allocated, or ERR_PTR(-EBUSY) if no suitable hole is
 * found even after one purge_vmap_area_lazy() and retry.
 */ 
static struct vmap_area *alloc_vmap_area(unsigned long size, 
                unsigned long align, 
                unsigned long vstart, unsigned long vend, 
                int node, gfp_t gfp_mask) 
{ 
    struct vmap_area *va; 
    struct rb_node *n; 
    unsigned long addr; 
    int purged = 0; 
    struct vmap_area *first; 
 
    /* size must be non-zero and page aligned; align must be a power of two. */
    BUG_ON(!size); 
    BUG_ON(size & ~PAGE_MASK); 
    BUG_ON(!is_power_of_2(align)); 
 
    va = kmalloc_node(sizeof(struct vmap_area), 
            gfp_mask & GFP_RECLAIM_MASK, node); 
    if (unlikely(!va)) 
        return ERR_PTR(-ENOMEM); 
 
retry: 
    spin_lock(&vmap_area_lock); 
    /* 
     * Invalidate cache if we have more permissive parameters. 
     * cached_hole_size notes the largest hole noticed _below_ 
     * the vmap_area cached in free_vmap_cache: if size fits 
     * into that hole, we want to scan from vstart to reuse 
     * the hole instead of allocating above free_vmap_cache. 
     * Note that __free_vmap_area may update free_vmap_cache 
     * without updating cached_hole_size or cached_align. 
     */ 
    /*
     * free_vmap_cache is NULL on the first call; afterwards it is usually
     * non-NULL because the "found" path below sets
     * free_vmap_cache = &va->rb_node. The cache is dropped when this
     * request is more permissive than the cached one (for example
     * align < cached_align): an earlier, larger alignment may have skipped
     * part of the address space, so without the cache the search has to
     * start over.
     */
    if (!free_vmap_cache ||   
            size < cached_hole_size || 
            vstart < cached_vstart || 
            align < cached_align) { 
nocache: 
        cached_hole_size = 0; 
        free_vmap_cache = NULL; 
    } 
    /* record if we encounter less permissive parameters */ 
    cached_vstart = vstart; 
    cached_align = align; 
 
    /* find starting point for our search */ 
    if (free_vmap_cache) {  
        /*
         * NULL on first use. When set, it refers to the most recently
         * allocated node, and the search starts at that node's va_end
         * (aligned).
         */
        first = rb_entry(free_vmap_cache, struct vmap_area, rb_node); 
        addr = ALIGN(first->va_end, align); 
        if (addr < vstart) 
            goto nocache; 
        if (addr + size - 1 < addr) 
            goto overflow; 
 
    } else { 
        addr = ALIGN(vstart, align); 
        if (addr + size - 1 < addr) 
            goto overflow; 
 
        /* NOTE(review): the tree root is presumably NULL until the first insertion. */
        n = vmap_area_root.rb_node; 
        first = NULL; 
 
        /*
         * No cached node: walk the tree from the root for the lowest area
         * whose va_end >= addr, stopping early at an area that also has
         * va_start <= addr.
         */
        while (n) { 
            struct vmap_area *tmp; 
            tmp = rb_entry(n, struct vmap_area, rb_node); 
      
            if (tmp->va_end >= addr) { 
                first = tmp; 
                if (tmp->va_start <= addr) 
                    break; 
                n = n->rb_left; 
            } else 
                n = n->rb_right; 
        } 
 
        /* No area ends at or above addr: addr itself is usable. */
        if (!first) 
            goto found; 
    } 
 
    /* from the starting point, walk areas until a suitable hole is found */ 
    while (addr + size > first->va_start && addr + size <= vend) {
        /* Remember the largest hole that was skipped over. */
        if (addr + cached_hole_size < first->va_start) 
            cached_hole_size = first->va_start - addr; 
        addr = ALIGN(first->va_end, align); 
        if (addr + size - 1 < addr) 
            goto overflow; 
          
        /* first is the last area on the list: no later area to check against. */
        if (list_is_last(&first->list, &vmap_area_list))
            goto found; 
 
        first = list_entry(first->list.next, 
                struct vmap_area, list); 
    } 
 
found: 
    if (addr + size > vend) 
        goto overflow; 
 
    va->va_start = addr; 
    va->va_end = addr + size; 
    va->flags = 0; 
    /* Insert the new area into the red-black tree rooted at vmap_area_root. */
    __insert_vmap_area(va);  
    /* Cache this node; it is the starting point for later searches. */
    free_vmap_cache = &va->rb_node; 
    spin_unlock(&vmap_area_lock); 
 
    BUG_ON(va->va_start & (align-1)); 
    BUG_ON(va->va_start < vstart); 
    BUG_ON(va->va_end > vend); 
 
    return va; 
 
overflow: 
    spin_unlock(&vmap_area_lock); 
    /* Purge lazily-freed areas once and retry before giving up. */
    if (!purged) { 
        purge_vmap_area_lazy(); 
        purged = 1; 
        goto retry; 
    } 
    if (printk_ratelimit()) 
        printk(KERN_WARNING 
            "vmap allocation for size %lu failed: " 
            "use vmalloc= to increase size.\n", size); 
    kfree(va); 
    return ERR_PTR(-EBUSY); 
}

既然我们已经开辟了虚拟地址空间，那么还需要做的当然是和页面一一映射起来.
看函数__vmalloc_area_node：

/*
 * Allocate the physical pages that back @area and map them into its
 * virtual address range with protection @prot.
 *
 * Returns area->addr on success. On failure returns NULL after releasing
 * the area (remove_vm_area() + kfree(), or vfree()).
 */
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, 
                 pgprot_t prot, int node, const void *caller) 
{ 
    const int order = 0; 
    struct page **pages; 
    unsigned int nr_pages, array_size, i; 
    gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; 
 
    nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT; /* pages to allocate (one PAGE_SIZE of area->size is excluded) */
    array_size = (nr_pages * sizeof(struct page *));   /* bytes needed for the array of page pointers */
 
    area->nr_pages = nr_pages; 
    /* Please note that the recursion is strictly bounded. */ 
    if (array_size > PAGE_SIZE) {
        /*
         * The pointer array does not fit in one page. Assuming 4 KiB
         * pages and 4-byte pointers, that means more than 1024 pages,
         * i.e. an allocation above 4 MiB.
         */
        pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM, 
                PAGE_KERNEL, node, caller); 
        area->flags |= VM_VPAGES; 
    } else { 
        /* The array fits in one page: get it from the slab allocator. */
        pages = kmalloc_node(array_size, nested_gfp, node);
    } 
    area->pages = pages; 
    area->caller = caller; 
    if (!area->pages) { 
        remove_vm_area(area->addr); 
        kfree(area); 
        return NULL; 
    } 
 
    /* Allocate the backing pages one at a time (order 0). */
    for (i = 0; i < area->nr_pages; i++) { 
        struct page *page; 
        gfp_t tmp_mask = gfp_mask | __GFP_NOWARN; 
 
        if (node < 0) 
            page = alloc_page(tmp_mask); 
        else 
            page = alloc_pages_node(node, tmp_mask, order); 
 
        if (unlikely(!page)) { 
            /* Successfully allocated i pages, free them in __vunmap() */ 
            area->nr_pages = i; 
            goto fail; 
        } 
        area->pages[i] = page;             /* store the page in the pointer array */
    } 
 
    /*
     * Update the page tables to map the pages into the area.
     * NOTE(review): per the article author this maps page by page and
     * flushes caches for coherency; see map_vm_area() to confirm.
     */
    if (map_vm_area(area, prot, &pages))
        goto fail; 
    return area->addr; 
 
fail: 
    warn_alloc_failed(gfp_mask, order, 
             "vmalloc: allocation failure, allocated %ld of %ld bytes\n", 
             (area->nr_pages*PAGE_SIZE), area->size); 
    vfree(area->addr); 
    return NULL; 
}

而insert_vmalloc_vmlist很明显把vm_struct插入到vmlist。
那么就完成了整个过程，没有想象的复杂，当然对内存有了更多的认识，这里还需要说一下，一般情况下有高端内存会比没有的好些，防止了vmalloc申请的时候造成的TLB抖动等问题，更少的破坏normal空间。
可以通过proc来查看vmalloc的一些信息：

cat /proc/vmallocinfo 
0xc0002000-0xc0045000 274432 jffs2_zlib_init+0x24/0xa4 pages=66 vmalloc 
0xc0045000-0xc0051000 49152 jffs2_zlib_init+0x40/0xa4 pages=11 vmalloc 
0xc0051000-0xc0053000 8192 brcmnand_create_cet+0x244/0x788 pages=1 vmalloc 
0xc0053000-0xc0055000 8192 ebt_register_table+0x98/0x39c pages=1 vmalloc

还有：

# cat /proc/vmstat 
#cat /proc/meminfo

(linuxDOS)

Linux内存管理：Malloc	Linux 内存使用方法详细解析
老生综合详谈C语言关键字、内存分配、数据存储	内存不足引起的SIGKILL：一个缓冲区不断增长问题的定位与解决
linux ipc——shared memory	用文件映射（File Mapping）实现进程间内存共享
Linux内存管理	Linux IPC：共享内存
内存块分配算法和抖动问题	Linux-mmap函数介绍

搜索

热门标签:

linux内存管理之vmalloc