库函数malloc在Linux内核中的实现是典型的匿名映射,关于匿名映射可以参考前面缺页异常处理的文章;IPC方式中的共享内存也是匿名映射,而绝大多数mmap的应用场合是文件映射。在内核中,malloc对应的处理函数是do_brk,这是因为malloc操作的是进程地址空间的堆段,而函数do_brk就是针对堆段的处理;mmap(包括IPC的共享内存)则由函数do_mmap处理。不论哪种处理,其实操作的都是进程地址空间的线性区;

下面首先看下函数do_mmap,源码如下:

/*
 * do_mmap - byte-offset front end for do_mmap_pgoff().
 *
 * @file:   file to map, or NULL for an anonymous mapping
 * @addr:   address hint (or fixed address) for the new region
 * @len:    requested length in bytes
 * @prot:   PROT_* page protection bits
 * @flag:   MAP_* flags
 * @offset: byte offset into @file; must be page aligned
 *
 * Returns whatever do_mmap_pgoff() returns, or -EINVAL (as an unsigned
 * long) when @offset + the page-aligned @len wraps around or @offset is
 * not page aligned.
 */
static inline unsigned long do_mmap(struct file *file, unsigned long addr,
	unsigned long len, unsigned long prot,
	unsigned long flag, unsigned long offset)
{
	unsigned long ret = -EINVAL;

	/* The end offset of the mapping must not wrap around. */
	if ((offset + PAGE_ALIGN(len)) < offset)
		goto out;

	/* Only page-aligned offsets are passed on, converted to a page index. */
	if (!(offset & ~PAGE_MASK))
		ret = do_mmap_pgoff(file, addr, len, prot, flag,
				    offset >> PAGE_SHIFT);
out:
	return ret;
}

更多linux内核视频教程文档资料免费领取后台私信【内核】自行获取。

malloc、mmap在Linux内核中的处理函数

它的核心函数是do_mmap_pgoff,这里主要关注下do_mmap的参数情况:

file: 如果新的线性区要把一个文件映射到内存,则需要文件对象指针file(struct file *)和文件偏移offset,如不需要,则file为NULL、offset不予考虑;addr: 指定从哪里开始查找空闲区间,一般都是NULL即由内核指定;len: 请求的线性地址空间长度;prot: 指定线性区中页的访问权限;flag: 指定线性区的其他标志;

初步有个印象即可,接下来关注函数do_mmap_pgoff,源码如下:

unsigned long do_mmap_pgoff(struct file file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long pgoff){ /当提高程的mm/ struct mm_struct mm = current->mm; struct inode inode; unsigned int vm_flags; int error; unsigned long reqprot = prot; / Does the application expect PROT_READ to imply PROT_EXEC? (the exception is when the underlying filesystem is noexec mounted, in which case we dont add PROT_EXEC.) / /是否隐蔽了可实行属性/ if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC))) prot |= PROT_EXEC; if (!len) return -EINVAL; /判断输入的欲映射的起始地址是否小于最小映射地址,如果小于,将addr修正为最小地址,不过条件是MAP_FIXED旗标没有设置/ if (!(flags & MAP_FIXED)) addr = round_hint_to_min(addr); / Careful about overflows.. / /检测len是否为0/ len = PAGE_ALIGN(len); if (!len) return -ENOMEM; / offset overflow? / /再次检测是否越界/ if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) return -EOVERFLOW; / Too many mappings? / /在一个进程中对付mmap个数是有限定的。
超出了还是nomem的缺点/ if (mm->map_count > sysctl_max_map_count) return -ENOMEM; / Obtain the address to map to. we verify (or select) it and ensure that it represents a valid section of the address space. //创建新的vma区域之前先要探求一块足够大小的空闲区域,本函数便是用于查找没有映射过的空洞内存区,返回值addr便是这段空洞的起始地址/ addr = get_unmapped_area(file, addr, len, pgoff, flags); if (addr & ~PAGE_MASK) return addr; / Do simple checking here so the lower-level routines won't have to. we assume access permissions have been handled by the open of the memory object, so we don't do any here. / /设置vm_flags,根据传入的port和flags以及mm本身自有的旗标来设置/ vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; /下面两个if是关于锁定的内存的内容,暂不关注/ if (flags & MAP_LOCKED) if (!can_do_mlock()) return -EPERM; / mlock MCL_FUTURE? / if (vm_flags & VM_LOCKED) { unsigned long locked, lock_limit; locked = len >> PAGE_SHIFT; locked += mm->locked_vm; lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; lock_limit >>= PAGE_SHIFT; if (locked > lock_limit && !capable(CAP_IPC_LOCK)) return -EAGAIN; } /判断是文件映射还是匿名映射,如果是文件映射则赋值inode/ inode = file ? file->f_path.dentry->d_inode : NULL;/对vm_flags进行设置,由参数flags确定vma线性区的flags,是共享还是私有/ /文件映射/ if (file) { switch (flags & MAP_TYPE) { /共享/ case MAP_SHARED: if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE)) return -EACCES; / Make sure we don't allow writing to an append-only file.. / if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE)) return -EACCES; / Make sure there are no mandatory locks on the file. 
/ if (locks_verify_locked(inode)) return -EAGAIN; vm_flags |= VM_SHARED | VM_MAYSHARE; if (!(file->f_mode & FMODE_WRITE)) vm_flags &= ~(VM_MAYWRITE | VM_SHARED); / fall through / /私有/ case MAP_PRIVATE: if (!(file->f_mode & FMODE_READ)) return -EACCES; if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { if (vm_flags & VM_EXEC) return -EPERM; vm_flags &= ~VM_MAYEXEC; } if (!file->f_op || !file->f_op->mmap) return -ENODEV; break; default: return -EINVAL; } } /匿名映射/ else { switch (flags & MAP_TYPE) { /共享,对应共享内存/ case MAP_SHARED: / Ignore pgoff. / pgoff = 0; vm_flags |= VM_SHARED | VM_MAYSHARE; break; /私有/ case MAP_PRIVATE: / Set pgoff according to addr for anon_vma. / pgoff = addr >> PAGE_SHIFT; break; default: return -EINVAL; } } error = security_file_mmap(file, reqprot, prot, flags, addr, 0); if (error) return error; error = ima_file_mmap(file, prot); if (error) return error; /实际创建vma/ return mmap_region(file, addr, len, flags, vm_flags, pgoff);}
这个函数由三部分组成:

1、 确定能否创建符合请求的vma,以及应当在哪里创建;

这部分主要通过函数get_unmapped_area实现,我们需要一段虚拟空间,范围是[addr,addr+len),用户进程一般不会指定addr(指定addr对应flags含有标志MAP_FIXED的情况),也就是由内核决定这个虚拟空间的首地址addr在哪里。在函数do_mmap_pgoff调用get_unmapped_area之前会对addr做一次预处理,通过调用函数round_hint_to_min实现:如果addr小于最小映射地址,就把它修正为最小映射地址,按我的理解这个最小值是宏CONFIG_DEFAULT_MMAP_MIN_ADDR的值,为4096(个人认为初始值是多少并不重要,因为后面会不断地寻找合适的值),然后用这个预处理过的addr为参数调用函数get_unmapped_area,源码如下:

unsigned longget_unmapped_area(struct file file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags){unsigned long (get_area)(struct file , unsigned long, unsigned long, unsigned long, unsigned long);unsigned long error = arch_mmap_check(addr, len, flags);if (error) return error;/ Careful about overflows.. /if (len > TASK_SIZE) return -ENOMEM; /这是内存描述符的get_unmapped_area方法,可能是arch_get_unmapped_area,也可能是arch_get_unmapped_area_topdown,用于找到得当的空闲区间可容纳[addr,addr+len],空闲区间便是指这个区间当前还没有vma/get_area = current->mm->get_unmapped_area; /这是文件的get_unmapped_area方法,一样平常来说驱动不用自己实现get_unmapped_area方法,它只需实现mmap方法,映射特定的物理内存/if (file && file->f_op && file->f_op->get_unmapped_area) get_area = file->f_op->get_unmapped_area; /会调用以上两种中的任意一种函数,终极得到得当的创建新vma的起始地址addr/addr = get_area(file, addr, len, pgoff, flags);if (IS_ERR_VALUE(addr)) return addr;if (addr > TASK_SIZE - len) return -ENOMEM;if (addr & ~PAGE_MASK) return -EINVAL; /便是返回addr/return arch_rebalance_pgtables(addr, len);}

可见,函数get_unmapped_area实际是通过函数指针get_area实现,get_area有两种可能,如果是文件映射,并且该文件的file_operations定义了get_unmapped_area方法,那么使用它的get_unmapped_area方法实现定位虚拟区间,但我估计这样用的做法很少,以mmap使用较多的设备驱动来讲,多数设备驱动文件的file_operations没有定义get_unmapped_area方法,因为没有必要;所以一般都是用另一种方法,使用mm的get_unmapped_area方法,对于arm它是函数arch_get_unmapped_area,源码如下:

unsigned longarch_get_unmapped_area(struct file filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags){struct mm_struct mm = current->mm;struct vm_area_struct vma;unsigned long start_addr;#ifdef CONFIG_CPU_V6unsigned int cache_type;int do_align = 0, aliasing = 0;/ We only need to do colour alignment if either the I or D caches alias. This is indicated by bits 9 and 21 of the cache type register. /cache_type = read_cpuid_cachetype();if (cache_type != read_cpuid_id()) { aliasing = (cache_type | cache_type >> 12) & (1 << 11); if (aliasing) do_align = filp || flags & MAP_SHARED;}#else#define do_align 0#define aliasing 0#endif/ We enforce the MAP_FIXED case. //一样平常来说flags不会是MAP_FIXED,这解释在mmap调用时就指定了详细的虚拟地址addr,当然如果真的如此,也便是确实用户进程指定了虚拟地址addr,那么也就不用通过探求空闲的vma再去找其起始地址addr/if (flags & MAP_FIXED) { if (aliasing && flags & MAP_SHARED && (addr - (pgoff << PAGE_SHIFT)) & (SHMLBA - 1)) return -EINVAL; return addr;} /哀求长度不可大于一个进程的最大地址空间长度,为3G-16MB(16MB是module占用的)/if (len > TASK_SIZE) return -ENOMEM;if (addr) { /do_align为0,故addr按页对齐/ if (do_align) addr = COLOUR_ALIGN(addr, pgoff); else addr = PAGE_ALIGN(addr); /找到本mm中的第一个vm_end大于addr的线性区/ vma = find_vma(mm, addr); /要么是没有找到vm_end大于addr的线性区,即vma = NULL, 要么是找到这样的线性区,但addr+len <= 该线性区的起始, 这解释这个addr及其len是适宜创建新的线性区vma的,它不会滋扰现有vma/ if (TASK_SIZE - len >= addr && (!vma || addr + len <= vma->vm_start)) return addr;} /上面的if不堪利,就得重新选取起始位置addr了: 1、所需长度len比当提高程mm的最大空洞还大,设置搜索位置addr(及start_addr变量)为mm的free_area_cache/if (len > mm->cached_hole_size) { start_addr = addr = mm->free_area_cache;} /2、所需长度小于当提高程mm的最大空洞,从用户空间的1/3位置开始(对付arm,便是3G/3 = 1G即0x40000000),并置mm的空洞为0,这是要重新开始仔细查找了,mm的空洞将在查找过程中随时更新/ else { start_addr = addr = TASK_UNMAPPED_BASE; mm->cached_hole_size = 0;}full_search: /确保addr页对齐/if (do_align) addr = COLOUR_ALIGN(addr, pgoff);else addr = PAGE_ALIGN(addr); /从addr开始,遍历包含它或在它之后的本进程空间的线性区vma/for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { / At this point: (!vma || addr < vma->vm_end). 
/ /如果立时就要超出用户空间了的话:/ if (TASK_SIZE - len < addr) { / Start a new search - just in case we missed some holes. / /利用临时变量start_addr,如果上次不是从用户空间1/3开始遍历的(前面的else),现在改为1/3重新遍历并置mm的空间为0,重新遍历/ if (start_addr != TASK_UNMAPPED_BASE) { start_addr = addr = TASK_UNMAPPED_BASE; mm->cached_hole_size = 0; goto full_search; } /这解释从1/3遍历也弗成,只能报错返回了!/ return -ENOMEM; } /找到得当的区间了,将利用这部分空间,还要更新mm的空闲线性区搜索位置(free_area_cache),所谓得当的区间,便是说这个区间起始地址与addr之间间隔大于哀求的长度len,以是在addr可以创建新vma/ if (!vma || addr + len <= vma->vm_start) { / Remember the place where we stopped the search: / mm->free_area_cache = addr + len; return addr; } /还是没有找到得当的空闲区间,连续往前遍历(addr = vma->vm_end),把稳须要及时更新mm的最大空洞大小,便是一旦创造有更大的空洞时/ if (addr + mm->cached_hole_size < vma->vm_start) mm->cached_hole_size = vma->vm_start - addr; /addr每次都在这次vma的结尾处,等待下次遍历时比较空洞是否够长/ addr = vma->vm_end; if (do_align) addr = COLOUR_ALIGN(addr, pgoff);}}

首先直接碰碰运气试试看:看addr后面的vma(如果存在的话)的起始地址与addr的间隔是否够长,即不小于长度len,如果可以的话,直接返回addr即可;

往往不会这么容易成功,这就将进入循环查找流程,即标号full_search的部分,这里看到了mm的free_area_cache和cached_hole_size的用途了,它们分别标识当前从哪里开始查找,以及当前进程的vma之间已知的最大空洞是多大。循环查找的过程就是让addr不断地跳到一个vma的结尾处,把它与下一个vma的开始处的间隔,和需要的长度len比较,当发现一个不小于len的空洞时,即发现了可以用来创建新vma的地方,返回addr。mm的free_area_cache用于标识每次是从哪里开始查找的,cached_hole_size则不断更新为查找过程中发现的最大空洞的值,这有利于今后再创建新vma;正常情况下返回的是找到的合适的addr;

回到函数get_unmapped_area,它将返回这个addr给函数do_mmap_pgoff;

2、 然后是确定vma线性区的flags,针对文件映射和匿名映射有所不同;

3、 最后是实际创建新vma线性区,通过函数mmap_region实现,源码如下:

unsigned long mmap_region(struct file file, unsigned long addr, unsigned long len, unsigned long flags, unsigned int vm_flags, unsigned long pgoff){struct mm_struct mm = current->mm;struct vm_area_struct vma, prev;int correct_wcount = 0;int error;struct rb_node rb_link, rb_parent;unsigned long charged = 0;struct inode inode = file ? file->f_path.dentry->d_inode : NULL;/ Clear old maps /error = -ENOMEM;munmap_back:/函数find_vma_prepare()与find_vma()基本相同,它扫描当提高程地址空间的vm_area_struct构造所形成的红黑树,试图找到包含addr的vma线性区;如果找到了,解释addr所在的虚拟区已经在利用,也便是已经有映射存在,因此要调用do_munmap()把这个老的虚拟区从进程地址空间中撤销,如果撤销不堪利,就返回一个负数;如果撤销成功,就连续查找,直到在红黑树中找不到addr所在的虚拟区/vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); /解释这个vma与要新建的vma有重叠/if (vma && vma->vm_start < addr + len) { if (do_munmap(mm, addr, len)) return -ENOMEM; goto munmap_back;}/ Check against address space limit. / /检讨映射页数是否超过映射限定/if (!may_expand_vm(mm, len >> PAGE_SHIFT)) return -ENOMEM;/ Set 'VM_NORESERVE' if we should not account for the memory use of this mapping. /if ((flags & MAP_NORESERVE)) { / We honor MAP_NORESERVE if allowed to overcommit / if (sysctl_overcommit_memory != OVERCOMMIT_NEVER) vm_flags |= VM_NORESERVE; / hugetlb applies strict overcommit unless MAP_NORESERVE / if (file && is_file_hugepages(file)) vm_flags |= VM_NORESERVE;}/ Private writable mapping: check memory availability /if (accountable_mapping(file, vm_flags)) { charged = len >> PAGE_SHIFT; if (security_vm_enough_memory(charged)) return -ENOMEM; vm_flags |= VM_ACCOUNT;}/ Can we just expand an old mapping? //检讨前一个线性区是否可以包含新的线性区,须要它的vm_flags与新线性区是一样的, 还会试图把新线性区前面的线性区及后面的线性区合并 一旦可以,立即跳到标号out,即不用新创建vma线性区 这是linux对付线性区管理的一个原则,目的是为了: 1、只管即便减少vma的个数,减少从slab获取的内存 2、只管即便减少线性区和线性区之间的空洞/vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL);if (vma) goto out;/ Determine the object being mapped and call the appropriate specific mapper. the address has already been validated, but not unmapped, but the maps are removed from the list. 
//无法合并,须要新创建vma,用slab分配一个线性区构造/vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);if (!vma) { error = -ENOMEM; goto unacct_error;} /初始化vma线性区成员/vma->vm_mm = mm;vma->vm_start = addr;vma->vm_end = addr + len;vma->vm_flags = vm_flags;vma->vm_page_prot = vm_get_page_prot(vm_flags);vma->vm_pgoff = pgoff;/如果是映射文件/if (file) { error = -EINVAL; if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) goto free_vma; if (vm_flags & VM_DENYWRITE) { error = deny_write_access(file); if (error) goto free_vma; correct_wcount = 1; } vma->vm_file = file; get_file(file); error = file->f_op->mmap(file, vma); if (error) goto unmap_and_free_vma; if (vm_flags & VM_EXECUTABLE) added_exe_file_vma(mm); / Can addr have changed?? Answer: Yes, several device drivers can do it in their f_op->mmap method. -DaveM / addr = vma->vm_start; pgoff = vma->vm_pgoff; vm_flags = vma->vm_flags;}/如果是共享匿名区(用于IPC的共享内存)/ else if (vm_flags & VM_SHARED) { error = shmem_zero_setup(vma); if (error) goto free_vma;}if (vma_wants_writenotify(vma)) vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED);/将vma关联到所有的数据构造中/vma_link(mm, vma, prev, rb_link, rb_parent);file = vma->vm_file;/ Once vma denies write, undo our temporary denial count /if (correct_wcount) atomic_inc(&inode->i_writecount);out:perf_event_mmap(vma);/更新mm的映射长度/mm->total_vm += len >> PAGE_SHIFT;vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); /关于锁定的内存的问题/if (vm_flags & VM_LOCKED) { / makes pages present; downgrades, drops, reacquires mmap_sem / long nr_pages = mlock_vma_pages_range(vma, addr, addr + len); if (nr_pages < 0) return nr_pages; / vma gone! / mm->locked_vm += (len >> PAGE_SHIFT) - nr_pages;} else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) make_pages_present(addr, addr + len);return addr;unmap_and_free_vma:if (correct_wcount) atomic_inc(&inode->i_writecount);vma->vm_file = NULL;fput(file);/ Undo any partial mapping done by a device driver. 
/unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);charged = 0;free_vma:kmem_cache_free(vm_area_cachep, vma);unacct_error:if (charged) vm_unacct_memory(charged);return error;}

这个函数包括三部分内容:

1、 去除干扰:调用函数find_vma_prepare查找是否已有vma线性区与[addr,addr+len)重叠,如果有,调用函数do_munmap把这段已有的映射撤销。函数find_vma_prepare的原理是:所有vma通过红黑树存储起来,在当前进程的红黑树中查找是否有包含addr的vma线性区,查找原理是利用红黑树的特性,后续会有关于红黑树的专题;

2、 创建映射:要注意,linux不希望vma和vma之间总有空洞,只要新创建的vma的flags属性和它前面的或后面的vma相同,那么就可以合并成一个vma,这样做一来减少vma的个数,也就减少从slab获取的物理内存,二来减少虚拟空间中空洞造成的浪费;如果无法合并,那么也只好新建vma并对vma结构体的相关成员进行初始化;最后根据vma是否有页锁定标志(VM_LOCKED),决定是否立即分配物理页;

3、 最后把该vma起始地址即addr返回;

最终do_mmap的返回值就是addr!

可见do_mmap完成的就是在本进程地址空间找到一段合适的虚拟地址空间,并把起始地址返回给用户进程,并未映射物理页(除非用户进程要求vma页锁定,或指定了MAP_POPULATE),这部分留给用户对其访问时产生的缺页异常处理。

下面再看函数do_brk,事实上do_brk和do_mmap几乎一样,因为它们的本质都是一样的;这里多说一下,它由mm/mmap.c文件中用宏SYSCALL_DEFINE1定义的brk系统调用来调用,系统调用的问题如前几篇文章所说,后续会有专题讨论它,这里直接看SYSCALL_DEFINE1(brk, ...)的源码:

/*
 * brk system call - move the end of the heap of the current process.
 *
 * @brk: requested new end address of the heap
 *
 * Returns the resulting mm->brk: equal to @brk when the request was
 * honoured, the unchanged old value otherwise. The whole operation runs
 * with mm->mmap_sem held for writing.
 */
SYSCALL_DEFINE1(brk, unsigned long, brk)
{
	unsigned long rlim, retval;
	unsigned long newbrk, oldbrk;
	struct mm_struct *mm = current->mm;
	unsigned long min_brk;

	down_write(&mm->mmap_sem);

#ifdef CONFIG_COMPAT_BRK
	min_brk = mm->end_code;
#else
	min_brk = mm->start_brk;
#endif
	if (brk < min_brk)
		goto out;

	/*
	 * Check against rlimit here. If this check is done later after the test
	 * of oldbrk with newbrk then it can escape the test and let the data
	 * segment grow beyond its set limit the in case where the limit is
	 * not page aligned -Ram Gupta
	 *
	 * Heap size plus data segment size must stay within RLIMIT_DATA.
	 */
	rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur;
	if (rlim < RLIM_INFINITY && (brk - mm->start_brk) +
			(mm->end_data - mm->start_data) > rlim)
		goto out;

	/*
	 * newbrk: requested heap end, page aligned.
	 * oldbrk: current heap end (mm->brk), page aligned.
	 */
	newbrk = PAGE_ALIGN(brk);
	oldbrk = PAGE_ALIGN(mm->brk);
	if (oldbrk == newbrk)
		goto set_brk;

	/*
	 * Always allow shrinking brk.
	 * A request at or below the current end shrinks the heap by
	 * unmapping [newbrk, oldbrk).
	 */
	if (brk <= mm->brk) {
		if (!do_munmap(mm, newbrk, oldbrk-newbrk))
			goto set_brk;
		goto out;
	}

	/*
	 * Check against existing mmap mappings.
	 * If any vma intersects [oldbrk, newbrk + PAGE_SIZE) the heap
	 * cannot grow there; leave brk unchanged.
	 */
	if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE))
		goto out;

	/* Ok, looks good - let it rip. Grow the heap. */
	if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
		goto out;
set_brk:
	mm->brk = brk;
out:
	retval = mm->brk;
	up_write(&mm->mmap_sem);
	return retval;
}

首先看看SYSCALL_DEFINE1(brk, ...)的参数,它的参数实际就是unsigned long brk,指定希望设置的堆的结束地址,进程的mm的成员brk记录了当前进程的堆结束地址,在本函数中oldbrk和newbrk分别是页对齐后的当前堆地址和希望的堆地址,如果希望的堆地址不大于当前值(brk <= mm->brk),说明是free操作,即释放堆空间,将调用函数do_munmap;

如果不是释放操作,那就是申请堆空间,首先调用函数find_vma_intersection查看当前进程地址空间在[oldbrk, newbrk+PAGE_SIZE)范围内是否已经存在vma,如已有,说明要扩展的位置已被占用,堆无法扩大,跳到标号out直接返回当前的mm->brk(即brk保持不变);否则调用函数do_brk扩大堆空间范围,实际上就是继续创建新的vma(或与已有vma合并);

以上就是SYSCALL_DEFINE1(brk, ...)的内容,下面看下函数do_brk,和do_mmap非常相似,源码如下:

unsigned long do_brk(unsigned long addr, unsigned long len){ struct mm_struct mm = current->mm; struct vm_area_struct vma, prev; unsigned long flags; struct rb_node rb_link, rb_parent; /pgoff取得这个虚拟地址addr的页号/ pgoff_t pgoff = addr >> PAGE_SHIFT; int error; len = PAGE_ALIGN(len); if (!len) return addr; /检讨给定的地址是否能够进行安全的地址映射,无需特殊关注/ error = security_file_mmap(NULL, 0, 0, 0, addr, 1); if (error) return error; flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;/创建新的vma区域之前先要探求一块足够大小(长度为参数len)的空闲区域,本函数便是用于查找没有映射过的内存区,找到后返回这个区间的起始地址addr/ error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); if (error & ~PAGE_MASK) return error; / mlock MCL_FUTURE? / if (mm->def_flags & VM_LOCKED) { unsigned long locked, lock_limit; locked = len >> PAGE_SHIFT; locked += mm->locked_vm; lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; lock_limit >>= PAGE_SHIFT; if (locked > lock_limit && !capable(CAP_IPC_LOCK)) return -EAGAIN; } / mm->mmap_sem is required to protect against another thread changing the mappings in case we sleep. / verify_mm_writelocked(mm); / Clear old maps. this also does some error checking for us / munmap_back:/函数find_vma_prepare()与find_vma()基本相同,它扫描当提高程地址空间的vm_area_struct构造所形成的红黑树,试图找到结束地址高于addr的第一个vma;如果找到,解释addr所在的vma已经在利用,也便是已经有映射存在,会去除这个映射/ vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); if (vma && vma->vm_start < addr + len) { if (do_munmap(mm, addr, len)) return -ENOMEM; goto munmap_back; } / Check against address space limits after clearing old maps... / /在实行函数find_vma_prepare后,下面3个if判断再次检讨是否资源超限/ if (!may_expand_vm(mm, len >> PAGE_SHIFT)) return -ENOMEM; if (mm->map_count > sysctl_max_map_count) return -ENOMEM; if (security_vm_enough_memory(len >> PAGE_SHIFT)) return -ENOMEM; / Can we just expand an old private anonymous mapping? 
//检讨前一个线性区是否可以包含新的线性区,须要它的vm_flags与新线性区是一样的, 还会试图把新线性区前面的线性区及后面的线性区合并 换句话说,能合并就不创建vma/ vma = vma_merge(mm, prev, addr, addr + len, flags, NULL, NULL, pgoff, NULL); /如果合并成功,那么就无需下面的创建事情了,直接跳到标号out/ if (vma) goto out; / create a vma struct for an anonymous mapping / /如果合并不堪利,则须要创建新的vma,从slab分配vma占用空间,并初始化, 终极加入mm的vma双向链表和vma红黑树/ vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); if (!vma) { vm_unacct_memory(len >> PAGE_SHIFT); return -ENOMEM; } vma->vm_mm = mm; vma->vm_start = addr; vma->vm_end = addr + len; vma->vm_pgoff = pgoff; vma->vm_flags = flags; vma->vm_page_prot = vm_get_page_prot(flags); vma_link(mm, vma, prev, rb_link, rb_parent);out:/更新mm的total_vm成员值,当看了proc文件系统内干系内容实现后, 可创造为什么malloc的内存透露是通过这个成员值的变革达到可查的/ mm->total_vm += len >> PAGE_SHIFT; /如果有页面锁定的标志,那么现在就要分配物理页/ if (flags & VM_LOCKED) { if (!mlock_vma_pages_range(vma, addr, addr + len)) mm->locked_vm += (len >> PAGE_SHIFT); } return addr;}

do_brk和do_mmap非常相似,首先确定在哪里适合创建新的vma,然后去除可能存在的已有vma的干扰,接着创建(或合并)vma,最后根据vma的flags是否有锁定标志(VM_LOCKED)决定是否立即分配物理页;