# sys_mmap
```c
static inline long do_mmap2(
    unsigned long addr, unsigned long len,
    unsigned long prot, unsigned long flags,
    unsigned long fd, unsigned long pgoff)
{
    int error = -EBADF;
    struct file * file = NULL;

    flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
    if (!(flags & MAP_ANONYMOUS)) {
        file = fget(fd);
        if (!file)
            goto out;
    }

    down(&current->mm->mmap_sem);
    error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
    up(&current->mm->mmap_sem);

    if (file)
        fput(file);
out:
    return error;
}
```
1) MAP_ANONYMOUS This flag means there is no file behind the mapping; the call simply allocates memory at the requested address. In that case fd is ignored and file stays NULL.
2) file = fget(fd); Look up the struct file for fd in the current process.
3) do_mmap_pgoff does the real work, under mm->mmap_sem. A user-space sketch of the two call paths follows.
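As a small user-space illustration of the two paths through do_mmap2 (a sketch; the file path is just an example and error handling is minimal): with MAP_ANONYMOUS the fd is never touched, while a file-backed mapping goes through fget().

```c
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
    /* MAP_ANONYMOUS: no file behind the mapping, fd is ignored,
     * so do_mmap2 skips fget() and passes file == NULL down. */
    char *anon = mmap(NULL, 8192, PROT_READ | PROT_WRITE,
                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    /* File-backed: do_mmap2 does fget(fd) and hands the struct file
     * to do_mmap_pgoff.  "/etc/hostname" is only an example path. */
    int fd = open("/etc/hostname", O_RDONLY);
    char *filemap = mmap(NULL, 4096, PROT_READ, MAP_PRIVATE, fd, 0);

    if (anon == MAP_FAILED || filemap == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    anon[0] = 'x';                              /* touch the anonymous page */
    printf("first byte of file: %c\n", filemap[0]);

    munmap(anon, 8192);
    munmap(filemap, 4096);
    close(fd);
    return 0;
}
```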
# do_mmap_pgoff: map a file
```c
unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, unsigned long len,
    unsigned long prot, unsigned long flags, unsigned long pgoff)
{
    struct mm_struct * mm = current->mm;
    struct vm_area_struct * vma;
    int correct_wcount = 0;
    int error;

    if (flags & MAP_FIXED) {
        if (addr & ~PAGE_MASK)
            return -EINVAL;
    } else {
        addr = get_unmapped_area(addr, len);
        if (!addr)
            return -ENOMEM;
    }

    vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
    if (!vma)
        return -ENOMEM;

    vma->vm_mm = mm;
    vma->vm_start = addr;
    vma->vm_end = addr + len;
    vma->vm_flags = vm_flags(prot,flags) | mm->def_flags;
    vma->vm_page_prot = protection_map[vma->vm_flags & 0x0f];
    vma->vm_ops = NULL;
    vma->vm_pgoff = pgoff;
    vma->vm_file = NULL;
    vma->vm_private_data = NULL;

    error = -ENOMEM;
    if (do_munmap(mm, addr, len))
        goto free_vma;

    if ((mm->total_vm << PAGE_SHIFT) + len
        > current->rlim[RLIMIT_AS].rlim_cur)
        goto free_vma;

    if ((vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE &&
        !(flags & MAP_NORESERVE) &&
        !vm_enough_memory(len >> PAGE_SHIFT))
        goto free_vma;

    if (file) {
        if (vma->vm_flags & VM_DENYWRITE) {
            error = deny_write_access(file);
            if (error)
                goto free_vma;
            correct_wcount = 1;
        }
        vma->vm_file = file;
        get_file(file);
        error = file->f_op->mmap(file, vma);
        if (error)
            goto unmap_and_free_vma;
    } else if (flags & MAP_SHARED) {
        error = shmem_zero_setup(vma);
        if (error)
            goto free_vma;
    }

    flags = vma->vm_flags;
    addr = vma->vm_start;

    insert_vm_struct(mm, vma);
    if (correct_wcount)
        atomic_inc(&file->f_dentry->d_inode->i_writecount);

    mm->total_vm += len >> PAGE_SHIFT;
    if (flags & VM_LOCKED) {
        mm->locked_vm += len >> PAGE_SHIFT;
        make_pages_present(addr, addr + len);
    }
    return addr;

unmap_and_free_vma:
    if (correct_wcount)
        atomic_inc(&file->f_dentry->d_inode->i_writecount);
    vma->vm_file = NULL;
    fput(file);
    flush_cache_range(mm, vma->vm_start, vma->vm_end);
    zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start);
    flush_tlb_range(mm, vma->vm_start, vma->vm_end);
free_vma:
    kmem_cache_free(vm_area_cachep, vma);
    return error;
}
```
1) if (flags & MAP_FIXED) MAP_FIXED means the mapping must start exactly at addr in the process address space; if that cannot be honoured an error is returned (see the user-space sketch after this list).
2) if (addr & ~PAGE_MASK) With MAP_FIXED, addr must be page-aligned.
3) addr = get_unmapped_area(addr, len); Without MAP_FIXED, pick an address from the process address space.
4) vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); Allocate a vma. Every mmap adds a new vma, whereas brk merges the new region into an existing vma when the two are adjacent and have the same attributes.
5) vma->vm_pgoff = pgoff; Record the file offset in the vma, so the page-fault handler knows which part of the file to read.
6) if (do_munmap(mm, addr, len)) Remove any existing mapping that overlaps [addr, addr+len). This matters mainly when MAP_FIXED forces addr onto a range that is already mapped; an address returned by get_unmapped_area is a hole by construction.
7) goto free_vma; A common kernel pattern: allocate the resource first, then check the remaining conditions, and free the resource if a check fails. This seemingly wasteful order is used because the allocation may sleep and the process may be switched out; if the checks were done first and the allocation afterwards, the conditions already checked could no longer hold by the time the allocation returned.
8) vma->vm_file = file; Attach the file to the vma.
9) error = file->f_op->mmap(file, vma); file->f_op->mmap is filesystem-specific; for ext2 it is generic_file_mmap.
10) insert_vm_struct(mm, vma); Insert the new vma into the current process's mm.
11) if (flags & VM_LOCKED) VM_LOCKED means the mapped contents must stay in memory, so make_pages_present is called to fault the whole range in right away.
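A quick user-space sketch of items 1 and 2 (addresses made up): MAP_FIXED with a non-page-aligned address is rejected with EINVAL before anything else happens, while the same misaligned hint without MAP_FIXED is simply handed to get_unmapped_area.

```c
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
    /* 0x40001003 is deliberately not page-aligned; do_mmap_pgoff
     * hits the (addr & ~PAGE_MASK) check and returns -EINVAL. */
    void *p = mmap((void *)0x40001003, 4096, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
    if (p == MAP_FAILED)
        printf("MAP_FIXED failed as expected: %s\n", strerror(errno));

    /* Without MAP_FIXED the misaligned hint is not fatal: the kernel
     * picks a suitable page-aligned address via get_unmapped_area. */
    void *q = mmap((void *)0x40001003, 4096, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    printf("hint ignored, got %p\n", q);
    return 0;
}
```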
# get_unmapped_area: allocate a virtual address range in the process
```c
unsigned long get_unmapped_area(unsigned long addr, unsigned long len)
{
    struct vm_area_struct * vmm;

    if (len > TASK_SIZE)
        return 0;
    if (!addr)
        addr = TASK_UNMAPPED_BASE;
    addr = PAGE_ALIGN(addr);

    for (vmm = find_vma(current->mm, addr); ; vmm = vmm->vm_next) {
        if (TASK_SIZE - len < addr)
            return 0;
        if (!vmm || addr + len <= vmm->vm_start)
            return addr;
        addr = vmm->vm_end;
    }
}
```
1) addr = TASK_UNMAPPED_BASE; If addr is 0, start the search upward from TASK_UNMAPPED_BASE = TASK_SIZE/3 = 1GB; in other words, mmap regions begin at the 1GB mark.
2) for (vmm = find_vma(current->mm, addr); ; vmm = vmm->vm_next) find_vma returns the lowest vma whose vm_end is greater than addr, and the loop walks upward from there.
3) if (!vmm || addr + len <= vmm->vm_start) If addr + len fits below the next vma's vm_start (or there is no vma above addr at all), a hole has been found and addr is returned; otherwise the scan continues from vmm->vm_end. A small demo follows.
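One way to watch this first-fit search from user space (a sketch; the exact addresses depend on TASK_UNMAPPED_BASE and on whatever libraries are already mapped): successive anonymous mappings requested with no hint tend to come back adjacent, each one starting where the previous hole ended.

```c
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
    int i;

    /* With addr == 0, get_unmapped_area starts scanning at
     * TASK_UNMAPPED_BASE and returns the first hole that is large
     * enough, so consecutive requests usually come back contiguously. */
    for (i = 0; i < 4; i++) {
        void *p = mmap(NULL, 16 * 4096, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        printf("mapping %d at %p\n", i, p);
    }
    return 0;
}
```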
# generic_file_mmap: ext2's mmap
The mmap entry in ext2's file_operations:
```c
struct file_operations ext2_file_operations = {
    llseek:     ext2_file_lseek,
    read:       generic_file_read,
    write:      generic_file_write,
    ioctl:      ext2_ioctl,
    mmap:       generic_file_mmap,
    open:       ext2_open_file,
    release:    ext2_release_file,
    fsync:      ext2_sync_file,
};
```
generic_file_mmap itself:
```c
int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
    struct vm_operations_struct * ops;
    struct inode *inode = file->f_dentry->d_inode;

    ops = &file_private_mmap;
    if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
        if (!inode->i_mapping->a_ops->writepage)
            return -EINVAL;
        ops = &file_shared_mmap;
    }
    if (!inode->i_sb || !S_ISREG(inode->i_mode))
        return -EACCES;
    if (!inode->i_mapping->a_ops->readpage)
        return -ENOEXEC;
    UPDATE_ATIME(inode);
    vma->vm_ops = ops;
    return 0;
}
```
1) vma->vm_ops = ops; Install vm_ops; this is where the page-fault (nopage) callback lives.
2) ops = &file_private_mmap; The default is the private-mapping ops (the user-visible difference between the private and shared case is sketched after this list):

```c
static struct vm_operations_struct file_private_mmap = {
    nopage: filemap_nopage,
};
```

3) if (!inode->i_mapping->a_ops->writepage) Check the address_space_operations: a shared writable mapping is refused unless the filesystem provides writepage.
4) The address_space structure:

```c
struct address_space {
    struct list_head        clean_pages;    /* list of clean pages */
    struct list_head        dirty_pages;    /* list of dirty pages */
    struct list_head        locked_pages;   /* list of locked pages */
    unsigned long           nrpages;        /* number of total pages */
    struct address_space_operations *a_ops; /* methods */
    struct inode            *host;          /* owner: inode, block_device */
    struct vm_area_struct   *i_mmap;        /* list of private mappings */
    struct vm_area_struct   *i_mmap_shared; /* list of shared mappings */
    spinlock_t              i_shared_lock;  /* and spinlock protecting it */
};
```

5) address_space_operations, ext2's instance:

```c
struct address_space_operations ext2_aops = {
    readpage:       ext2_readpage,
    writepage:      ext2_writepage,
    sync_page:      block_sync_page,
    prepare_write:  ext2_prepare_write,
    commit_write:   generic_commit_write,
    bmap:           ext2_bmap
};
```
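From user space the choice between file_private_mmap and file_shared_mmap is the choice between MAP_PRIVATE and MAP_SHARED. A minimal sketch of the visible difference (the file path is made up and assumed to exist; error handling is omitted):

```c
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
    int fd = open("/tmp/mmap-demo", O_RDWR);    /* assumed to exist with data */

    /* MAP_SHARED: generic_file_mmap requires a_ops->writepage,
     * and stores are written back to the file. */
    char *shared = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                        MAP_SHARED, fd, 0);

    /* MAP_PRIVATE: a write fault gives this process its own copy of
     * the page (the no_share path in filemap_nopage), so the store
     * never reaches the file. */
    char *private = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE, fd, 0);

    shared[0]  = 'S';   /* visible in the file after msync/close */
    private[0] = 'P';   /* visible only through this mapping     */

    msync(shared, 4096, MS_SYNC);
    printf("shared=%c private=%c\n", shared[0], private[0]);

    munmap(shared, 4096);
    munmap(private, 4096);
    close(fd);
    return 0;
}
```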
# make_pages_present: fault the pages in eagerly
```c
int make_pages_present(unsigned long addr, unsigned long end)
{
    int write;
    struct mm_struct *mm = current->mm;
    struct vm_area_struct * vma;

    vma = find_vma(mm, addr);
    write = (vma->vm_flags & VM_WRITE) != 0;
    if (addr >= end)
        BUG();
    do {
        if (handle_mm_fault(mm, vma, addr, write) < 0)
            return -1;
        addr += PAGE_SIZE;
    } while (addr < end);
    return 0;
}
```
1) handle_mm_fault is called once per PAGE_SIZE step over the range, so every page of the mapping is faulted in immediately instead of lazily on first access. From user space the same path can be triggered as shown below.
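A minimal user-space sketch: mlock() (and, where the flag exists, mmap with MAP_LOCKED) causes the kernel to run make_pages_present over the range, so all the pages are resident before the first access.

```c
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
    size_t len = 16 * 4096;
    char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED)
        return 1;

    /* mlock() (like a VM_LOCKED mapping) makes the kernel call
     * make_pages_present on the range, so every page is faulted in
     * here rather than on first access.  May require CAP_IPC_LOCK
     * or a sufficient RLIMIT_MEMLOCK. */
    if (mlock(p, len) != 0) {
        perror("mlock");
        return 1;
    }

    memset(p, 0, len);  /* no further major faults: pages are already resident */
    munlock(p, len);
    munmap(p, len);
    return 0;
}
```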
# handle_mm_fault -> handle_pte_fault -> do_no_page: handling a not-present page
```c
static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma,
    unsigned long address, int write_access, pte_t *page_table)
{
    struct page * new_page;
    pte_t entry;

    if (!vma->vm_ops || !vma->vm_ops->nopage)
        return do_anonymous_page(mm, vma, page_table, write_access, address);

    new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK,
        (vma->vm_flags & VM_SHARED)?0:write_access);
    if (new_page == NULL)   /* no page was available -- SIGBUS */
        return 0;
    if (new_page == NOPAGE_OOM)
        return -1;

    ++mm->rss;
    flush_page_to_ram(new_page);
    flush_icache_page(vma, new_page);
    entry = mk_pte(new_page, vma->vm_page_prot);
    if (write_access) {
        entry = pte_mkwrite(pte_mkdirty(entry));
    } else if (page_count(new_page) > 1 &&
               !(vma->vm_flags & VM_SHARED))
        entry = pte_wrprotect(entry);
    set_pte(page_table, entry);
    update_mmu_cache(vma, address, entry);
    return 2;   /* Major fault */
}
```
1) new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, (vma->vm_flags & VM_SHARED) ? 0 : write_access); Call the nopage callback that mmap installed. If the vma has no nopage handler at all, do_anonymous_page supplies an anonymous page instead. A driver-style nopage sketch follows.
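The contract do_no_page relies on is the same for any vma: nopage hands back a struct page * with an extra reference, or NULL / NOPAGE_OOM on failure. A minimal sketch of what a hypothetical 2.4-era driver might install (mydev_buffer, MYDEV_SIZE and mydev_vm_ops are all invented for this sketch, not kernel code):

```c
#include <linux/mm.h>

#define MYDEV_SIZE (64 * 1024)          /* hypothetical buffer size            */
extern char mydev_buffer[];             /* hypothetical kmalloc'd/static buffer */

/* Driver-side nopage handler, 2.4 style. */
static struct page *mydev_nopage(struct vm_area_struct *vma,
                                 unsigned long address, int write_access)
{
    unsigned long offset;
    struct page *page;

    offset = (address - vma->vm_start) + (vma->vm_pgoff << PAGE_SHIFT);
    if (offset >= MYDEV_SIZE)
        return NULL;                    /* out of range: do_no_page sends SIGBUS */

    page = virt_to_page(mydev_buffer + offset);
    get_page(page);                     /* do_no_page will map it, keep a reference */
    return page;
}

static struct vm_operations_struct mydev_vm_ops = {
    nopage: mydev_nopage,
};
```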
# vma->vm_ops->nopage
vm_ops was already set up in generic_file_mmap. For the ext2 filesystem it is:
```c
ops = &file_private_mmap;

static struct vm_operations_struct file_private_mmap = {
    nopage: filemap_nopage,
};
```
So nopage ultimately ends up in filemap_nopage.
# filemap_nopage: the page-fault handler
```c
struct page * filemap_nopage(struct vm_area_struct * area,
    unsigned long address, int no_share)
{
    int error;
    struct file *file = area->vm_file;
    struct inode *inode = file->f_dentry->d_inode;
    struct address_space *mapping = inode->i_mapping;
    struct page *page, **hash, *old_page;
    unsigned long size, pgoff;

    pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;

retry_all:
    size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
    if ((pgoff >= size) && (area->vm_mm == current->mm))
        return NULL;

    hash = page_hash(mapping, pgoff);
retry_find:
    page = __find_get_page(mapping, pgoff, hash);
    if (!page)
        goto no_cached_page;

    if (!Page_Uptodate(page))
        goto page_not_uptodate;

success:
    if (VM_SequentialReadHint(area))
        nopage_sequential_readahead(area, pgoff, size);

    old_page = page;
    if (no_share) {
        struct page *new_page = page_cache_alloc();

        if (new_page) {
            copy_user_highpage(new_page, old_page, address);
            flush_page_to_ram(new_page);
        } else
            new_page = NOPAGE_OOM;
        page_cache_release(page);
        return new_page;
    }

    flush_page_to_ram(old_page);
    return old_page;

no_cached_page:
    if ((pgoff < size) && !VM_RandomReadHint(area))
        error = read_cluster_nonblocking(file, pgoff, size);
    else
        error = page_cache_read(file, pgoff);

    if (error >= 0)
        goto retry_find;

    if (error == -ENOMEM)
        return NOPAGE_OOM;
    return NULL;

page_not_uptodate:
    lock_page(page);
    if (!page->mapping) {
        UnlockPage(page);
        page_cache_release(page);
        goto retry_all;
    }
    if (Page_Uptodate(page)) {
        UnlockPage(page);
        goto success;
    }

    if (!mapping->a_ops->readpage(file, page)) {
        wait_on_page(page);
        if (Page_Uptodate(page))
            goto success;
    }

    lock_page(page);
    if (!page->mapping) {
        UnlockPage(page);
        page_cache_release(page);
        goto retry_all;
    }
    if (Page_Uptodate(page)) {
        UnlockPage(page);
        goto success;
    }
    ClearPageError(page);
    if (!mapping->a_ops->readpage(file, page)) {
        wait_on_page(page);
        if (Page_Uptodate(page))
            goto success;
    }

    page_cache_release(page);
    return NULL;
}
```
1) hash = page_hash(mapping, pgoff); page = __find_get_page(mapping, pgoff, hash); First look for the page at pgoff in the global page_hash_table. pgoff is in PAGE_CACHE_SIZE units relative to the start of the file; the arithmetic is worked through below.
2) if (!Page_Uptodate(page)) If the page is found, check whether its contents are up to date.
3) error = read_cluster_nonblocking(file, pgoff, size); If the page is not in the hash, allocate a new physical page, read it in from the device, and put it on the relevant lists. This path also reads a few pages ahead.
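A tiny worked example of the pgoff computation at the top of filemap_nopage (all numbers are made up, 4KB pages assumed):

```c
#include <stdio.h>

#define PAGE_CACHE_SHIFT 12                     /* 4 KB pages assumed */

int main(void)
{
    unsigned long vm_start = 0x40018000UL;      /* hypothetical vma->vm_start           */
    unsigned long vm_pgoff = 8;                 /* file offset of the mapping, in pages */
    unsigned long address  = 0x4001b123UL;      /* faulting address                     */

    unsigned long pgoff =
        ((address - vm_start) >> PAGE_CACHE_SHIFT) + vm_pgoff;

    /* (0x3123 >> 12) + 8 = 3 + 8 = 11: the fault is served
     * from page 11 of the file, i.e. byte offset 11 * 4096. */
    printf("pgoff = %lu\n", pgoff);
    return 0;
}
```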
## page_cache_read: read one page of the file into the page cache
```c
static inline int page_cache_read(struct file * file, unsigned long offset)
{
    struct inode *inode = file->f_dentry->d_inode;
    struct address_space *mapping = inode->i_mapping;
    struct page **hash = page_hash(mapping, offset);
    struct page *page;

    spin_lock(&pagecache_lock);
    page = __find_page_nolock(mapping, offset, *hash);
    spin_unlock(&pagecache_lock);
    if (page)
        return 0;

    page = page_cache_alloc();
    if (!page)
        return -ENOMEM;

    if (!add_to_page_cache_unique(page, mapping, offset, hash)) {
        int error = mapping->a_ops->readpage(file, page);
        page_cache_release(page);
        return error;
    }
    page_cache_free(page);
    return 0;
}
```
1) page = __find_page_nolock(mapping, offset, *hash); Check the hash table once more in case someone else has already read this page in.
2) if (!add_to_page_cache_unique(page, mapping, offset, hash)) Add the page to the three structures (inode queue, hash table, LRU); the function re-checks for a race under pagecache_lock. A user-space sketch of this check/allocate/re-check pattern follows.
3) int error = mapping->a_ops->readpage(file, page); Call the filesystem's readpage to fill the page with data.
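The shape of page_cache_read, an optimistic unlocked lookup, an allocation done outside the lock, then a re-check under the lock inside add_to_page_cache_unique, is a general pattern. A minimal user-space sketch of it (toy cache, pthread mutex, all names invented):

```c
#include <pthread.h>
#include <stdlib.h>

#define SLOTS 1024

static void *cache[SLOTS];
static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;

/* Returns 0 if obj was inserted, 1 if an entry already existed. */
static int add_unique(unsigned long offset, void *obj)
{
    int err = 1;

    pthread_mutex_lock(&cache_lock);
    if (!cache[offset % SLOTS]) {
        cache[offset % SLOTS] = obj;
        err = 0;
    }
    pthread_mutex_unlock(&cache_lock);
    return err;
}

int cache_read(unsigned long offset)
{
    void *obj;

    if (cache[offset % SLOTS])          /* unlocked fast path, may be stale */
        return 0;

    obj = malloc(4096);                 /* allocate before the lock: it may block */
    if (!obj)
        return -1;

    if (!add_unique(offset, obj))
        return 0;                       /* we won the race; fill obj here */

    free(obj);                          /* somebody beat us to it */
    return 0;
}
```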
### add_to_page_cache_unique
```c
static int add_to_page_cache_unique(struct page * page,
    struct address_space *mapping, unsigned long offset,
    struct page **hash)
{
    int err;
    struct page *alias;

    spin_lock(&pagecache_lock);
    alias = __find_page_nolock(mapping, offset, *hash);

    err = 1;
    if (!alias) {
        __add_to_page_cache(page,mapping,offset,hash);
        err = 0;
    }

    spin_unlock(&pagecache_lock);
    return err;
}

static inline void __add_to_page_cache(struct page * page,
    struct address_space *mapping, unsigned long offset,
    struct page **hash)
{
    unsigned long flags;

    if (PageLocked(page))
        BUG();

    flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) |
        (1 << PG_dirty) | (1 << PG_referenced) | (1 << PG_arch_1));
    page->flags = flags | (1 << PG_locked);
    page_cache_get(page);
    page->index = offset;
    add_page_to_inode_queue(mapping, page);
    add_page_to_hash_queue(page, hash);
    lru_cache_add(page);
}
```
1) add_page_to_inode_queue(mapping, page); Put the page on i_mapping's clean_pages list.
2) add_page_to_hash_queue(page, hash); Put the page on the global page hash table.
3) lru_cache_add(page); Put the page on the active_list LRU.