内核代码阅读(13) - sys_mmap

简介: sys_mmap

sys_mmap

/*
 * Common mmap entry helper.
 *
 * Unless MAP_ANONYMOUS is set, looks up the file for 'fd' with fget() and
 * fails with -EBADF when there is none. The mapping itself is created by
 * do_mmap_pgoff() while holding the current mm's mmap_sem; the file
 * reference taken here is dropped again before returning.
 *
 * Returns the value produced by do_mmap_pgoff() (stored in an int), or
 * -EBADF.
 */
static inline long do_mmap2(
        unsigned long addr, unsigned long len,
        unsigned long prot, unsigned long flags,
        unsigned long fd, unsigned long pgoff)
{
        struct file * file = NULL;
        int error;

        /* MAP_EXECUTABLE and MAP_DENYWRITE are always stripped here. */
        flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);

        if (!(flags & MAP_ANONYMOUS)) {
                file = fget(fd);
                if (!file)
                        return -EBADF;
        }

        down(&current->mm->mmap_sem);
        error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
        up(&current->mm->mmap_sem);

        /* Drop the reference obtained from fget(). */
        if (file)
                fput(file);

        return error;
}
1) MAP_ANONYMOUS
   这个flag表示没有文件映射,只是用来在指定的地址上分配内存。
2) file = fget(fd);
   获取进程中的file结构。
3) do_mmap_pgoff

do_mmap_pgoff 映射文件

/*
 * Create a new mapping of 'len' bytes in the current process's mm.
 *
 * file:  backing file, or NULL when no file is mapped.
 * addr:  requested start address (mandatory and page aligned with MAP_FIXED,
 *        otherwise passed to get_unmapped_area()).
 * prot, flags: combined into vm_flags through vm_flags(prot, flags).
 * pgoff: stored in the new vma as vm_pgoff.
 *
 * Returns the start address of the new mapping, or a negative errno value
 * (-EINVAL, -ENOMEM, or whatever deny_write_access(), f_op->mmap() or
 * shmem_zero_setup() reported) carried in the unsigned long return value.
 */
unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, unsigned long len,
        unsigned long prot, unsigned long flags, unsigned long pgoff)
{
        struct mm_struct * mm = current->mm;
        struct vm_area_struct * vma;
        int correct_wcount = 0;
        int error;
        /* With MAP_FIXED the caller's address is used as-is and must be page aligned. */
        if (flags & MAP_FIXED) {
                if (addr & ~PAGE_MASK)
                        return -EINVAL;
        } else {
                /* Otherwise let get_unmapped_area() choose; 0 means nothing was found. */
                addr = get_unmapped_area(addr, len);
                if (!addr)
                        return -ENOMEM;
        }
        /* Allocate and fill in the vma before the remaining checks run. */
        vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
        if (!vma)
                return -ENOMEM;
        vma->vm_mm = mm;
        vma->vm_start = addr;
        vma->vm_end = addr + len;
        vma->vm_flags = vm_flags(prot,flags) | mm->def_flags;
        /* The low four bits of vm_flags index protection_map. */
        vma->vm_page_prot = protection_map[vma->vm_flags & 0x0f];
        vma->vm_ops = NULL;
        vma->vm_pgoff = pgoff;
        vma->vm_file = NULL;
        vma->vm_private_data = NULL;
        /* Every failure up to the file handling below reports -ENOMEM. */
        error = -ENOMEM;
        /* Remove whatever is currently mapped in [addr, addr + len). */
        if (do_munmap(mm, addr, len))
                goto free_vma;
        /* Enforce RLIMIT_AS: bytes already mapped plus this request. */
        if ((mm->total_vm << PAGE_SHIFT) + len
            > current->rlim[RLIMIT_AS].rlim_cur)
                goto free_vma;
        /*
         * Writable but not shared, and MAP_NORESERVE not given:
         * vm_enough_memory() must accept the page count.
         */
        if ((vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE &&
            !(flags & MAP_NORESERVE)                                 &&
            !vm_enough_memory(len >> PAGE_SHIFT))
                goto free_vma;
        if (file) {
                if (vma->vm_flags & VM_DENYWRITE) {
                        error = deny_write_access(file);
                        if (error)
                                goto free_vma;
                        /*
                         * Remember to atomic_inc i_writecount later.
                         * NOTE(review): presumably this undoes a decrement
                         * done by deny_write_access() - confirm.
                         */
                        correct_wcount = 1;
                }
                /* The vma holds its own file reference; fput() on the error path drops it. */
                vma->vm_file = file;
                get_file(file);
                /* Let the file's mmap method set up the vma. */
                error = file->f_op->mmap(file, vma);
                if (error)
                        goto unmap_and_free_vma;
        } else if (flags & MAP_SHARED) {
                /* No file, but MAP_SHARED: set up through shmem_zero_setup(). */
                error = shmem_zero_setup(vma);
                if (error)
                        goto free_vma;
        }
        /*
         * Re-read flags and start address from the vma.
         * NOTE(review): presumably because the calls above may have modified
         * them - confirm. From here on 'flags' holds VM_* bits.
         */
        flags = vma->vm_flags;
        addr = vma->vm_start;
        insert_vm_struct(mm, vma);
        if (correct_wcount)
                atomic_inc(&file->f_dentry->d_inode->i_writecount);
        /* Account the new pages in the mm. */
        mm->total_vm += len >> PAGE_SHIFT;
        if (flags & VM_LOCKED) {
                /* Locked mapping: count the pages and fault them in right away. */
                mm->locked_vm += len >> PAGE_SHIFT;
                make_pages_present(addr, addr + len);
        }
        return addr;
    /* f_op->mmap() failed: undo the file setup and zap anything mapped in the range. */
    unmap_and_free_vma:
        if (correct_wcount)
                atomic_inc(&file->f_dentry->d_inode->i_writecount);
        vma->vm_file = NULL;
        fput(file);
        flush_cache_range(mm, vma->vm_start, vma->vm_end);
        zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start);
        flush_tlb_range(mm, vma->vm_start, vma->vm_end);
    /* Common failure exit: give the vma back to its slab cache. */
    free_vma:
        kmem_cache_free(vm_area_cachep, vma);
        return error;
    }
1) if (flags & MAP_FIXED)
   MAP_FIXED: 表示映射文件到进程空间的起始地址必须是addr,如果满足不了则返回错误。
2) if (addr & ~PAGE_MASK)
   addr必须要page对齐。
3) addr = get_unmapped_area(addr, len);
   如果MAP_FIXED没有设置,则从进程的地址空间中分配一个addr。
4) vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
   分配一个vma。每次mmap都会新增一个vma;而brk则会先判断新增的区间是否与已有的vma相邻并且属性相同,如果是就直接合并,而不新增vma。
5) vma->vm_pgoff = pgoff;
   记录文件的偏移量到vma中,以供缺页中断时读取文件。
6) if (do_munmap(mm, addr, len))
   解除[addr, addr+len)范围内已有的映射。这发生在指定了MAP_FIXED、并且addr和之前的映射重叠的时候(没有指定MAP_FIXED时,addr来自get_unmapped_area,本来就是空闲区间)。
7) goto free_vma;
   TODO
   内核中有很多都是先分配资源,然后进一步检查条件,如果检查失败,则释放资源。
   之所以采用这种看似浪费的做法,是因为分配资源可能导致进程切换。如果先检查、后分配,一旦在分配资源的过程中发生了进程切换,先前检查通过的条件可能已经不成立了。
8) vma->vm_file = file;
   设置file
9) error = file->f_op->mmap(file, vma);
   file->f_op->mmap 和具体的文件系统相关,ext2中对应的是 generic_file_mmap。
10) insert_vm_struct(mm, vma);
    把新的vma插入到当前进程的mm中。
11) if (flags & VM_LOCKED)
    如果设置了VM_LOCKED标记,表示把文件的内容锁在内存中,此时调用 make_pages_present,把文件读进内存。

get_unmapped_area 用户进程中分配虚拟地址区间

/*
 * Find a free range of 'len' bytes in the current process's address space.
 *
 * The search starts at 'addr' rounded up with PAGE_ALIGN(), or at
 * TASK_UNMAPPED_BASE when 'addr' is 0, and walks the vma list upwards from
 * the vma returned by find_vma().
 *
 * Returns the start of the first gap that can hold 'len' bytes, or 0 when
 * 'len' exceeds TASK_SIZE or the candidate range would end above TASK_SIZE.
 */
unsigned long get_unmapped_area(unsigned long addr, unsigned long len)
{
        struct vm_area_struct *next;

        if (len > TASK_SIZE)
                return 0;

        if (!addr)
                addr = TASK_UNMAPPED_BASE;
        addr = PAGE_ALIGN(addr);

        next = find_vma(current->mm, addr);
        for (;;) {
                /* The candidate range would run past TASK_SIZE. */
                if (addr > TASK_SIZE - len)
                        return 0;

                /* The next vma starts inside the candidate range: skip past it. */
                if (next && next->vm_start < addr + len) {
                        addr = next->vm_end;
                        next = next->vm_next;
                        continue;
                }

                /* Either no vma lies ahead or the gap before it is large enough. */
                return addr;
        }
}
1) addr = TASK_UNMAPPED_BASE;
   如果addr为0,则从TASK_SIZE/3=1G的位置开始往上找。也就是说,mmap是从1G开始的。
2) for (vmm = find_vma(current->mm, addr); ; vmm = vmm->vm_next)
   find_vma返回第一个vm_end大于addr的vma,从这个vma开始往后找。
3) if (!vmm || addr + len <= vmm->vm_start)
   如果后面已经没有vma了,或者addr+len <= vmm->vm_start,说明找到了一个足够大的空洞,返回addr。

generic_file_mmap ext2的mmap

mmap的定义
/* ext2 file_operations table; its mmap entry is generic_file_mmap. */
struct file_operations ext2_file_operations = {
        .llseek  = ext2_file_lseek,
        .read    = generic_file_read,
        .write   = generic_file_write,
        .ioctl   = ext2_ioctl,
        .mmap    = generic_file_mmap,
        .open    = ext2_open_file,
        .release = ext2_release_file,
        .fsync   = ext2_sync_file,
};
generic_file_mmap
/*
 * mmap method that selects the vm_operations used for a file mapping.
 *
 * A vma with both VM_SHARED and VM_MAYWRITE set gets file_shared_mmap and
 * requires the mapping to provide writepage; every other vma gets
 * file_private_mmap.
 *
 * Returns 0 on success, -EINVAL when writepage is needed but missing,
 * -EACCES when the inode has no superblock or is not a regular file, and
 * -ENOEXEC when the mapping has no readpage method.
 */
int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
        struct inode *inode = file->f_dentry->d_inode;
        struct vm_operations_struct *ops = &file_private_mmap;

        if ((vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) ==
            (VM_SHARED | VM_MAYWRITE)) {
                if (!inode->i_mapping->a_ops->writepage)
                        return -EINVAL;
                ops = &file_shared_mmap;
        }

        if (!(inode->i_sb && S_ISREG(inode->i_mode)))
                return -EACCES;

        if (!inode->i_mapping->a_ops->readpage)
                return -ENOEXEC;

        UPDATE_ATIME(inode);
        vma->vm_ops = ops;
        return 0;
}
1) vma->vm_ops = ops;
   设置vm_ops,这是缺页的回调函数。
2) ops = &file_private_mmap;
   /* vm_operations used for private file mappings: only nopage is provided. */
   static struct vm_operations_struct file_private_mmap = {
       .nopage = filemap_nopage,
   };
3) if (!inode->i_mapping->a_ops->writepage)
   检查 address_space_operations。
4) address_space
   TODO
   /*
    * Page bookkeeping attached to an owner ('host'): three page lists
    * (clean, dirty, locked), the total page count, the operations table
    * (a_ops), and the lists of private and shared vma mappings together
    * with the spinlock that guards them.
    */
   struct address_space {
    struct list_head        clean_pages;        /* list of clean pages */
    struct list_head        dirty_pages;        /* list of dirty pages */
    struct list_head        locked_pages;        /* list of locked pages */
    unsigned long                nrpages;        /* number of total pages */
    struct address_space_operations *a_ops;        /* methods */
    struct inode                *host;                /* owner: inode, block_device */
    struct vm_area_struct        *i_mmap;        /* list of private mappings */
    struct vm_area_struct        *i_mmap_shared; /* list of shared mappings */
    spinlock_t                i_shared_lock;  /* and spinlock protecting it */
  };
5) address_space_operations
   /* ext2 address_space_operations table (the readpage/writepage used by the mmap path). */
   struct address_space_operations ext2_aops = {
    .readpage      = ext2_readpage,
    .writepage     = ext2_writepage,
    .sync_page     = block_sync_page,
    .prepare_write = ext2_prepare_write,
    .commit_write  = generic_commit_write,
    .bmap          = ext2_bmap,
   };

make_pages_present 主动触发缺页

/*
 * Fault in every page of [addr, end) in the current mm by calling
 * handle_mm_fault() once per PAGE_SIZE step.
 *
 * The vma is looked up once for 'addr', and its VM_WRITE bit decides
 * whether the faults are requested as writes. Calls BUG() when
 * addr >= end.
 *
 * Returns 0 on success, -1 as soon as handle_mm_fault() reports a
 * negative value.
 */
int make_pages_present(unsigned long addr, unsigned long end)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma = find_vma(mm, addr);
        int write = (vma->vm_flags & VM_WRITE) ? 1 : 0;

        if (addr >= end)
                BUG();

        for (;;) {
                if (handle_mm_fault(mm, vma, addr, write) < 0)
                        return -1;

                addr += PAGE_SIZE;
                if (addr >= end)
                        break;
        }

        return 0;
}
1) 每隔一个 PAGE_SIZE 调用一次 handle_mm_fault。
handle_mm_fault -> handle_pte_fault -> do_no_page 主动触发缺页
/*
 * Resolve a fault by obtaining a page from the vma's nopage method and
 * installing a PTE for it in *page_table.
 *
 * A vma without vm_ops or without a nopage method is handed to
 * do_anonymous_page() instead.
 *
 * Returns: the result of do_anonymous_page() in the fallback case;
 * 0 when nopage returned NULL; -1 when it returned NOPAGE_OOM;
 * 2 once the PTE has been installed.
 */
static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma,
        unsigned long address, int write_access, pte_t *page_table)
    {
        struct page * new_page;
        pte_t entry;
        if (!vma->vm_ops || !vma->vm_ops->nopage)
                return do_anonymous_page(mm, vma, page_table, write_access, address);
        /*
         * Ask the vma for the page at the page-aligned address. The last
         * argument is 0 for VM_SHARED mappings and write_access otherwise.
         */
        new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, (vma->vm_flags & VM_SHARED)?0:write_access);
        if (new_page == NULL)        /* no page was available -- SIGBUS */
                return 0;
        if (new_page == NOPAGE_OOM)
                return -1;
        /* One more resident page for this mm. */
        ++mm->rss;
        flush_page_to_ram(new_page);
        flush_icache_page(vma, new_page);
        /* Build the PTE from the page and the vma's protection bits. */
        entry = mk_pte(new_page, vma->vm_page_prot);
        if (write_access) {
                /* Write fault: the entry is made dirty and writable. */
                entry = pte_mkwrite(pte_mkdirty(entry));
        } else if (page_count(new_page) > 1 &&
                   !(vma->vm_flags & VM_SHARED))
                /*
                 * Read fault on a non-shared mapping whose page has other
                 * users: install it write-protected.
                 * NOTE(review): presumably so that a later write faults
                 * again and gets its own copy - confirm.
                 */
                entry = pte_wrprotect(entry);
        set_pte(page_table, entry);
        update_mmu_cache(vma, address, entry);
        return 2;        /* Major fault */
    }
1) new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, (vma->vm_flags & VM_SHARED)?0:write_access);
   调用mmap的缺页回调。
vma->vm_ops->nopage
vm_ops在 generic_file_mmap中已经设置好了。
   ext2文件系统的设置:
ops = &file_private_mmap;
       /* vm_operations used for private file mappings: only nopage is provided. */
       static struct vm_operations_struct file_private_mmap = {
           .nopage = filemap_nopage,
       };
所以 nopage最终会进入 filemap_nopage。
# filemap_nopage 缺页处理
/*
 * nopage method for file mappings: produce the page that backs 'address'
 * inside 'area'.
 *
 * The page is looked up through page_hash()/__find_get_page(). If it is
 * missing it is read in (read_cluster_nonblocking() or page_cache_read())
 * and the lookup is retried; if it is present but not up to date,
 * a_ops->readpage() is tried up to two times.
 *
 * no_share: when non-zero, a newly allocated copy of the page is returned
 *           and the reference on the looked-up page is released.
 *
 * Returns the page, NULL on failure (including an offset at or beyond the
 * end of the file when the area belongs to the current mm), or NOPAGE_OOM
 * when an allocation fails or the read reports -ENOMEM.
 */
struct page * filemap_nopage(struct vm_area_struct * area,
        unsigned long address, int no_share)
    {
        int error;
        struct file *file = area->vm_file;
        struct inode *inode = file->f_dentry->d_inode;
        struct address_space *mapping = inode->i_mapping;
        struct page *page, **hash, *old_page;
        unsigned long size, pgoff;
        /* Page index in the file: page offset inside the area plus the area's vm_pgoff. */
        pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
    retry_all:
        /* File size in pages, rounded up; recomputed on every full retry. */
        size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
        if ((pgoff >= size) && (area->vm_mm == current->mm))
                return NULL;
        hash = page_hash(mapping, pgoff);
    retry_find:
        page = __find_get_page(mapping, pgoff, hash);
        if (!page)
                goto no_cached_page;
        if (!Page_Uptodate(page))
                goto page_not_uptodate;
    success:
        /* Areas carrying the sequential-read hint get readahead. */
        if (VM_SequentialReadHint(area))
                nopage_sequential_readahead(area, pgoff, size);
        old_page = page;
        if (no_share) {
                /* Hand back a copy in a fresh page and drop our reference on the original. */
                struct page *new_page = page_cache_alloc();
                if (new_page) {
                        copy_user_highpage(new_page, old_page, address);
                        flush_page_to_ram(new_page);
                } else
                        new_page = NOPAGE_OOM;
                page_cache_release(page);
                return new_page;
        }
        flush_page_to_ram(old_page);
        return old_page;
    no_cached_page:
        /*
         * The page was not found: inside the file and without the
         * random-read hint a cluster read is issued, otherwise a
         * single-page read.
         */
        if ((pgoff < size) && !VM_RandomReadHint(area))
                error = read_cluster_nonblocking(file, pgoff, size);
        else
                error = page_cache_read(file, pgoff);
        /* A non-negative result means the lookup is worth repeating. */
        if (error >= 0)
                goto retry_find;
        if (error == -ENOMEM)
                return NOPAGE_OOM;
        return NULL;
    page_not_uptodate:
        lock_page(page);
        /*
         * The page no longer has a mapping: drop it and start over.
         * NOTE(review): presumably it was removed from the page cache
         * while we waited for the lock - confirm.
         */
        if (!page->mapping) {
                UnlockPage(page);
                page_cache_release(page);
                goto retry_all;
        }
        /* It became up to date while we waited for the lock. */
        if (Page_Uptodate(page)) {
                UnlockPage(page);
                goto success;
        }
        /* First read attempt: a zero return means wait for the page and re-check. */
        if (!mapping->a_ops->readpage(file, page)) {
                wait_on_page(page);
                if (Page_Uptodate(page))
                        goto success;
        }
        /* Second attempt: same checks again, with the page's error flag cleared first. */
        lock_page(page);
        if (!page->mapping) {
                UnlockPage(page);
                page_cache_release(page);
                goto retry_all;
        }
        if (Page_Uptodate(page)) {
                UnlockPage(page);
                goto success;
        }
        ClearPageError(page);
        if (!mapping->a_ops->readpage(file, page)) {
                wait_on_page(page);
                if (Page_Uptodate(page))
                        goto success;
        }
        /* Both attempts failed: drop our reference and report failure. */
        page_cache_release(page);
        return NULL;
    }
1) hash = page_hash(mapping, pgoff);
   page = __find_get_page(mapping, pgoff, hash);
   首先在全局的page_hash_table里尝试搜索pgoff的页面。
2) if (!Page_Uptodate(page))
   如果找到了,检查页面的内容是否是最新的。
3) error = read_cluster_nonblocking(file, pgoff, size);
   如果页面不在hash里面,则分配新的物理页,并从设备上读入。把物理页加入相应的队列中。
   这个函数会向前预读一些页面。
## page_cache_read 从文件读内容到一个页面
static inline int page_cache_read(struct file * file, unsigned long offset) 
    {
        struct inode *inode = file->f_dentry->d_inode;
        struct address_space *mapping = inode->i_mapping;
        struct page **hash = page_hash(mapping, offset);
        struct page *page; 
        spin_lock(&pagecache_lock);
        page = __find_page_nolock(mapping, offset, *hash); 
        spin_unlock(&pagecache_lock);
        if (page)
                return 0;
        page = page_cache_alloc();
        if (!page)
                return -ENOMEM;
        if (!add_to_page_cache_unique(page, mapping, offset, hash)) {
                int error = mapping->a_ops->readpage(file, page);
                page_cache_release(page);
                return error;
        }
        page_cache_free(page);
        return 0;
    }
1) page = __find_page_nolock(mapping, offset, *hash);
   再次到hash表中看看是否已经有别人把这个页面读进来了。
2) if (!add_to_page_cache_unique(page, mapping, offset, hash))
   添加页面到3个链表中。
3) int error = mapping->a_ops->readpage(file, page);
   调用相应文件系统的接口读入内容。
### add_to_page_cache_unique
/*
 * Insert 'page' for (mapping, offset) unless a page for that slot is
 * already hashed. Lookup and insertion both happen under pagecache_lock.
 *
 * Returns 0 when 'page' was added, 1 when an existing page was found and
 * nothing was changed.
 */
static int add_to_page_cache_unique(struct page * page,
        struct address_space *mapping, unsigned long offset,
        struct page **hash)
{
        struct page *existing;
        int found;

        spin_lock(&pagecache_lock);
        existing = __find_page_nolock(mapping, offset, *hash);
        found = existing ? 1 : 0;
        if (!found)
                __add_to_page_cache(page, mapping, offset, hash);
        spin_unlock(&pagecache_lock);

        return found;
}
    /*
     * Put 'page' into the page cache at index 'offset' of 'mapping'.
     *
     * Calls BUG() if the page is already locked. Clears the uptodate,
     * error, dirty, referenced and arch_1 flag bits, sets the locked bit,
     * takes a reference with page_cache_get(), records the index, and then
     * adds the page to the inode queue, the hash queue and the LRU cache.
     */
    static inline void __add_to_page_cache(struct page * page,
        struct address_space *mapping, unsigned long offset,
        struct page **hash)
    {
        unsigned long flags;

        if (PageLocked(page))
                BUG();

        flags = page->flags;
        flags &= ~((1 << PG_uptodate) | (1 << PG_error) |
                   (1 << PG_dirty) | (1 << PG_referenced) |
                   (1 << PG_arch_1));
        flags |= (1 << PG_locked);
        page->flags = flags;

        page_cache_get(page);
        page->index = offset;

        add_page_to_inode_queue(mapping, page);
        add_page_to_hash_queue(page, hash);
        lru_cache_add(page);
    }
1) add_page_to_inode_queue(mapping, page);
   添加物理页到 i_mapping的clean_pages中。
2) add_page_to_hash_queue(page, hash);
   添加物理页到hash表中。
3) lru_cache_add(page);
   添加物理页到 active_list中
相关文章
|
4月前
|
存储 算法 Linux
【Linux系统编程】Linux 文件系统探究:深入理解 struct dirent、DIR 和 struct stat结构
【Linux系统编程】Linux 文件系统探究:深入理解 struct dirent、DIR 和 struct stat结构
170 0
|
10月前
|
API 开发工具
【Pintos】实现自定义 UserProg 系统调用 | 添加 syscall-nr 系统调用号 | 编写新的参数调用宏
【Pintos】实现自定义 UserProg 系统调用 | 添加 syscall-nr 系统调用号 | 编写新的参数调用宏
97 0
|
4月前
|
缓存 Linux 编译器
C/C++ 函数调用以及Linux中系统调用 开销介绍:介绍C/C函数调用以及Linux中系统调用的开销情况
C/C++ 函数调用以及Linux中系统调用 开销介绍:介绍C/C函数调用以及Linux中系统调用的开销情况
54 0
|
4月前
|
存储 Linux
Linux文件编程(lseek函数和stat函数)
Linux文件编程(lseek函数和stat函数)
76 0
Linux文件编程(lseek函数和stat函数)
|
缓存 Linux API
系统编程之文件IO(七)——0,1,2三个文件描述符与库函数和系统调用的区别
系统编程之文件IO(七)——0,1,2三个文件描述符与库函数和系统调用的区别
102 0
系统编程之文件IO(七)——0,1,2三个文件描述符与库函数和系统调用的区别
|
存储 API
驱动开发:内核R3与R0内存映射拷贝
在上一篇博文`《驱动开发:内核通过PEB得到进程参数》`中我们通过使用`KeStackAttachProcess`附加进程的方式得到了该进程的PEB结构信息,本篇文章同样需要使用进程附加功能,但这次我们将实现一个更加有趣的功能,在某些情况下应用层与内核层需要共享一片内存区域通过这片区域可打通内核与应用层的隔离,此类功能的实现依附于MDL内存映射机制实现。
323 0
驱动开发:内核R3与R0内存映射拷贝
|
Linux
Linux系统编程-进程间通信(mmap内存映射)
前面文章介绍了进程间常用的通信方式: 无名管道和命名管道,这篇文章介绍内存映射,内存映射在多进程访问文件读写的时候非常方便。
242 0