为什么内存管理和IO映射要放在一起呢?因为IO映射有memory map io(MMIO)和port map io(PMIO)两种,其中MMIO与内存管理密切相关。
MMIO和普通内存的访问的汇编指令是相同的;PMIO有自己的汇编指令。
kvm如果执行到了PMIO的指令,那么退出状态是KVM_EXIT_IO。
kvm怎么知道某段内存是MMIO,从而退出状态是KVM_EXIT_MMIO,而某段内存是普通内存,由kvm处理缺页,而不退出呢?这是本文想要探究的一个东西。
PS:kvm退出状态是KVM_EXIT_IO或者KVM_EXIT_MMIO,那么将由qemu进行读写虚拟设备IO端口的模拟,如何模拟?
PMIO的看《android qemu-kvm i8254 pit虚拟设备》,MMIO的看《android emulator虚拟设备分析第一篇之battery》。
阅读本文前,需要对MMU、页表、虚拟内存空间、物理内存空间有一些了解;还需要对kvm的影子页表的工作原理、kvm的vcpu的执行,以及kvm的ioctl有一些了解。
前提知识:
KVM之内存虚拟化:http://royluo.org/2016/03/13/kvm-mmu-virtualization/,重点看GVA,GPA,HVA,HPA,影子页表的东西
kvm api:https://kernel.org/doc/Documentation/virtual/kvm/api.txt,重点看ioctl的东西,并对kvm的使用有所印象
kvm api使用实例:https://lwn.net/Articles/658511/,https://lwn.net/Articles/658512/
QEMU-MEMORY-MANAGEMENT.TXT:https://android.googlesource.com/platform/external/qemu.git/+/master/docs/QEMU-MEMORY-MANAGEMENT.TXT
本文使用的android版本是5.1.0,x86的img;host机器是intel x86_64,ubuntu12.04
先来一个大图:

普通内存的申请(external/qemu/hw/i386/pc.c):
/*
 * Allocate a single contiguous RAM so that the goldfish
 * framebuffer can work well especially when the frame buffer is
 * large.
 *
 * qemu_ram_alloc() creates the "pc.ram" block of below_4g_mem_size bytes;
 * the value it returns is then registered as the backing of the guest
 * physical range that starts at address 0 and has the same size.
 */
ram_addr = qemu_ram_alloc(NULL, "pc.ram", below_4g_mem_size);
cpu_register_physical_memory(0, below_4g_mem_size, ram_addr);
由于kvm可以使用硬件提供的EPT(扩展页表,由硬件完成GPA到HPA的转换,从而不再需要软件维护的影子页表),所以我们只需要将HVA和GPA的关系通过ioctl告知kvm即可(KVM_SET_USER_MEMORY_REGION)。
先介绍一下qemu内存管理中重要的ram_list和RAMBlock
/* Global list of RAM blocks; only the block queue head is given an explicit initializer. */
RAMList ram_list = { .blocks = QTAILQ_HEAD_INITIALIZER(ram_list.blocks) };
/* Bookkeeping shared by every RAMBlock registered with the emulator. */
typedef struct RAMList {
/* Lock for the list — presumably the one taken by qemu_mutex_lock_ramlist(); confirm. */
QemuMutex mutex;
/* One dirty bitmap per DIRTY_MEMORY_* client; grown when the ram offset space grows. */
unsigned long *dirty_memory[DIRTY_MEMORY_NUM];
/* Cached block pointer; reset to NULL whenever a block is inserted. */
RAMBlock *mru_block;
/* Tail queue of blocks, kept sorted from biggest to smallest length. */
QTAILQ_HEAD(ram, RAMBlock) blocks;
/* Incremented each time a block is inserted into the list. */
uint32_t version;
} RAMList;
/* One region of guest RAM backed by host memory. */
typedef struct RAMBlock {
/* Host virtual address (HVA) of the backing memory. */
uint8_t *host;
/* Start of this block inside the ram_addr_t offset space (the value returned to callers). */
ram_addr_t offset;
/* Size of the block in bytes, page aligned by the allocator. */
ram_addr_t length;
/* Block flags, e.g. RAM_PREALLOC_MASK when the caller supplied the host buffer. */
uint32_t flags;
/* Human-readable name of the block. */
char idstr[256];
/* Reads can take either the iothread or the ramlist lock.
* Writes must take both locks.
*/
QTAILQ_ENTRY(RAMBlock) next;
/* Initialised to -1 by the allocator; presumably a backing file descriptor when one is used — confirm. */
int fd;
} RAMBlock;
ram_list有个锁,是RAMBlock的链表,按照length从大到小排序。RAMBlock的host是HVA,offset是qemu自己引入的介于HVA和GPA之间的一层,length是大小,idstr是名称。每一个RAMBlock对应一个slot,也就是内存插槽。
qemu_ram_alloc直接调用qemu_ram_alloc_from_ptr,主要是申请HVA,求取offset(可放下size的最小空闲空间),设置RAMBlock结构体并按从大到小顺序插入ram_list:
/*
 * Create a new RAMBlock, insert it into ram_list and return its offset in
 * the ram_addr_t space.
 *
 * dev:  not used by the code visible in this excerpt (the branch that
 *       consumes it is elided).
 * name: appended to the block's idstr.
 * size: requested size in bytes; rounded up to a whole target page.
 * host: caller-provided backing memory, or NULL to have the function
 *       allocate it with phys_mem_alloc().
 *
 * Terminates the process with exit(1) if the backing memory cannot be
 * allocated.
 */
ram_addr_t qemu_ram_alloc_from_ptr(DeviceState *dev, const char *name,
ram_addr_t size, void *host)
{
RAMBlock *block, *new_block;
ram_addr_t old_ram_size, new_ram_size;
// last_ram_offset() reports how much of the ram offset space is in use;
// old_ram_size is that amount expressed in pages
old_ram_size = last_ram_offset() >> TARGET_PAGE_BITS;
// round the size up to a page boundary
size = TARGET_PAGE_ALIGN(size);
new_block = g_malloc0(sizeof(*new_block));
new_block->fd = -1;
/* This assumes the iothread lock is taken here too. */
qemu_mutex_lock_ramlist();
//new_block->mr = mr;
// find the smallest free gap in the ram offset space that can hold `size`
new_block->offset = find_ram_offset(size);
if (host) {
// caller owns the backing memory; remember that it was pre-allocated
new_block->host = host;
new_block->flags |= RAM_PREALLOC_MASK;
}
// ... `else if` branch for xen elided ...
else {
// ... `if` branch for mem_path elided ...
if (!new_block->host) {
// reserve host virtual memory (HVA) only; host physical pages (HPA)
// are not allocated yet, that happens later when KVM handles the page fault
new_block->host = phys_mem_alloc(size);
if (!new_block->host) {
fprintf(stderr, "Cannot set up guest memory '%s': %s\n",
name, strerror(errno));
exit(1);
}
// ... HAXM handling elided ...
memory_try_enable_merging(new_block->host, size);
}
}
new_block->length = size;
// ... `if (dev)` handling elided ...
// set the name of the RAMBlock
pstrcat(new_block->idstr, sizeof(new_block->idstr), name);
/* Keep the list sorted from biggest to smallest block. */
QTAILQ_FOREACH(block, &ram_list.blocks, next) {
if (block->length < new_block->length) {
break;
}
}
// `block` is the first strictly smaller block, or NULL if there is none
if (block) {
QTAILQ_INSERT_BEFORE(block, new_block, next);
} else {
QTAILQ_INSERT_TAIL(&ram_list.blocks, new_block, next);
}
ram_list.mru_block = NULL;
ram_list.version++;
qemu_mutex_unlock_ramlist();
// grow the dirty bitmaps if the ram offset space got larger
new_ram_size = last_ram_offset() >> TARGET_PAGE_BITS;
if (new_ram_size > old_ram_size) {
int i;
for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
ram_list.dirty_memory[i] =
bitmap_zero_extend(ram_list.dirty_memory[i],
old_ram_size, new_ram_size);
}
}
// mark the whole new block dirty
cpu_physical_memory_set_dirty_range(new_block->offset, size);
// not relevant to this walkthrough
qemu_ram_setup_dump(new_block->host, size);
if (kvm_enabled())
// not relevant to this walkthrough
kvm_setup_guest_memory(new_block->host, size);
return new_block->offset;
}
cpu_register_physical_memory最终调用的是cpu_register_physical_memory_log:对普通内存执行KVM_SET_USER_MEMORY_REGION,对MMIO则不执行。随后逐页设置每一页的phys_offset:普通内存的phys_offset随着每一页的设置而增大;MMIO的phys_offset和io_index相关,保持不变:
/*
 * Associate the guest physical range [start_addr, start_addr + size) with
 * phys_offset.
 *
 * When KVM is enabled the range is first handed to kvm_set_phys_mem().
 * The range is then walked one target page at a time and the PhysPageDesc
 * of every page is updated. Finally the TLB of every CPU is flushed.
 *
 * start_addr:    first guest physical address of the range.
 * size:          length in bytes; rounded up to a whole page for the walk.
 * phys_offset:   ram offset (ordinary memory) or io-memory value (MMIO).
 * region_offset: replaced by start_addr when phys_offset is
 *                IO_MEM_UNASSIGNED, then truncated to a page boundary.
 * log_dirty:     not used by the code visible in this excerpt.
 *
 * NOTE(review): this excerpt has branches elided (the subpage handling),
 * so some locals appear unused and the `else` below has no visible `if`.
 */
void cpu_register_physical_memory_log(hwaddr start_addr,
ram_addr_t size,
ram_addr_t phys_offset,
ram_addr_t region_offset,
bool log_dirty)
{
hwaddr addr, end_addr;
PhysPageDesc *p;
CPUState *cpu;
ram_addr_t orig_size = size;
subpage_t *subpage;
if (kvm_enabled())
// ordinary memory: issue KVM_SET_USER_MEMORY_REGION for it
// MMIO: adjust the existing slots so that this range is not covered by
// any KVM_SET_USER_MEMORY_REGION registration; other ranges are unchanged
kvm_set_phys_mem(start_addr, size, phys_offset);
// ... HAXM handling elided ...
if (phys_offset == IO_MEM_UNASSIGNED) {
region_offset = start_addr;
}
region_offset &= TARGET_PAGE_MASK;
// round the size up to a page boundary
size = (size + TARGET_PAGE_SIZE - 1) & TARGET_PAGE_MASK;
end_addr = start_addr + (hwaddr)size;
addr = start_addr;
do {
p = phys_page_find(addr >> TARGET_PAGE_BITS);
if (p && p->phys_offset != IO_MEM_UNASSIGNED) {
// the page already has a descriptor that is assigned to something
ram_addr_t orig_memory = p->phys_offset;
hwaddr start_addr2, end_addr2;
// ... subpage `if` branch elided ...
else {
p->phys_offset = phys_offset;
if ((phys_offset & ~TARGET_PAGE_MASK) <= IO_MEM_ROM ||
(phys_offset & IO_MEM_ROMD))
// ordinary memory: phys_offset advances by one page per page
// MMIO: phys_offset is tied to io_index and stays the same
// no matter how many pages are registered
phys_offset += TARGET_PAGE_SIZE;
}
} else {
// no usable descriptor yet: allocate one for this page
p = phys_page_find_alloc(addr >> TARGET_PAGE_BITS, 1);
p->phys_offset = phys_offset;
p->region_offset = region_offset;
if ((phys_offset & ~TARGET_PAGE_MASK) <= IO_MEM_ROM ||
(phys_offset & IO_MEM_ROMD)) {
// ordinary memory: phys_offset advances by one page per page
// MMIO: phys_offset is tied to io_index and stays the same
// no matter how many pages are registered
phys_offset += TARGET_PAGE_SIZE;
} else {
hwaddr start_addr2, end_addr2;
// ... subpage `if` branch elided ...
}
}
region_offset += TARGET_PAGE_SIZE;
addr += TARGET_PAGE_SIZE;
} while (addr != end_addr);
/* since each CPU stores ram addresses in its TLB cache, we must
reset the modified entries */
/* XXX: slow ! */
CPU_FOREACH(cpu) {
tlb_flush(cpu->env_ptr, 1);
}
}
phys_page_find_alloc中实现了每一个页都有对应的PhysPageDesc,主要记录了该页对应的phys_offset,从而可以找到HVA以及io_index。注意l1_phys_map是一个多级页表,不仅仅是两级的。
kvm_set_phys_mem用于分配slot和重新分配slot:对普通内存执行KVM_SET_USER_MEMORY_REGION,对MMIO则不执行KVM_SET_USER_MEMORY_REGION:
/*
 * Bring KVM's memory slots in line with a registration of the guest
 * physical range [start_addr, start_addr + size).
 *
 * Every existing slot that overlaps the range is unregistered; the parts of
 * it that lie before and after the range are registered again as separate
 * slots. The range itself gets a new slot only when its flags are below
 * IO_MEM_UNASSIGNED (ordinary memory); otherwise KVM is told nothing
 * about it.
 *
 * start_addr:  first guest physical address of the range.
 * size:        length of the range in bytes.
 * phys_offset: ram offset of the backing memory; its sub-page bits carry
 *              the IO_MEM_* flags.
 *
 * Calls abort() if any KVM_SET_USER_MEMORY_REGION request fails.
 */
void kvm_set_phys_mem(hwaddr start_addr,
ram_addr_t size,
ram_addr_t phys_offset)
{
KVMState *s = kvm_state;
ram_addr_t flags = phys_offset & ~TARGET_PAGE_MASK;
KVMSlot *mem, old;
int err;
// ... page-alignment checks elided ...
/* KVM does not support read-only slots */
phys_offset &= ~IO_MEM_ROM;
while (1) {
// look for a slot whose range overlaps ours; it has to be re-registered
mem = kvm_lookup_overlapping_slot(s, start_addr, size);
if (!mem) {
break;
}
// ordinary memory that lies entirely inside the existing slot with the
// same offset mapping: nothing needs to be re-registered
if (flags < IO_MEM_UNASSIGNED && start_addr >= mem->start_addr &&
(start_addr + size <= mem->start_addr + mem->memory_size) &&
(phys_offset - start_addr == mem->phys_offset - mem->start_addr)) {
/* The new slot fits into the existing one and comes with
* identical parameters - nothing to be done. */
return;
}
// keep a copy: the slot is about to be cleared and its storage may be reused
old = *mem;
// re-registration is needed, so drop the old registration first
/* unregister the overlapping slot */
mem->memory_size = 0;
err = kvm_set_user_memory_region(s, mem);
if (err) {
fprintf(stderr, "%s: error unregistering overlapping slot: %s\n",
__func__, strerror(-err));
abort();
}
// ... (code elided in this excerpt) ...
// the part of the old slot before the overlap gets its own slot again
/* register prefix slot */
if (old.start_addr < start_addr) {
mem = kvm_alloc_slot(s);
mem->memory_size = start_addr - old.start_addr;
mem->start_addr = old.start_addr;
mem->phys_offset = old.phys_offset;
mem->flags = 0;
err = kvm_set_user_memory_region(s, mem);
if (err) {
fprintf(stderr, "%s: error registering prefix slot: %s\n",
__func__, strerror(-err));
abort();
}
}
// the part of the old slot after the overlap gets its own slot again
/* register suffix slot */
if (old.start_addr + old.memory_size > start_addr + size) {
ram_addr_t size_delta;
mem = kvm_alloc_slot(s);
mem->start_addr = start_addr + size;
// distance from the old slot's start to the start of the suffix
size_delta = mem->start_addr - old.start_addr;
mem->memory_size = old.memory_size - size_delta;
mem->phys_offset = old.phys_offset + size_delta;
mem->flags = 0;
err = kvm_set_user_memory_region(s, mem);
if (err) {
fprintf(stderr, "%s: error registering suffix slot: %s\n",
__func__, strerror(-err));
abort();
}
}
}
/* in case the KVM bug workaround already "consumed" the new slot */
if (!size)
return;
// the overlapping range itself: it gets a new slot unless it is MMIO.
// For MMIO no slot is allocated and kvm_set_user_memory_region is not
// called, so a guest access faults, KVM cannot resolve it, and the vcpu
// exits with KVM_EXIT_MMIO
/* KVM does not need to know about this memory */
if (flags >= IO_MEM_UNASSIGNED)
return;
mem = kvm_alloc_slot(s);
mem->memory_size = size;
mem->start_addr = start_addr;
mem->phys_offset = phys_offset;
mem->flags = 0;
// issues the KVM_SET_USER_MEMORY_REGION ioctl
err = kvm_set_user_memory_region(s, mem);
if (err) {
fprintf(stderr, "%s: error registering slot: %s\n", __func__,
strerror(-err));
abort();
}
}
kvm_set_user_memory_region用于普通内存的KVM_SET_USER_MEMORY_REGION,guest_phys_addr就是GPA,userspace_addr就是HVA,需要根据phys_offset,在ram_list中进行RAMBlock的查找和匹配,然后找到RAMBlock->host,也就是HVA:
/*
 * Publish one KVMSlot to the kernel through the
 * KVM_SET_USER_MEMORY_REGION ioctl.
 *
 * s:    KVM state; when its migration_log is set, dirty-page logging
 *       (KVM_MEM_LOG_DIRTY_PAGES) is requested for the region.
 * slot: the slot to publish. start_addr is passed as the guest physical
 *       address and phys_offset is turned into a host address with
 *       qemu_get_ram_ptr().
 *
 * Returns the result of kvm_vm_ioctl().
 */
static int kvm_set_user_memory_region(KVMState *s, KVMSlot *slot)
{
    struct kvm_userspace_memory_region region;

    /* Host side of the mapping: resolve the ram offset to a host address. */
    region.userspace_addr = (unsigned long)qemu_get_ram_ptr(slot->phys_offset);

    /* Guest side of the mapping. */
    region.guest_phys_addr = slot->start_addr;
    region.memory_size = slot->memory_size;
    region.slot = slot->slot;

    /* Start from the slot's own flags, then add dirty logging if needed. */
    region.flags = slot->flags;
    if (s->migration_log) {
        region.flags |= KVM_MEM_LOG_DIRTY_PAGES;
    }

    return kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &region);
}
MMIO内存的申请(external/qemu/hw/android/goldfish/device.c),申请的MMIO都是页对齐的,基本上都是申请了1页。具体内容需要看《android emulator虚拟设备分析第一篇之battery》:
/*
 * Register a goldfish device together with its memory-mapped I/O handlers.
 *
 * dev:       device to add; dev->base and dev->size give the guest physical
 *            range that the handlers will serve.
 * mem_read:  read handler table passed to cpu_register_io_memory().
 * mem_write: write handler table passed to cpu_register_io_memory().
 * opaque:    value passed through to cpu_register_io_memory().
 *
 * Always returns 0.
 */
int goldfish_device_add(struct goldfish_device *dev,
                        CPUReadMemoryFunc **mem_read,
                        CPUWriteMemoryFunc **mem_write,
                        void *opaque)
{
    int io_index;

    /* Add the device itself first, without any I/O range. */
    goldfish_add_device_no_io(dev);

    /* Obtain an io-memory value for the handlers, then bind it to the range. */
    io_index = cpu_register_io_memory(mem_read, mem_write, opaque);
    cpu_register_physical_memory(dev->base, dev->size, io_index);

    return 0;
}