QEMU side memory management
The MemoryRegion is the link between guest physical address space and the
RAMBlocks containing the memory. Each MemoryRegion has the ram_addr_t offset
of the RAMBlock and each RAMBlock has a MemoryRegion pointer.
Note that MemoryRegion is more general than just RAM. It can also represent I/O
memory where read/write callback functions are invoked on access. This is how
hardware register accesses from a guest CPU are dispatched to emulated devices.
The address_space_rw() function dispatches load/store accesses to the
appropriate MemoryRegions. If a MemoryRegion is a RAM region then the data will
be accessed from the RAMBlock’s mmapped guest RAM. The address_space_memory
global variable is the guest physical memory space.
http://blog.vmsplice.net/2016/01/qemu-internals-how-guest-physical-ram.html
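To make the GPA-to-HVA relationship concrete, here is a minimal sketch (not QEMU code; the struct names and the linear search are invented for illustration) of how a guest physical address resolves to a host virtual address once a RAM region is mapped; QEMU's real lookup goes through the FlatView and dispatch structures described later.

    /* Minimal sketch (invented types): how a guest physical address (GPA)
     * resolves to a host virtual address (HVA) once a RAM MemoryRegion
     * is mapped into the guest physical address space. */
    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t hwaddr;

    struct ram_block {
        uint8_t *host;      /* HVA of the mmap()ed block */
        hwaddr   size;
    };

    struct mem_region {
        hwaddr            gpa_base; /* where the region sits in guest physical space */
        struct ram_block *block;
    };

    /* Linear search stands in for QEMU's FlatView/dispatch lookup. */
    static uint8_t *gpa_to_hva(struct mem_region *regions, size_t n, hwaddr gpa)
    {
        for (size_t i = 0; i < n; i++) {
            struct mem_region *mr = &regions[i];
            if (gpa >= mr->gpa_base && gpa < mr->gpa_base + mr->block->size) {
                return mr->block->host + (gpa - mr->gpa_base);
            }
        }
        return NULL; /* unassigned: dispatched as I/O or faulted instead */
    }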
A region is created by one of the memory_region_init*() functions and attached
to an object, which acts as its owner or parent. QEMU ensures that the owner
object remains alive as long as the region is visible to the guest, or as long
as the region is in use by a virtual CPU or another device. After creation, a
region can be added to an address space or a container with
memory_region_add_subregion(), and removed using memory_region_del_subregion().
https://qemu.weilnetz.de/doc/devel/memory.html
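As a usage sketch, a hypothetical device could create and map a RAM region as below. The device object, the "mydev.ram" name, and the GPA are made up for illustration; the functions are the QEMU API described above.

    /* Sketch: typical MemoryRegion lifecycle for a (hypothetical) device. */
    MemoryRegion *mr = g_new0(MemoryRegion, 1);

    /* Create a RAM-backed region owned by the device object. */
    memory_region_init_ram(mr, OBJECT(dev), "mydev.ram", 4096, &error_fatal);

    /* Make it visible to the guest at GPA 0xfe000000. */
    memory_region_add_subregion(get_system_memory(), 0xfe000000, mr);

    /* Later, remove it from the guest-visible address space again. */
    memory_region_del_subregion(get_system_memory(), mr);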
Important data structures managing VMs on QEMU side
struct KVMState
{
    AccelState parent_obj;

    int nr_slots;
    int fd;
    int vmfd;
    int coalesced_mmio;
    int coalesced_pio;
    struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
    bool coalesced_flush_in_progress;
    int vcpu_events;
    int robust_singlestep;
    int debugregs;
    ......
    KVMMemoryListener memory_listener;
    QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus;

    /* For "info mtree -f" to tell if an MR is registered in KVM */
    int nr_as;
    struct KVMAs {
        KVMMemoryListener *ml;
        AddressSpace *as;
    } *as;
    ......
};
In QEMU, KVMState is the data structure that represents the per-VM state of the KVM accelerator. Its fields hold the file descriptors used to talk to the kernel (fd is the handle to /dev/kvm, and vmfd is the VM file descriptor returned by KVM_CREATE_VM), capability flags probed from the kernel, and the memory listener and slot bookkeeping introduced below.
The KVMState structure is used extensively throughout QEMU's KVM accelerator code to initialize and configure the virtual machine, issue ioctls, handle interrupts and other events, and manage interactions between the virtual machine and the host.
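The distinction between fd and vmfd mirrors the kernel's KVM API: system-level ioctls go to /dev/kvm, while per-VM ioctls go to the VM descriptor. QEMU wraps the two paths roughly as in the simplified sketch below (the real helpers in accel/kvm/kvm-all.c take variadic arguments and add tracing).

    /* Simplified sketch of QEMU's two ioctl paths. */
    int kvm_ioctl(KVMState *s, int type, void *arg)
    {
        int ret = ioctl(s->fd, type, arg);   /* /dev/kvm: system-level ops */
        return ret < 0 ? -errno : ret;
    }

    int kvm_vm_ioctl(KVMState *s, int type, void *arg)
    {
        int ret = ioctl(s->vmfd, type, arg); /* per-VM ops, e.g.
                                                KVM_SET_USER_MEMORY_REGION */
        return ret < 0 ? -errno : ret;
    }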
struct MemoryRegion {
    Object parent_obj;

    /* private: */

    /* The following fields should fit in a cache line */
    bool romd_mode;
    bool ram;
    bool subpage;
    bool readonly; /* For RAM regions */
    bool nonvolatile;
    bool rom_device;
    bool flush_coalesced_mmio;
    uint8_t dirty_log_mask;
    bool is_iommu;
    RAMBlock *ram_block;
    Object *owner;

    const MemoryRegionOps *ops;
    ......
};
/**
 * struct AddressSpace: describes a mapping of addresses to #MemoryRegion objects
 */
struct AddressSpace {
    /* private: */
    struct rcu_head rcu;
    char *name;
    MemoryRegion *root;

    /* Accessed via RCU. */
    struct FlatView *current_map;

    int ioeventfd_nb;
    struct MemoryRegionIoeventfd *ioeventfds;
    QTAILQ_HEAD(, MemoryListener) listeners;
    QTAILQ_ENTRY(AddressSpace) address_spaces_link;
};
struct RAMBlock {
    struct rcu_head rcu;
    struct MemoryRegion *mr;
    uint8_t *host;
    uint8_t *colo_cache; /* For colo, VM's ram cache */
    ram_addr_t offset;
    ram_addr_t used_length;
    ram_addr_t max_length;
    void (*resized)(const char*, uint64_t length, void *host);
    uint32_t flags;
    /* Protected by iothread lock. */
    char idstr[256];
    /* RCU-enabled, writes protected by the ramlist lock */
    QLIST_ENTRY(RAMBlock) next;
    QLIST_HEAD(, RAMBlockNotifier) ramblock_notifiers;
    int fd;
    int private_fd;
    size_t page_size;
    /* dirty bitmap used during migration */
    unsigned long *bmap;
    /* bitmap of already received pages in postcopy */
    unsigned long *receivedmap;
    ......
};
For TDX
Because RAMBlock is the basic unit for managing a VM instance's memory, the TDX-enabled QEMU extends RAMBlock to support private memory and shared memory within one RAMBlock. Note the two file-descriptor member fields: fd backs the shared address space and private_fd backs the private address space.
Memory region & address space initialization
Global structures for memory and IO for VMs
static MemoryRegion *system_memory;
static MemoryRegion *system_io;

AddressSpace address_space_io;
AddressSpace address_space_memory;
Regardless of the target architecture, these global MemoryRegion and AddressSpace objects for system memory and system I/O are required to provide the memory and I/O spaces for the VM.
Initialize address space
static void qemu_create_machine(QDict *qdict)
{
    MachineClass *machine_class = select_machine(qdict, &error_fatal);
    object_set_machine_compat_props(machine_class->compat_props);

    set_memory_options(machine_class);

    current_machine = MACHINE(object_new_with_class(OBJECT_CLASS(machine_class)));
    object_property_add_child(object_get_root(), "machine",
                              OBJECT(current_machine));
    object_property_add_child(container_get(OBJECT(current_machine),
                                            "/unattached"),
                              "sysbus", OBJECT(sysbus_get_default()));

    if (machine_class->minimum_page_bits) {
        if (!set_preferred_target_page_bits(machine_class->minimum_page_bits)) {
            /* This would be a board error: specifying a minimum smaller than
             * a target's compile-time fixed setting.
             */
            g_assert_not_reached();
        }
    }

    cpu_exec_init_all();
    ......
}
void cpu_exec_init_all(void)
{
    qemu_mutex_init(&ram_list.mutex);
    finalize_target_page_bits();
    io_mem_init();
    memory_map_init();
    qemu_mutex_init(&map_client_list_lock);
}
static void memory_map_init(void)
{
    system_memory = g_malloc(sizeof(*system_memory));

    memory_region_init(system_memory, NULL, "system", UINT64_MAX);
    address_space_init(&address_space_memory, system_memory, "memory");

    system_io = g_malloc(sizeof(*system_io));
    memory_region_init_io(system_io, NULL, &unassigned_io_ops, NULL, "io",
                          65536);
    address_space_init(&address_space_io, system_io, "I/O");
}
This is where the system_memory and system_io memory regions are allocated and initialized. The memory_region_init function initializes member fields such as the name and size. The initialized memory region is then registered as the root memory region of the corresponding address space by the address_space_init function.
void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name)
{
    memory_region_ref(root);
    as->root = root;
    as->current_map = NULL;
    as->ioeventfd_nb = 0;
    as->ioeventfds = NULL;
    QTAILQ_INIT(&as->listeners);
    QTAILQ_INSERT_TAIL(&address_spaces, as, address_spaces_link);
    as->name = g_strdup(name ? name : "anonymous");
    address_space_update_topology(as);
    address_space_update_ioeventfds(as);
}
Now system_memory and system_io are registered as the root memory regions of address_space_memory and address_space_io, respectively. However, note that this merely establishes links between the memory regions and the address spaces; no actual memory has been registered for those two address spaces yet.
Real memory allocation through RAMBlock
/* PC hardware initialisation */
static void pc_init1(MachineState *machine,
                     const char *host_type, const char *pci_type)
{
    PCMachineState *pcms = PC_MACHINE(machine);
    PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms);
    X86MachineState *x86ms = X86_MACHINE(machine);
    MemoryRegion *system_memory = get_system_memory();
    MemoryRegion *system_io = get_system_io();
    ......
    if (pcmc->pci_enabled) {
        pci_memory = g_new(MemoryRegion, 1);
        memory_region_init(pci_memory, NULL, "pci", UINT64_MAX);
        rom_memory = pci_memory;
    } else {
    ......
    /* allocate ram and load rom/bios */
    if (!xen_enabled()) {
        pc_memory_init(pcms, system_memory,
                       rom_memory, &ram_memory);
    } else {
    ......
void pc_memory_init(PCMachineState *pcms,
                    MemoryRegion *system_memory,
                    MemoryRegion *rom_memory,
                    MemoryRegion **ram_memory)
{
    int linux_boot, i;
    MemoryRegion *option_rom_mr;
    MemoryRegion *ram_below_4g, *ram_above_4g;
    ......
    *ram_memory = machine->ram;
    ram_below_4g = g_malloc(sizeof(*ram_below_4g));
    memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", machine->ram,
                             0, x86ms->below_4g_mem_size);
    memory_region_add_subregion(system_memory, 0, ram_below_4g);
    e820_add_entry(0, x86ms->below_4g_mem_size, E820_RAM);
    if (x86ms->above_4g_mem_size > 0) {
        ram_above_4g = g_malloc(sizeof(*ram_above_4g));
        memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g",
                                 machine->ram,
                                 x86ms->below_4g_mem_size,
                                 x86ms->above_4g_mem_size);
        memory_region_add_subregion(system_memory, 0x100000000ULL,
                                    ram_above_4g);
        e820_add_entry(0x100000000ULL, x86ms->above_4g_mem_size, E820_RAM);
    }
    ......
Register new memory region as a subregion
The memory_region_add_subregion function registers a newly created memory region with an existing memory region as a sub-region. The parameter mr is the existing (container) memory region, and subregion is the new memory region to register under mr. The offset is where the new memory region starts within the container; for a region added directly to system_memory, this is its GPA.
void memory_region_add_subregion(MemoryRegion *mr,
                                 hwaddr offset,
                                 MemoryRegion *subregion)
{
    subregion->priority = 0;
    memory_region_add_subregion_common(mr, offset, subregion);
}
static void memory_region_add_subregion_common(MemoryRegion *mr,
                                               hwaddr offset,
                                               MemoryRegion *subregion)
{
    MemoryRegion *alias;

    assert(!subregion->container);
    subregion->container = mr;
    for (alias = subregion->alias; alias; alias = alias->alias) {
        alias->mapped_via_alias++;
    }
    subregion->addr = offset;
    memory_region_update_container_subregions(subregion);
}
static void memory_region_update_container_subregions(MemoryRegion *subregion)
{
    MemoryRegion *mr = subregion->container;
    MemoryRegion *other;

    memory_region_transaction_begin();

    memory_region_ref(subregion);
    QTAILQ_FOREACH(other, &mr->subregions, subregions_link) {
        if (subregion->priority >= other->priority) {
            QTAILQ_INSERT_BEFORE(other, subregion, subregions_link);
            goto done;
        }
    }
    QTAILQ_INSERT_TAIL(&mr->subregions, subregion, subregions_link);
done:
    memory_region_update_pending |= mr->enabled && subregion->enabled;
    memory_region_transaction_commit();
}
Because the sub-regions of a memory region are maintained in the subregions member field of the container MemoryRegion, the newly created sub-region must be inserted into that list (kept sorted by priority). Now the sub-region is successfully registered in the subregions list of its container MemoryRegion. However, note that this link was established on the QEMU side only, and the memory must still be registered on the KVM side. Remember that the subregions are maintained in tree form and must be translated into the flatview to be processed easily by the KVM side; a toy sketch of this flattening follows below. Refer to [[]] about flatview.
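To see what this flattening means, the toy sketch below (invented types, not QEMU code) renders a nested region tree into absolute ranges; QEMU's real render_memory_region additionally handles priorities, aliases, and clipping of overlapping siblings.

    /* Toy sketch: flatten a region tree into absolute, disjoint ranges. */
    #include <stdint.h>
    #include <inttypes.h>
    #include <stdio.h>

    struct region {
        const char    *name;
        uint64_t       addr;        /* offset within the parent region */
        uint64_t       size;
        int            is_ram;      /* leaf ("terminating") region? */
        struct region *children[4]; /* NULL-terminated */
    };

    static void render(const struct region *r, uint64_t base)
    {
        uint64_t abs_addr = base + r->addr;
        if (r->is_ram) {
            /* A leaf becomes one flat range at an absolute address. */
            printf("%#" PRIx64 "-%#" PRIx64 " : %s\n",
                   abs_addr, abs_addr + r->size, r->name);
            return;
        }
        for (int i = 0; r->children[i]; i++) {
            render(r->children[i], abs_addr); /* containers only add an offset */
        }
    }

    int main(void)
    {
        struct region below = { "ram-below-4g", 0x0,         0x80000000, 1, {0} };
        struct region above = { "ram-above-4g", 0x100000000, 0x80000000, 1, {0} };
        struct region sys   = { "system",       0x0,         UINT64_MAX, 0,
                                { &below, &above, 0 } };
        render(&sys, 0);   /* prints the two disjoint absolute ranges */
        return 0;
    }

Rendering the "system" container with "ram-below-4g" at offset 0 and "ram-above-4g" at 0x100000000 yields exactly the two disjoint ranges that later become KVM memory slots.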
Notify new memory region to KVM side
static void kvm_region_add(MemoryListener *listener,
                           MemoryRegionSection *section)
{
    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);

    memory_region_ref(section->mr);
    kvm_set_phys_mem(kml, section, true);
}
The kvm_region_add function is registered as the region_add hook of the KVMMemoryListener. It is therefore invoked whenever a new FlatRange becomes visible in the address space's FlatView: once when the listener is first registered (listener_add_address_space replays the existing FlatView, see below), and afterwards whenever a memory transaction commit finds a range that exists only in the new FlatView (address_space_update_topology_pass).
static void kvm_set_phys_mem(KVMMemoryListener *kml,
                             MemoryRegionSection *section, bool add)
{
    KVMSlot *mem;
    int err;
    MemoryRegion *mr = section->mr;
    bool writeable = !mr->readonly && !mr->rom_device;
    hwaddr start_addr, size, slot_size, mr_offset;
    ram_addr_t ram_start_offset;
    void *ram;

    if (!memory_region_is_ram(mr)) {
        if (writeable || !kvm_readonly_mem_allowed) {
            return;
        } else if (!mr->romd_mode) {
            /* If the memory device is not in romd_mode, then we actually want
             * to remove the kvm memory slot so all accesses will trap. */
            add = false;
        }
    }

    size = kvm_align_section(section, &start_addr);
    if (!size) {
        return;
    }

    /* The offset of the kvmslot within the memory region */
    mr_offset = section->offset_within_region + start_addr -
        section->offset_within_address_space;

    /* use aligned delta to align the ram address and offset */
    ram = memory_region_get_ram_ptr(mr) + mr_offset;
    ram_start_offset = memory_region_get_ram_addr(mr) + mr_offset;

    kvm_slots_lock();

    if (!add) {
        do {
            slot_size = MIN(kvm_max_slot_size, size);
            mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
            if (!mem) {
                goto out;
            }
            if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
                /*
                 * NOTE: We should be aware of the fact that here we're only
                 * doing a best effort to sync dirty bits.  No matter whether
                 * we're using dirty log or dirty ring, we ignored two facts:
                 *
                 * (1) dirty bits can reside in hardware buffers (PML)
                 *
                 * (2) after we collected dirty bits here, pages can be dirtied
                 * again before we do the final KVM_SET_USER_MEMORY_REGION to
                 * remove the slot.
                 *
                 * Not easy.  Let's cross the fingers until it's fixed.
                 */
                if (kvm_state->kvm_dirty_ring_size) {
                    kvm_dirty_ring_reap_locked(kvm_state);
                } else {
                    kvm_slot_get_dirty_log(kvm_state, mem);
                }
                kvm_slot_sync_dirty_pages(mem);
            }

            /* unregister the slot */
            g_free(mem->dirty_bmap);
            mem->dirty_bmap = NULL;
            mem->memory_size = 0;
            mem->flags = 0;
            err = kvm_set_user_memory_region(kml, mem, false);
            if (err) {
                fprintf(stderr, "%s: error unregistering slot: %s\n",
                        __func__, strerror(-err));
                abort();
            }
            start_addr += slot_size;
            size -= slot_size;
        } while (size);
        goto out;
    }

    /* register the new slot */
    do {
        slot_size = MIN(kvm_max_slot_size, size);
        mem = kvm_alloc_slot(kml);
        mem->as_id = kml->as_id;
        mem->memory_size = slot_size;
        mem->start_addr = start_addr;
        mem->ram_start_offset = ram_start_offset;
        mem->ram = ram;
        mem->flags = kvm_mem_flags(mr);
        if (mem->flags & KVM_MEM_PRIVATE) {
            mem->fd = mr->ram_block->private_fd;
            mem->ofs = (uint8_t*)ram - mr->ram_block->host;
        } else {
            mem->fd = -1;
            mem->ofs = -1;
        }
        kvm_slot_init_dirty_bitmap(mem);
        err = kvm_set_user_memory_region(kml, mem, true);
        if (err) {
            fprintf(stderr, "%s: error registering slot: %s\n", __func__,
                    strerror(-err));
            abort();
        }
        start_addr += slot_size;
        ram_start_offset += slot_size;
        ram += slot_size;
        size -= slot_size;
    } while (size);

out:
    kvm_slots_unlock();
}
Get KVMSlot for new memory
typedef struct KVMSlot
{
    hwaddr start_addr;
    ram_addr_t memory_size;
    void *ram;
    int slot;
    int flags;
    int old_flags;
    /* Dirty bitmap cache for the slot */
    unsigned long *dirty_bmap;
    unsigned long dirty_bmap_size;
    /* Cache of the address space ID */
    int as_id;
    /* Cache of the offset in ram address space */
    ram_addr_t ram_start_offset;
    int fd;
    hwaddr ofs;
} KVMSlot;
/* Called with KVMMemoryListener.slots_lock held */
static KVMSlot *kvm_alloc_slot(KVMMemoryListener *kml)
{
    KVMSlot *slot = kvm_get_free_slot(kml);

    if (slot) {
        return slot;
    }

    fprintf(stderr, "%s: no free slot available\n", __func__);
    abort();
}
/* Called with KVMMemoryListener.slots_lock held */
static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml)
{
    KVMState *s = kvm_state;
    int i;

    for (i = 0; i < s->nr_slots; i++) {
        if (kml->slots[i].memory_size == 0) {
            return &kml->slots[i];
        }
    }

    return NULL;
}
KVM_SET_USER_MEMORY_REGION ioctl to communicate with KVM
static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot, bool new)
{
    KVMState *s = kvm_state;
    struct kvm_userspace_memory_region_ext mem;
    int ret;

    mem.region.slot = slot->slot | (kml->as_id << 16);
    mem.region.guest_phys_addr = slot->start_addr;
    mem.region.userspace_addr = (unsigned long)slot->ram;
    mem.region.flags = slot->flags;
    if (slot->flags & KVM_MEM_PRIVATE) {
        mem.private_fd = slot->fd;
        mem.private_offset = slot->ofs;
    } else {
        mem.private_fd = -1;
        mem.private_offset = -1;
    }

    if (kvm_tdx_enabled() && !(slot->flags & KVM_MEM_PRIVATE)) {
        warn_report("%s: Non-private memory backend is used for TDX"
                    " slot %d,"
                    " start_addr 0x%" PRIx64 ","
                    " ram 0x%" PRIx64 ","
                    " size 0x%" PRIx64 ","
                    " flags 0x%x",
                    __func__, mem.region.slot, slot->start_addr,
                    (uint64_t)slot->ram, slot->memory_size, slot->flags);
    }

    if (slot->memory_size && !new &&
        (slot->flags ^ slot->old_flags) & KVM_MEM_READONLY) {
        /* Set the slot size to 0 before setting the slot to the desired
         * value. This is needed based on KVM commit 75d61fbc. */
        mem.region.memory_size = 0;
        ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
        if (ret < 0) {
            goto err;
        }
    }
    mem.region.memory_size = slot->memory_size;
    ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
    slot->old_flags = mem.region.flags;
err:
    trace_kvm_set_user_memory(mem.region.slot, mem.region.flags,
                              mem.region.guest_phys_addr, mem.region.memory_size,
                              mem.region.userspace_addr, ret);
    if (ret < 0) {
        error_report("%s: KVM_SET_USER_MEMORY_REGION failed, slot=%d,"
                     " start=0x%" PRIx64 ", size=0x%" PRIx64 ","
                     " flags=0x%" PRIx32 ","
                     " private_fd=%" PRId32 ", private_offset=0x%" PRIx64 ": %s",
                     __func__, mem.region.slot, slot->start_addr,
                     (uint64_t)mem.region.memory_size, mem.region.flags,
                     mem.private_fd, (uint64_t)mem.private_offset,
                     strerror(errno));
    }
    return ret;
}
/* for KVM_SET_USER_MEMORY_REGION */
struct kvm_userspace_memory_region {
    __u32 slot;
    __u32 flags;
    __u64 guest_phys_addr;
    __u64 memory_size;    /* bytes */
    __u64 userspace_addr; /* start of the userspace allocated memory */
};

struct kvm_userspace_memory_region_ext {
    struct kvm_userspace_memory_region region;
    __u64 private_offset;
    __u32 private_fd;
    __u32 pad1;
    __u64 pad2[14];
};
The KVM_SET_USER_MEMORY_REGION ioctl lets QEMU install memory it has set up into KVM. QEMU fills in a kvm_userspace_memory_region_ext parameter so that KVM knows which memory region should be created (or deleted) and assigned for the guest VM, and which host memory backs it.
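To see the ioctl outside of QEMU, here is a minimal standalone program using the vanilla (non-TDX) struct kvm_userspace_memory_region; the slot number, GPA, and size are arbitrary, and error handling is reduced to perror.

    /* Minimal standalone KVM_SET_USER_MEMORY_REGION example.
     * Build: gcc -o kvmslot kvmslot.c */
    #include <fcntl.h>
    #include <linux/kvm.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
        int kvm = open("/dev/kvm", O_RDWR);          /* like KVMState.fd */
        if (kvm < 0) { perror("open /dev/kvm"); return 1; }

        int vmfd = ioctl(kvm, KVM_CREATE_VM, 0);     /* like KVMState.vmfd */
        if (vmfd < 0) { perror("KVM_CREATE_VM"); return 1; }

        /* The "RAMBlock": plain anonymous host memory (the HVA). */
        size_t size = 0x200000;
        void *ram = mmap(NULL, size, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (ram == MAP_FAILED) { perror("mmap"); return 1; }

        struct kvm_userspace_memory_region region = {
            .slot = 0,
            .flags = 0,
            .guest_phys_addr = 0x0,               /* GPA where the slot starts */
            .memory_size = size,
            .userspace_addr = (unsigned long)ram, /* backing HVA */
        };
        if (ioctl(vmfd, KVM_SET_USER_MEMORY_REGION, &region) < 0) {
            perror("KVM_SET_USER_MEMORY_REGION");
            return 1;
        }
        printf("installed %zu bytes of guest RAM at GPA 0\n", size);

        close(vmfd);
        close(kvm);
        return 0;
    }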
Allocate and assign memory for new memory region
After the QEMU- and KVM-side initialization of the root memory regions for system memory and I/O, further memory regions may need to be created and registered as sub-regions of other regions, for example when the guest VM requires RAM. QEMU creates MemoryRegions called RAM regions and makes them available to the guest VM as needed.
The actual host memory is represented as a RAMBlock, and it is allocated by the memory_region_init_ram function.
void memory_region_init_ram(MemoryRegion *mr,
                            Object *owner,
                            const char *name,
                            uint64_t size,
                            Error **errp)
{
    DeviceState *owner_dev;
    Error *err = NULL;

    memory_region_init_ram_nomigrate(mr, owner, name, size, &err);
    if (err) {
        error_propagate(errp, err);
        return;
    }
    /* This will assert if owner is neither NULL nor a DeviceState.
     * We only want the owner here for the purposes of defining a
     * unique name for migration. TODO: Ideally we should implement
     * a naming scheme for Objects which are not DeviceStates, in
     * which case we can relax this restriction.
     */
    owner_dev = DEVICE(owner);
    vmstate_register_ram(mr, owner_dev);
}
The MemoryRegion parameter is the newly allocated memory region that needs to be initialized. Note that this function's main role is allocating the memory space backing the region: the MemoryRegion itself is metadata describing the actual memory, and the actual memory space assigned to it is represented by the RAMBlock-type member field ram_block.
void memory_region_init_ram_nomigrate(MemoryRegion *mr,
                                      Object *owner,
                                      const char *name,
                                      uint64_t size,
                                      Error **errp)
{
    memory_region_init_ram_flags_nomigrate(mr, owner, name, size, 0, errp);
}

void memory_region_init_ram_flags_nomigrate(MemoryRegion *mr,
                                            Object *owner,
                                            const char *name,
                                            uint64_t size,
                                            uint32_t ram_flags,
                                            Error **errp)
{
    Error *err = NULL;

    memory_region_init(mr, owner, name, size);
    mr->ram = true;
    mr->terminates = true;
    mr->destructor = memory_region_destructor_ram;
    mr->ram_block = qemu_ram_alloc(size, ram_flags, mr, &err);
    if (err) {
        mr->size = int128_zero();
        object_unparent(OBJECT(mr));
        error_propagate(errp, err);
    }
}
The RAMBlock is allocated and assigned to the current memory region through the qemu_ram_alloc function; note that the allocated block is stored in the region's ram_block member field. The memory itself is part of QEMU's own address space, i.e., an HVA in virtualization terms.
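Under the hood, a RAMBlock's host pointer is ordinary host virtual memory. As a rough sketch (the real path, qemu_ram_alloc through ram_block_add, also handles alignment, preallocation, hugepages, and file-backed memory), the allocation boils down to an anonymous mmap:

    /* Rough sketch of the allocation behind a RAMBlock's ->host pointer. */
    #include <stddef.h>
    #include <sys/mman.h>

    static void *alloc_guest_ram(size_t size)
    {
        void *host = mmap(NULL, size, PROT_READ | PROT_WRITE,
                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        return host == MAP_FAILED ? NULL : host; /* becomes RAMBlock.host (an HVA) */
    }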
Render flatview
Rendering the flatview requires invoking flatviews_init, generate_memory_topology, and address_space_set_flatview. The flatviews_reset function invokes the first two, flatviews_init and generate_memory_topology. After the flatviews are reset, memory_region_transaction_commit invokes the last one, address_space_set_flatview, to update the QEMU- and KVM-side data structures for the new memory region. Recall that this last function further invokes address_space_update_topology_pass, which calls the region_add callback of the listener.
void memory_region_transaction_commit(void)
{
    AddressSpace *as;

    assert(memory_region_transaction_depth);
    assert(qemu_mutex_iothread_locked());

    --memory_region_transaction_depth;
    if (!memory_region_transaction_depth) {
        if (memory_region_update_pending) {
            flatviews_reset();

            MEMORY_LISTENER_CALL_GLOBAL(begin, Forward);

            QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) {
                address_space_set_flatview(as);
                address_space_update_ioeventfds(as);
            }
            memory_region_update_pending = false;
            ioeventfd_update_pending = false;
            MEMORY_LISTENER_CALL_GLOBAL(commit, Forward);
        } else if (ioeventfd_update_pending) {
            QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) {
                address_space_update_ioeventfds(as);
            }
            ioeventfd_update_pending = false;
        }
    }
}
static void flatviews_reset(void)
{
    AddressSpace *as;

    if (flat_views) {
        g_hash_table_unref(flat_views);
        flat_views = NULL;
    }
    flatviews_init();

    /* Render unique FVs */
    QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) {
        MemoryRegion *physmr = memory_region_get_flatview_root(as->root);

        if (g_hash_table_lookup(flat_views, physmr)) {
            continue;
        }

        generate_memory_topology(physmr);
    }
}
Note that this function resets flat_views and regenerates the memory topology for the existing address spaces; generate_memory_topology is only called when a flatview for an address space's root does not already exist in the hash table.
To summarize the flow so far: QEMU creates a series of MemoryRegions that represent the RAM, ROM, and other areas of the guest, with the relationships between MemoryRegions maintained through aliases or subregions, and an AddressSpace represents the guest's physical address space. When a MemoryRegion in an AddressSpace changes, the registered listeners are triggered: the MemoryRegion tree is expanded into a one-dimensional FlatView, and the old and new FlatRanges are compared. For each changed range, the corresponding callback is invoked with a MemoryRegionSection; this updates the KVMSlot in QEMU and fills in a kvm_userspace_memory_region structure, which is passed as the ioctl() parameter to update the corresponding kvm_memory_slot inside KVM.
http://blog.vmsplice.net/2016/01/qemu-internals-how-guest-physical-ram.html
Register KVM memory listener to AddressSpace
Adding actual memory to the address space is done through the registered listeners. The registered KVM listener bridges QEMU and the KVM module, so that its operations such as region_add and region_del talk to the KVM module and register memory with the VMs managed by QEMU. Note that KVM manages the actual memory, but QEMU assigns that memory to the guest VM. When memory-related events happen on the address space the KVM listener is registered to, the listener delivers the event to the KVM module.
static int kvm_init(MachineState *ms)
{
    ......
    kvm_state = s;
    ret = kvm_arch_init(ms, s);
    ......
    kvm_memory_listener_register(s, &s->memory_listener,
                                 &address_space_memory, 0, "kvm-memory");
typedef struct KVMMemoryListener {
    MemoryListener listener;
    KVMSlot *slots;
    int as_id;
} KVMMemoryListener;
Note that memory_listener is a KVMMemoryListener-type member field of KVMState. The struct MemoryListener embedded in KVMMemoryListener provides the callback functions required for updates to the physical memory map.
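For reference, the generic MemoryListener interface (abridged from include/exec/memory.h) defines the callbacks a listener may implement; kvm_memory_listener_register below fills in the subset KVM needs.

    /* Abridged from include/exec/memory.h: the generic listener interface. */
    struct MemoryListener {
        void (*begin)(MemoryListener *listener);
        void (*commit)(MemoryListener *listener);
        void (*region_add)(MemoryListener *listener, MemoryRegionSection *section);
        void (*region_del)(MemoryListener *listener, MemoryRegionSection *section);
        void (*log_start)(MemoryListener *listener, MemoryRegionSection *section,
                          int old, int new);
        void (*log_stop)(MemoryListener *listener, MemoryRegionSection *section,
                         int old, int new);
        void (*log_sync)(MemoryListener *listener, MemoryRegionSection *section);
        void (*log_sync_global)(MemoryListener *listener);
        ......
        unsigned priority;
        const char *name;
        AddressSpace *address_space;
        QTAILQ_ENTRY(MemoryListener) link;
        QTAILQ_ENTRY(MemoryListener) link_as;
    };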
void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
                                  AddressSpace *as, int as_id, const char *name)
{
    int i;

    kml->slots = g_new0(KVMSlot, s->nr_slots);
    kml->as_id = as_id;

    for (i = 0; i < s->nr_slots; i++) {
        kml->slots[i].slot = i;
    }

    kml->listener.region_add = kvm_region_add;
    kml->listener.region_del = kvm_region_del;
    kml->listener.log_start = kvm_log_start;
    kml->listener.log_stop = kvm_log_stop;
    kml->listener.priority = 10;
    kml->listener.name = name;

    if (s->kvm_dirty_ring_size) {
        kml->listener.log_sync_global = kvm_log_sync_global;
    } else {
        kml->listener.log_sync = kvm_log_sync;
        kml->listener.log_clear = kvm_log_clear;
    }

    memory_listener_register(&kml->listener, as);

    for (i = 0; i < s->nr_as; ++i) {
        if (!s->as[i].as) {
            s->as[i].as = as;
            s->as[i].ml = kml;
            break;
        }
    }
}
The kvm_memory_listener_register function fills in the callback functions needed for updates to the physical memory map; note that it assigns the kvm_region_{add,del} functions as callbacks and then registers the listener with the address space.
void memory_listener_register(MemoryListener *listener, AddressSpace *as)
{
    MemoryListener *other = NULL;

    /* Only one of them can be defined for a listener */
    assert(!(listener->log_sync && listener->log_sync_global));

    listener->address_space = as;
    if (QTAILQ_EMPTY(&memory_listeners)
        || listener->priority >= QTAILQ_LAST(&memory_listeners)->priority) {
        QTAILQ_INSERT_TAIL(&memory_listeners, listener, link);
    } else {
        QTAILQ_FOREACH(other, &memory_listeners, link) {
            if (listener->priority < other->priority) {
                break;
            }
        }
        QTAILQ_INSERT_BEFORE(other, listener, link);
    }

    if (QTAILQ_EMPTY(&as->listeners)
        || listener->priority >= QTAILQ_LAST(&as->listeners)->priority) {
        QTAILQ_INSERT_TAIL(&as->listeners, listener, link_as);
    } else {
        QTAILQ_FOREACH(other, &as->listeners, link_as) {
            if (listener->priority < other->priority) {
                break;
            }
        }
        QTAILQ_INSERT_BEFORE(other, listener, link_as);
    }

    listener_add_address_space(listener, as);
}
static void listener_add_address_space(MemoryListener *listener,
                                       AddressSpace *as)
{
    FlatView *view;
    FlatRange *fr;

    if (listener->begin) {
        listener->begin(listener);
    }
    if (global_dirty_tracking) {
        if (listener->log_global_start) {
            listener->log_global_start(listener);
        }
    }

    view = address_space_get_flatview(as);
    FOR_EACH_FLAT_RANGE(fr, view) {
        MemoryRegionSection section = section_from_flat_range(fr, view);

        if (listener->region_add) {
            listener->region_add(listener, &section);
        }
        if (fr->dirty_log_mask && listener->log_start) {
            listener->log_start(listener, &section, 0, fr->dirty_log_mask);
        }
    }
    if (listener->commit) {
        listener->commit(listener);
    }
    flatview_unref(view);
}
Recall that this listener is added to address_space_memory during KVM initialization (kvm_init). At that moment address_space_memory is still empty, so the listener's region_add function is not yet invoked.
FlatView as a medium between QEMU and KVM
The AddressSpace root and its subtree together constitute the physical address space of the guest, but these are defined on the QEMU side. When passing them to KVM, the complex tree structure is not convenient for the kernel to process, so it needs to be converted into a "flat" address model, that is, a data structure that starts from zero and contains only address information. This is represented in QEMU by FlatView. Each AddressSpace has a corresponding FlatView pointer, current_map, indicating its flat view.
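For reference, the two structures making up the flat model look like this (abridged from softmmu/memory.c):

    /* Abridged from softmmu/memory.c. A FlatRange is one contiguous chunk of
     * a MemoryRegion mapped at an absolute address; a FlatView is the sorted,
     * non-overlapping list of all such chunks for one address space. */
    struct FlatRange {
        MemoryRegion *mr;
        hwaddr offset_in_region;
        AddrRange addr;
        uint8_t dirty_log_mask;
        bool romd_mode;
        bool readonly;
        bool nonvolatile;
    };

    struct FlatView {
        struct rcu_head rcu;
        unsigned ref;
        FlatRange *ranges;
        unsigned nr;
        unsigned nr_allocated;
        struct AddressSpaceDispatch *dispatch;
        MemoryRegion *root;
    };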
static void address_space_update_topology(AddressSpace *as)
{
    MemoryRegion *physmr = memory_region_get_flatview_root(as->root);

    flatviews_init();
    if (!g_hash_table_lookup(flat_views, physmr)) {
        generate_memory_topology(physmr);
    }
    address_space_set_flatview(as);
}
Recall that each AddressSpace has a root memory region associated with it; for example, address_space_memory has system_memory as its root memory region.
static MemoryRegion *memory_region_get_flatview_root(MemoryRegion *mr)
{
    while (mr->enabled) {
        if (mr->alias) {
            if (!mr->alias_offset && int128_ge(mr->size, mr->alias->size)) {
                /* The alias is included in its entirety.  Use it as
                 * the "real" root, so that we can share more FlatViews.
                 */
                mr = mr->alias;
                continue;
            }
        } else if (!mr->terminates) {
            unsigned int found = 0;
            MemoryRegion *child, *next = NULL;

            QTAILQ_FOREACH(child, &mr->subregions, subregions_link) {
                if (child->enabled) {
                    if (++found > 1) {
                        next = NULL;
                        break;
                    }
                    if (!child->addr && int128_ge(mr->size, child->size)) {
                        /* A child is included in its entirety.  If it's the only
                         * enabled one, use it in the hope of finding an alias down the
                         * way. This will also let us share FlatViews.
                         */
                        next = child;
                    }
                }
            }
            if (found == 0) {
                return NULL;
            }
            if (next) {
                mr = next;
                continue;
            }
        }

        return mr;
    }

    return NULL;
}
/* Render a memory topology into a list of disjoint absolute ranges. */
static FlatView *generate_memory_topology(MemoryRegion *mr)
{
    int i;
    FlatView *view;

    view = flatview_new(mr);

    if (mr) {
        render_memory_region(view, mr, int128_zero(),
                             addrrange_make(int128_zero(), int128_2_64()),
                             false, false);
    }
    flatview_simplify(view);

    view->dispatch = address_space_dispatch_new(view);
    for (i = 0; i < view->nr; i++) {
        MemoryRegionSection mrs =
            section_from_flat_range(&view->ranges[i], view);
        flatview_add_to_dispatch(view, &mrs);
    }
    address_space_dispatch_compact(view->dispatch);
    g_hash_table_replace(flat_views, mr, view);

    return view;
}
static void address_space_set_flatview(AddressSpace *as)
{
    FlatView *old_view = address_space_to_flatview(as);
    MemoryRegion *physmr = memory_region_get_flatview_root(as->root);
    FlatView *new_view = g_hash_table_lookup(flat_views, physmr);

    assert(new_view);

    if (old_view == new_view) {
        return;
    }

    if (old_view) {
        flatview_ref(old_view);
    }

    flatview_ref(new_view);

    if (!QTAILQ_EMPTY(&as->listeners)) {
        FlatView tmpview = { .nr = 0 }, *old_view2 = old_view;

        if (!old_view2) {
            old_view2 = &tmpview;
        }
        address_space_update_topology_pass(as, old_view2, new_view, false);
        address_space_update_topology_pass(as, old_view2, new_view, true);
    }

    /* Writes are protected by the BQL. */
    qatomic_rcu_set(&as->current_map, new_view);
    if (old_view) {
        flatview_unref(old_view);
    }

    /* Note that all the old MemoryRegions are still alive up to this
     * point.  This relieves most MemoryListeners from the need to
     * ref/unref the MemoryRegions they get---unless they use them
     * outside the iothread mutex, in which case precise reference
     * counting is necessary.
     */
    if (old_view) {
        flatview_unref(old_view);
    }
}
Each AddressSpace has its dedicated listeners (please refer to [[]]), so address_space_set_flatview invokes the address_space_update_topology_pass function for the old and new views.
static void address_space_update_topology_pass(AddressSpace *as,
                                               const FlatView *old_view,
                                               const FlatView *new_view,
                                               bool adding)
{
    unsigned iold, inew;
    FlatRange *frold, *frnew;
    ......
        } else {
            /* In new */

            if (adding) {
                MEMORY_LISTENER_UPDATE_REGION(frnew, as, Forward, region_add);
                flat_range_coalesced_io_add(frnew, as);
            }

            ++inew;
        }
    }
}
/* No need to ref/unref .mr, the FlatRange keeps it alive. */
#define MEMORY_LISTENER_UPDATE_REGION(fr, as, dir, callback, _args...)  \
    do {                                                                \
        MemoryRegionSection mrs = section_from_flat_range(fr,           \
                address_space_to_flatview(as));                         \
        MEMORY_LISTENER_CALL(as, callback, dir, &mrs, ##_args);         \
    } while(0)

#define MEMORY_LISTENER_CALL(_as, _callback, _direction, _section, _args...) \
    do {                                                                \
        MemoryListener *_listener;                                      \
                                                                        \
        switch (_direction) {                                           \
        case Forward:                                                   \
            QTAILQ_FOREACH(_listener, &(_as)->listeners, link_as) {     \
                if (_listener->_callback) {                             \
                    _listener->_callback(_listener, _section, ##_args); \
                }                                                       \
            }                                                           \
            break;                                                      \
        case Reverse:                                                   \
            QTAILQ_FOREACH_REVERSE(_listener, &(_as)->listeners, link_as) { \
                if (_listener->_callback) {                             \
                    _listener->_callback(_listener, _section, ##_args); \
                }                                                       \
            }                                                           \
            break;                                                      \
        default:                                                        \
            abort();                                                    \
        }                                                               \
    } while (0)