内存热插流程

1. qemu内存参数

qemu的内存参数列表(如下)中增加了slots和maxmem两个参数的解析,它们分别代表内存槽的数量和最大内存,而size则表示当前(初始)内存。maxmem是内存的上限,slots则代表热插内存时可用槽位的上限。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
/*
 * Option schema for the "memory" option group.
 * "size" is the implied key (a bare value is taken as the size); "slots"
 * and "maxmem" are the two additional keys accepted alongside it.
 */
static QemuOptsList qemu_mem_opts = {
    .name             = "memory",
    .implied_opt_name = "size",
    .head             = QTAILQ_HEAD_INITIALIZER(qemu_mem_opts.head),
    .merge_lists      = true,
    .desc             = {
        { .name = "size",   .type = QEMU_OPT_SIZE   },
        { .name = "slots",  .type = QEMU_OPT_NUMBER },
        { .name = "maxmem", .type = QEMU_OPT_SIZE   },
        { /* end of list */ }
    },
};

之后在main函数中解析memory命令行参数时,会解析slots和maxmem参数,并设置到current_machine中

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
int main(int argc, char **argv, char **envp)
{
...
ram_addr_t maxram_size = default_ram_size;
uint64_t ram_slots = 0;
...
case QEMU_OPTION_m: {
uint64_t sz;
const char *mem_str;
const char *maxmem_str, *slots_str;
...
maxmem_str = qemu_opt_get(opts, "maxmem");
slots_str = qemu_opt_get(opts, "slots");
if (maxmem_str && slots_str) {
uint64_t slots;

sz = qemu_opt_get_size(opts, "maxmem", 0);
if (sz < ram_size) {
fprintf(stderr, "qemu: invalid -m option value: maxmem "
"(%" PRIu64 ") <= initial memory (%"
PRIu64 ")\n", sz, ram_size);
exit(EXIT_FAILURE);
}

slots = qemu_opt_get_number(opts, "slots", 0);
if ((sz > ram_size) && !slots) {
fprintf(stderr, "qemu: invalid -m option value: maxmem "
"(%" PRIu64 ") more than initial memory (%"
PRIu64 ") but no hotplug slots where "
"specified\n", sz, ram_size);
exit(EXIT_FAILURE);
}

if ((sz <= ram_size) && slots) {
fprintf(stderr, "qemu: invalid -m option value: %"
PRIu64 " hotplug slots where specified but "
"maxmem (%" PRIu64 ") <= initial memory (%"
PRIu64 ")\n", slots, sz, ram_size);
exit(EXIT_FAILURE);
}
maxram_size = sz;
ram_slots = slots;
} else if ((!maxmem_str && slots_str) ||
(maxmem_str && !slots_str)) {
fprintf(stderr, "qemu: invalid -m option value: missing "
"'%s' option\n", slots_str ? "maxmem" : "slots");
exit(EXIT_FAILURE);
}
...
current_machine->maxram_size = maxram_size;
current_machine->ram_slots = ram_slots;

2. 热插内存接口

  1. qemu热插内存时需要两步
    • “object_add”: creates a memory backend object
    • “device_add”: creates a front-end pc-dimm device and inserts it into the first empty slot
  2. 举例
    • (qemu) object_add memory-backend-ram,id=mem1,size=1G
    • (qemu) device_add pc-dimm,id=dimm1,memdev=mem1
  3. object_add接口分析
    object_add命令用于创建新的object,示例中创建的object是memory-backend-ram(参考hostmem-ram.c)。monitor_init_qmp_commands中注册了object_add命令对应的执行函数qmp_object_add(如下):qmp_object_add会先获取qom-type(上面示例中为memory-backend-ram),之后获取id(mem1),然后获取属性(size=1G)
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    /*
     * QMP handler for object-add.
     *
     * Pulls "qom-type" and "id" out of @qdict (both mandatory), folds an
     * optional nested "props" dictionary into the top level, then hands the
     * remaining keys to user_creatable_add_type() as object properties.
     * *@ret_data receives a fresh empty dictionary once creation has been
     * attempted; the early error returns leave it untouched.
     */
    void qmp_object_add(QDict *qdict, QObject **ret_data, Error **errp)
    {
        g_autofree char *qom_type = NULL;
        g_autofree char *obj_id = NULL;
        QObject *props_obj;
        Visitor *visitor;
        Object *created;

        /* e.g. qom-type = memory-backend-ram */
        qom_type = g_strdup(qdict_get_try_str(qdict, "qom-type"));
        if (qom_type == NULL) {
            error_setg(errp, QERR_MISSING_PARAMETER, "qom-type");
            return;
        }
        qdict_del(qdict, "qom-type");

        obj_id = g_strdup(qdict_get_try_str(qdict, "id"));
        if (obj_id == NULL) {
            error_setg(errp, QERR_MISSING_PARAMETER, "id");
            return;
        }
        qdict_del(qdict, "id");

        props_obj = qdict_get(qdict, "props");
        if (props_obj != NULL) {
            QDict *nested = qobject_to(QDict, props_obj);
            bool conflict;

            if (nested == NULL) {
                error_setg(errp, QERR_INVALID_PARAMETER_TYPE, "props", "dict");
                return;
            }

            /* Keep the nested dict alive across its removal from qdict. */
            qobject_ref(nested);
            qdict_del(qdict, "props");
            qdict_join(qdict, nested, false);

            /* Anything still left in the nested dict is a conflict. */
            conflict = qdict_size(nested) != 0;
            qobject_unref(nested);
            if (conflict) {
                error_setg(errp, "Option in 'props' conflicts with top level");
                return;
            }
        }

        visitor = qobject_input_visitor_new(QOBJECT(qdict));
        created = user_creatable_add_type(qom_type, obj_id, qdict, visitor,
                                          errp);
        visit_free(visitor);
        if (created != NULL) {
            object_unref(created);
        }

        *ret_data = QOBJECT(qdict_new());
    }

qobject_input_visitor_new用于封装属性参数,以便object对象设置属性的时候可以转换属性值的类型,user_creatable_add_type是真正创建对象的地方:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
/*
 * Create a user-creatable object of QOM type @type, set one property per
 * key of @qdict through visitor @v, attach the object under the objects
 * root as child @id (when @id is non-NULL) and run its complete() hook.
 *
 * Returns the new object on success. On failure returns NULL with @errp
 * set; an object that was already instantiated is unreferenced first.
 */
Object *user_creatable_add_type(const char *type, const char *id,
const QDict *qdict,
Visitor *v, Error **errp)
{
Object *obj;
ObjectClass *klass;
const QDictEntry *e;
Error *local_err = NULL;

klass = object_class_by_name(type);
if (!klass) {
error_setg(errp, "invalid object type: %s", type);
return NULL;
}
/* The class must implement the TYPE_USER_CREATABLE interface */
if (!object_class_dynamic_cast(klass, TYPE_USER_CREATABLE)) {
error_setg(errp, "object type '%s' isn't supported by object-add",
type);
return NULL;
}

if (object_class_is_abstract(klass)) {
error_setg(errp, "object type '%s' is abstract", type);
return NULL;
}

assert(qdict);
/* Instantiate an object of the requested type */
obj = object_new(type);
visit_start_struct(v, NULL, NULL, 0, &local_err);
if (local_err) {
/* The struct visit never started, so visit_end_struct() is skipped */
goto out;
}
/* Apply every key of the object_add arguments as a property of the object */
for (e = qdict_first(qdict); e; e = qdict_next(qdict, e)) {
/* For the "size" property of memory-backend-ram this is expected to end
 * up in host_memory_backend_set_size (setter not shown here) */
object_property_set(obj, v, e->key, &local_err);
if (local_err) {
break;
}
}
if (!local_err) {
visit_check_struct(v, &local_err);
}
/* Always close the struct visit once it was started, even after an error */
visit_end_struct(v, NULL);
if (local_err) {
goto out;
}

if (id != NULL) {
/* Attach the freshly created object as a child property of the objects
 * root */
object_property_add_child(object_get_objects_root(),
id, obj, &local_err);
if (local_err) {
goto out;
}
}
/* Run the complete() callback; for memory-backend-ram this is expected to be
 * host_memory_backend_memory_complete (registered elsewhere, not shown) */
user_creatable_complete(USER_CREATABLE(obj), &local_err);
if (local_err) {
/* complete() failed: undo the child registration done above */
if (id != NULL) {
object_property_del(object_get_objects_root(),
id, &error_abort);
}
goto out;
}
out:
if (local_err) {
error_propagate(errp, local_err);
object_unref(obj);
return NULL;
}
return obj;
}

memory-backend-ram的父object为memory-backend,因此memory-backend-ram继承了父object的UserCreatableClass。上面user_creatable_complete中的complete最终调用的是memory-backend在class_init中注册的回调函数host_memory_backend_memory_complete,此函数负责内存申请,并为申请到的内存设置numa策略:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
/*
 * complete() hook of the host memory backend.
 *
 * When the backend class provides an alloc callback: allocate the backing
 * memory, apply the merge/dump madvise hints, bind the memory to host NUMA
 * nodes (CONFIG_NUMA builds only) and finally preallocate it if requested.
 * Failures are reported through @errp.
 */
static void
host_memory_backend_memory_complete(UserCreatable *uc, Error **errp)
{
HostMemoryBackend *backend = MEMORY_BACKEND(uc);
HostMemoryBackendClass *bc = MEMORY_BACKEND_GET_CLASS(uc);
Error *local_err = NULL;
void *ptr;
uint64_t sz;
/* For memory-backend-ram, class_init is expected to set alloc to
 * ram_backend_memory_alloc (not shown here), which creates a memory region
 * and allocates backend->size bytes for it; that size is the value stored
 * earlier by host_memory_backend_set_size. */
if (bc->alloc) {
bc->alloc(backend, &local_err);
if (local_err) {
goto out;
}
/* Start address of the backing memory */
ptr = memory_region_get_ram_ptr(&backend->mr);
/* Size of the memory region (1G in the object_add example) */
sz = memory_region_size(&backend->mr);

if (backend->merge) {
qemu_madvise(ptr, sz, QEMU_MADV_MERGEABLE);
}
if (!backend->dump) {
qemu_madvise(ptr, sz, QEMU_MADV_DONTDUMP);
}
#ifdef CONFIG_NUMA /* with NUMA support, mbind() the memory to host nodes according to the configured policy */
unsigned long lastbit = find_last_bit(backend->host_nodes, MAX_NODES);
/* lastbit == MAX_NODES means maxnode = 0 */
unsigned long maxnode = (lastbit + 1) % (MAX_NODES + 1);
/* ensure policy won't be ignored in case memory is preallocated
* before mbind(). note: MPOL_MF_STRICT is ignored on hugepages so
* this doesn't catch hugepage case. */
unsigned flags = MPOL_MF_STRICT | MPOL_MF_MOVE;

/* check for invalid host-nodes and policies and give more verbose
* error messages than mbind(). */
/* These checks set errp directly and return; local_err is still NULL
 * here, so bypassing the out: label loses nothing. */
if (maxnode && backend->policy == MPOL_DEFAULT) {
error_setg(errp, "host-nodes must be empty for policy default,"
" or you should explicitly specify a policy other"
" than default");
return;
} else if (maxnode == 0 && backend->policy != MPOL_DEFAULT) {
error_setg(errp, "host-nodes must be set for policy %s",
HostMemPolicy_str(backend->policy));
return;
}

/* We can have up to MAX_NODES nodes, but we need to pass maxnode+1
* as argument to mbind() due to an old Linux bug (feature?) which
* cuts off the last specified node. This means backend->host_nodes
* must have MAX_NODES+1 bits available.
*/
assert(sizeof(backend->host_nodes) >=
BITS_TO_LONGS(MAX_NODES + 1) * sizeof(unsigned long));
assert(maxnode <= MAX_NODES);
if (mbind(ptr, sz, backend->policy,
maxnode ? backend->host_nodes : NULL, maxnode + 1, flags)) {
/* An ENOSYS failure under the default policy is deliberately ignored */
if (backend->policy != MPOL_DEFAULT || errno != ENOSYS) {
error_setg_errno(errp, errno,
"cannot bind memory to host NUMA nodes");
return;
}
}
#endif
/* Preallocate memory after the NUMA policy has been instantiated.
* This is necessary to guarantee memory is allocated with
* specified NUMA policy in place.
*/
/* Preallocation happens only when backend->prealloc is set, i.e. when the
 * object-add command passed prealloc=on */
if (backend->prealloc) {
/* os_mem_prealloc is described as simple: it starts several threads
 * (backend->prealloc_threads is passed below) that touch the freshly
 * allocated range so page faults allocate the physical memory.
 * NOTE(review): implementation not shown here - confirm. */
os_mem_prealloc(memory_region_get_fd(&backend->mr), ptr, sz,
backend->prealloc_threads, &local_err);
if (local_err) {
goto out;
}
}
}
out:
error_propagate(errp, local_err);
}
  4. device_add接口分析
    device_add命令是热插设备的接口,内存热插时指定设备类型为pc-dimm即为热插一个内存条,热插设备调用的接口是qmp_device_add:
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    /*
     * QMP handler for device_add.
     *
     * Converts @qdict into a "device" QemuOpts set and passes it to
     * qdev_device_add(). On success only the local reference to the new
     * device is dropped; when no device comes back, or when help output was
     * requested from a non-QMP monitor, the opts are deleted here.
     * @ret_data is not written by this function.
     */
    void qmp_device_add(QDict *qdict, QObject **ret_data, Error **errp)
    {
        Error *err = NULL;
        QemuOpts *dev_opts;
        DeviceState *created;
        bool help_shown;

        dev_opts = qemu_opts_from_qdict(qemu_find_opts("device"), qdict, &err);
        if (err != NULL) {
            error_propagate(errp, err);
            return;
        }

        /* Help is only considered when the current monitor is not QMP. */
        help_shown = !monitor_cur_is_qmp() && qdev_device_help(dev_opts);
        if (help_shown) {
            qemu_opts_del(dev_opts);
            return;
        }

        /* Hot-plug the device described by the options. */
        created = qdev_device_add(dev_opts, &err);
        if (created != NULL) {
            object_unref(OBJECT(created));
            return;
        }

        error_propagate(errp, err);
        qemu_opts_del(dev_opts);
    }

qdev_device_add是执行创建设备的函数:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
/*
 * Create and realize the device described by @opts.
 *
 * Steps: look up the "driver" option and its device class, locate a
 * suitable bus (explicit "bus" option, or a search by the class's bus
 * type), run the hotplug and migration checks, instantiate the device,
 * apply the remaining options as properties and set "realized" to true.
 *
 * Returns the device on success, in which case it keeps @opts in
 * dev->opts. Returns NULL on failure, normally with @errp set.
 */
DeviceState *qdev_device_add(QemuOpts *opts, Error **errp)
{
DeviceClass *dc;
const char *driver, *path;
DeviceState *dev = NULL;
BusState *bus = NULL;
Error *err = NULL;
bool hide;
/* Look up the "driver" option naming the device type */
driver = qemu_opt_get(opts, "driver");
if (!driver) {
error_setg(errp, QERR_MISSING_PARAMETER, "driver");
return NULL;
}

/* find driver */
dc = qdev_get_device_class(&driver, errp);
if (!dc) {
return NULL;
}

/* find bus */
path = qemu_opt_get(opts, "bus");
if (path != NULL) {
bus = qbus_find(path, errp);
if (!bus) {
return NULL;
}
if (!object_dynamic_cast(OBJECT(bus), dc->bus_type)) {
error_setg(errp, "Device '%s' can't go on %s bus",
driver, object_get_typename(OBJECT(bus)));
return NULL;
}
} else if (dc->bus_type != NULL) {
bus = qbus_find_recursive(sysbus_get_default(), NULL, dc->bus_type);
if (!bus || qbus_is_full(bus)) {
error_setg(errp, "No '%s' bus found for device '%s'",
dc->bus_type, driver);
return NULL;
}
}
hide = should_hide_device(opts);

if ((hide || qdev_hotplug) && bus && !qbus_is_hotpluggable(bus)) {
error_setg(errp, QERR_BUS_NO_HOTPLUG, bus->name);
return NULL;
}

/* NOTE(review): this path returns NULL without setting errp - callers
 * must tolerate a NULL result with no error; confirm this is intended. */
if (hide) {
return NULL;
}

if (!migration_is_idle()) {
error_setg(errp, "device_add not allowed while migrating");
return NULL;
}

/* create the device (pc-dimm in the memory hotplug example) */
dev = DEVICE(object_new(driver));

/* Check whether the hotplug is allowed by the machine */
if (qdev_hotplug && !qdev_hotplug_allowed(dev, &err)) {
/* Error must be set in the machine hook */
assert(err);
goto err_del_dev;
}

if (bus) {
qdev_set_parent_bus(dev, bus);
} else if (qdev_hotplug && !qdev_get_machine_hotplug_handler(dev)) {
/* No bus, no machine hotplug handler --> device is not hotpluggable */
error_setg(&err, "Device '%s' can not be hotplugged on this machine",
driver);
goto err_del_dev;
}
/* Set the device id (dimm1 in the example) */
qdev_set_id(dev, qemu_opts_id(opts));
/* pc-dimm has a link property "memdev": the object named by memdev on the
 * command line is converted to a HostMemoryBackend and stored in
 * PCDIMMDevice->hostmem. set_property (expected to end in
 * object_set_link_property, not shown here) makes that pointer refer to
 * the object named mem1. */
if (qemu_opt_foreach(opts, set_property, dev, &err)) {
goto err_del_dev;
}

dev->opts = opts;
/* Setting the "realized" property to true is what triggers
 * device_set_realized (the property setter, defined elsewhere) */
object_property_set_bool(OBJECT(dev), true, "realized", &err);
if (err != NULL) {
/* Drop the opts pointer again so the failed device does not keep it */
dev->opts = NULL;
goto err_del_dev;
}
return dev;
err_del_dev:
/* Common failure exit: report the error, then detach and release the device */
error_propagate(errp, err);
if (dev) {
object_unparent(OBJECT(dev));
object_unref(OBJECT(dev));
}
return NULL;
}

device_set_realized是真正进行内存热插拔的函数:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
if (value && !dev->realized) {
if (!check_only_migratable(obj, &local_err)) {
goto fail;
}

if (!obj->parent) {
gchar *name = g_strdup_printf("device[%d]", unattached_count++);

object_property_add_child(container_get(qdev_get_machine(),
"/unattached"),
name, obj, &error_abort);
unattached_parent = true;
g_free(name);
}
/*这里调用qdev_get_hotplug_handler回调得到HotplugHandler指针,以pc机器为例,mc->get_hotplug_handler回调函数设置为pc_get_hotplug_handler,调用
pc_get_hotplug_handler返回TYPE_HOTPLUG_HANDLER类型的object*/
hotplug_ctrl = qdev_get_hotplug_handler(dev);
if (hotplug_ctrl) {
/*调用pc_dimm_pre_plug分配slot并设置到pc-dimm设备的slot属性
上,同时调用get_memory_region获取mem1对应的内存地址,并设置
到pc-dimm设备的addr属性上*/
hotplug_handler_pre_plug(hotplug_ctrl, dev, &local_err);
if (local_err != NULL) {
goto fail;
}
}

if (dc->realize) {
/*调用pc_dimm_realize*/
dc->realize(dev, &local_err);
if (local_err != NULL) {
goto fail;
}
}

DEVICE_LISTENER_CALL(realize, Forward, dev);

/*
* always free/re-initialize here since the value cannot be cleaned up
* in device_unrealize due to its usage later on in the unplug path
*/
g_free(dev->canonical_path);
dev->canonical_path = object_get_canonical_path(OBJECT(dev));
/*将设备注册到vmstate上*/
if (qdev_get_vmsd(dev)) {
if (vmstate_register_with_alias_id(VMSTATE_IF(dev),
VMSTATE_INSTANCE_ID_ANY,
qdev_get_vmsd(dev), dev,
dev->instance_id_alias,
dev->alias_required_for_version,
&local_err) < 0) {
goto post_realize_fail;
}
}

/*
* Clear the reset state, in case the object was previously unrealized
* with a dirty state.
*/
resettable_state_clear(&dev->reset);

QLIST_FOREACH(bus, &dev->child_bus, sibling) {
object_property_set_bool(OBJECT(bus), true, "realized",
&local_err);
if (local_err != NULL) {
goto child_realize_fail;
}
}
if (dev->hotplugged) {
/*
* Reset the device, as well as its subtree which, at this point,
* should be realized too.
*/
resettable_assert_reset(OBJECT(dev), RESET_TYPE_COLD);
resettable_change_parent(OBJECT(dev), OBJECT(dev->parent_bus),
NULL);
resettable_release_reset(OBJECT(dev), RESET_TYPE_COLD);
}
dev->pending_deleted_event = false;

if (hotplug_ctrl) {
/*调用pc-dimm-plug将memory_region加到subregion,同时调用
acpi hotplug回调函数acpi_memory_plug_cb发送irq通知虚拟机*/
hotplug_handler_plug(hotplug_ctrl, dev, &local_err);
if (local_err != NULL) {
goto child_realize_fail;
}
}

}