dev-amdgpu: Add VMID map to checkpoint

When restoring checkpoints for certain applications, gem5 tries to
create new doorbells with a pre-existing queue ID and simulation crashes
shortly after. This commit checkpoints the existing VMID map so that any
new doorbells after restoration use a unique queue ID

Change-Id: I9bf89a2769db26ceab4441634ff2da936eea6d6f
This commit is contained in:
Vishnu Ramadas
2023-11-16 17:12:01 -06:00
parent d19d6fc31e
commit 06161ded8c
2 changed files with 61 additions and 2 deletions

View File

@@ -678,10 +678,13 @@ AMDGPUDevice::serialize(CheckpointOut &cp) const
uint64_t regs_size = regs.size();
uint64_t doorbells_size = doorbells.size();
uint64_t sdma_engs_size = sdmaEngs.size();
uint64_t used_vmid_map_size = usedVMIDs.size();
SERIALIZE_SCALAR(regs_size);
SERIALIZE_SCALAR(doorbells_size);
SERIALIZE_SCALAR(sdma_engs_size);
// Save the number of vmids used
SERIALIZE_SCALAR(used_vmid_map_size);
// Make a c-style array of the regs to serialize
uint32_t reg_addrs[regs_size];
@@ -690,6 +693,9 @@ AMDGPUDevice::serialize(CheckpointOut &cp) const
QueueType doorbells_queues[doorbells_size];
uint32_t sdma_engs_offset[sdma_engs_size];
int sdma_engs[sdma_engs_size];
int used_vmids[used_vmid_map_size];
int used_queue_id_sizes[used_vmid_map_size];
std::vector<int> used_vmid_sets;
int idx = 0;
for (auto & it : regs) {
@@ -712,6 +718,20 @@ AMDGPUDevice::serialize(CheckpointOut &cp) const
++idx;
}
idx = 0;
for (auto & it : usedVMIDs) {
used_vmids[idx] = it.first;
used_queue_id_sizes[idx] = it.second.size();
std::vector<int> set_vector(it.second.begin(), it.second.end());
used_vmid_sets.insert(used_vmid_sets.end(),
set_vector.begin(), set_vector.end());
++idx;
}
int num_queue_id = used_vmid_sets.size();
int* vmid_array = new int[num_queue_id];
std::copy(used_vmid_sets.begin(), used_vmid_sets.end(), vmid_array);
SERIALIZE_ARRAY(reg_addrs, sizeof(reg_addrs)/sizeof(reg_addrs[0]));
SERIALIZE_ARRAY(reg_values, sizeof(reg_values)/sizeof(reg_values[0]));
SERIALIZE_ARRAY(doorbells_offset, sizeof(doorbells_offset)/
@@ -721,10 +741,21 @@ AMDGPUDevice::serialize(CheckpointOut &cp) const
SERIALIZE_ARRAY(sdma_engs_offset, sizeof(sdma_engs_offset)/
sizeof(sdma_engs_offset[0]));
SERIALIZE_ARRAY(sdma_engs, sizeof(sdma_engs)/sizeof(sdma_engs[0]));
// Save the vmids used in an array
SERIALIZE_ARRAY(used_vmids, sizeof(used_vmids)/sizeof(used_vmids[0]));
// Save the size of the set of queue ids mapped to each vmid
SERIALIZE_ARRAY(used_queue_id_sizes,
sizeof(used_queue_id_sizes)/sizeof(used_queue_id_sizes[0]));
// Save all the queue ids used for all the vmids
SERIALIZE_ARRAY(vmid_array, num_queue_id);
// Save the total number of queue idsused
SERIALIZE_SCALAR(num_queue_id);
// Serialize the device memory
deviceMem.serializeSection(cp, "deviceMem");
gpuvm.serializeSection(cp, "GPUVM");
delete[] vmid_array;
}
void
@@ -736,10 +767,13 @@ AMDGPUDevice::unserialize(CheckpointIn &cp)
uint64_t regs_size = 0;
uint64_t doorbells_size = 0;
uint64_t sdma_engs_size = 0;
uint64_t used_vmid_map_size = 0;
UNSERIALIZE_SCALAR(regs_size);
UNSERIALIZE_SCALAR(doorbells_size);
UNSERIALIZE_SCALAR(sdma_engs_size);
UNSERIALIZE_SCALAR(used_vmid_map_size);
if (regs_size > 0) {
uint32_t reg_addrs[regs_size];
@@ -786,6 +820,33 @@ AMDGPUDevice::unserialize(CheckpointIn &cp)
}
}
if (used_vmid_map_size > 0) {
int used_vmids[used_vmid_map_size];
int used_queue_id_sizes[used_vmid_map_size];
int num_queue_id = 0;
std::vector<int> used_vmid_sets;
// Extract the total number of queue ids used
UNSERIALIZE_SCALAR(num_queue_id);
int* vmid_array = new int[num_queue_id];
// Extract the number of vmids used
UNSERIALIZE_ARRAY(used_vmids, used_vmid_map_size);
// Extract the size of the queue id set for each vmid
UNSERIALIZE_ARRAY(used_queue_id_sizes, used_vmid_map_size);
// Extract all the queue ids used
UNSERIALIZE_ARRAY(vmid_array, num_queue_id);
// Populate the usedVMIDs map with the queue ids per vm
int idx = 0;
for (int it = 0; it < used_vmid_map_size; it++) {
int vmid = used_vmids[it];
int vmid_set_size = used_queue_id_sizes[it];
for (int j = 0; j < vmid_set_size; j++) {
usedVMIDs[vmid].insert(vmid_array[idx + j]);
}
idx += vmid_set_size;
}
delete[] vmid_array;
}
// Unserialize the device memory
deviceMem.unserializeSection(cp, "deviceMem");
gpuvm.unserializeSection(cp, "GPUVM");