dev-amdgpu,mem-ruby: Add support to checkpoint and restore between kernels in GPUFS (#377)

Previously, GPU checkpointing worked only if the checkpoint was created
before the first kernel execution. This pull request adds support for
checkpointing between any two kernel calls. It does so by doing the
following:

- Adds flush support in the GPU_VIPER protocol
- Adds flush support in the GPUCoalescer
- Updates the cache recorder to use the GPUCoalescer during simulation
  cooldown and cache warmup times.
This commit is contained in:
Matt Sinclair
2023-10-10 09:41:21 -05:00
committed by GitHub
14 changed files with 381 additions and 38 deletions

View File

@@ -1044,6 +1044,7 @@ PM4PacketProcessor::serialize(CheckpointOut &cp) const
int num_queues = queues.size();
Addr id[num_queues];
Addr mqd_base[num_queues];
uint64_t mqd_read_index[num_queues];
Addr base[num_queues];
Addr rptr[num_queues];
Addr wptr[num_queues];
@@ -1060,6 +1061,7 @@ PM4PacketProcessor::serialize(CheckpointOut &cp) const
uint32_t hqd_active[num_queues];
uint32_t hqd_vmid[num_queues];
Addr aql_rptr[num_queues];
uint32_t aql[num_queues];
uint32_t doorbell[num_queues];
uint32_t hqd_pq_control[num_queues];
@@ -1068,9 +1070,10 @@ PM4PacketProcessor::serialize(CheckpointOut &cp) const
PM4Queue *q = iter.second;
id[i] = q->id();
mqd_base[i] = q->mqdBase();
mqd_read_index[i] = q->getMQD()->mqdReadIndex;
bool cur_state = q->ib();
q->ib(false);
base[i] = q->base() >> 8;
base[i] = q->base();
rptr[i] = q->getRptr();
wptr[i] = q->getWptr();
q->ib(true);
@@ -1088,6 +1091,7 @@ PM4PacketProcessor::serialize(CheckpointOut &cp) const
hqd_active[i] = q->getMQD()->hqd_active;
hqd_vmid[i] = q->getMQD()->hqd_vmid;
aql_rptr[i] = q->getMQD()->aqlRptr;
aql[i] = q->getMQD()->aql;
doorbell[i] = q->getMQD()->doorbell;
hqd_pq_control[i] = q->getMQD()->hqd_pq_control;
i++;
@@ -1096,6 +1100,7 @@ PM4PacketProcessor::serialize(CheckpointOut &cp) const
SERIALIZE_SCALAR(num_queues);
SERIALIZE_ARRAY(id, num_queues);
SERIALIZE_ARRAY(mqd_base, num_queues);
SERIALIZE_ARRAY(mqd_read_index, num_queues);
SERIALIZE_ARRAY(base, num_queues);
SERIALIZE_ARRAY(rptr, num_queues);
SERIALIZE_ARRAY(wptr, num_queues);
@@ -1112,6 +1117,7 @@ PM4PacketProcessor::serialize(CheckpointOut &cp) const
SERIALIZE_ARRAY(hqd_active, num_queues);
SERIALIZE_ARRAY(hqd_vmid, num_queues);
SERIALIZE_ARRAY(aql_rptr, num_queues);
SERIALIZE_ARRAY(aql, num_queues);
SERIALIZE_ARRAY(doorbell, num_queues);
SERIALIZE_ARRAY(hqd_pq_control, num_queues);
}
@@ -1127,6 +1133,7 @@ PM4PacketProcessor::unserialize(CheckpointIn &cp)
Addr id[num_queues];
Addr mqd_base[num_queues];
uint64_t mqd_read_index[num_queues];
Addr base[num_queues];
Addr rptr[num_queues];
Addr wptr[num_queues];
@@ -1143,11 +1150,13 @@ PM4PacketProcessor::unserialize(CheckpointIn &cp)
uint32_t hqd_active[num_queues];
uint32_t hqd_vmid[num_queues];
Addr aql_rptr[num_queues];
uint32_t aql[num_queues];
uint32_t doorbell[num_queues];
uint32_t hqd_pq_control[num_queues];
UNSERIALIZE_ARRAY(id, num_queues);
UNSERIALIZE_ARRAY(mqd_base, num_queues);
UNSERIALIZE_ARRAY(mqd_read_index, num_queues);
UNSERIALIZE_ARRAY(base, num_queues);
UNSERIALIZE_ARRAY(rptr, num_queues);
UNSERIALIZE_ARRAY(wptr, num_queues);
@@ -1164,6 +1173,7 @@ PM4PacketProcessor::unserialize(CheckpointIn &cp)
UNSERIALIZE_ARRAY(hqd_active, num_queues);
UNSERIALIZE_ARRAY(hqd_vmid, num_queues);
UNSERIALIZE_ARRAY(aql_rptr, num_queues);
UNSERIALIZE_ARRAY(aql, num_queues);
UNSERIALIZE_ARRAY(doorbell, num_queues);
UNSERIALIZE_ARRAY(hqd_pq_control, num_queues);
@@ -1172,22 +1182,24 @@ PM4PacketProcessor::unserialize(CheckpointIn &cp)
memset(mqd, 0, sizeof(QueueDesc));
mqd->mqdBase = mqd_base[i] >> 8;
mqd->base = base[i];
mqd->rptr = rptr[i];
mqd->ibBase = ib_base[i];
mqd->ibRptr = ib_rptr[i];
mqd->mqdReadIndex = mqd_read_index[i];
mqd->base = base[i] >> 8;
mqd->aql = aql[i];
PM4MapQueues* pkt = new PM4MapQueues;
memset(pkt, 0, sizeof(PM4MapQueues));
newQueue(mqd, offset[i], pkt, id[i]);
queues[id[i]]->ib(false);
queues[id[i]]->wptr(wptr[i]);
queues[id[i]]->ib(true);
queues[id[i]]->wptr(ib_wptr[i]);
if (ib[i]) {
queues[id[i]]->wptr(ib_wptr[i]);
queues[id[i]]->rptr(ib_rptr[i]);
} else {
queues[id[i]]->rptr(rptr[i]);
queues[id[i]]->wptr(wptr[i]);
}
queues[id[i]]->ib(ib[i]);
queues[id[i]]->offset(offset[i]);
queues[id[i]]->processing(processing[i]);
queues[id[i]]->ib(ib[i]);
queues[id[i]]->setPkt(me[i], pipe[i], queue[i], privileged[i]);
queues[id[i]]->getMQD()->hqd_active = hqd_active[i];
queues[id[i]]->getMQD()->hqd_vmid = hqd_vmid[i];
@@ -1195,6 +1207,14 @@ PM4PacketProcessor::unserialize(CheckpointIn &cp)
queues[id[i]]->getMQD()->doorbell = doorbell[i];
queues[id[i]]->getMQD()->hqd_pq_control = hqd_pq_control[i];
if (mqd->aql) {
int mqd_size = (1 << ((hqd_pq_control[i] & 0x3f) + 1)) * 4;
auto &hsa_pp = gpuDevice->CP()->hsaPacketProc();
hsa_pp.setDeviceQueueDesc(aql_rptr[i], base[i], id[i],
mqd_size, 8, GfxVersion::gfx900, offset[i],
mqd_read_index[i]);
}
DPRINTF(PM4PacketProcessor, "PM4 queue %d, rptr: %p wptr: %p\n",
queues[id[i]]->id(), queues[id[i]]->rptr(),
queues[id[i]]->wptr());