From 6a4b2bb0965a28d1428a60b867318e23511dd168 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Mon, 11 Sep 2023 09:22:26 -0500 Subject: [PATCH] dev-hsa,gpu-compute: Add timestamps to AMD HSA signals The AMD-specific HSA signal contains start/end timestamps for dispatch packet completion signals. These are currently always zero. These timestamp values are used for profiling in the ROCr runtime. Unfortunately, the GpuAgent::TranslateTime method in ROCr does not check for zero values before dividing, causing applications that use profiling to crash with SIGFPE. Profiling is used via hipEvents in the HACC application, so these should be supported in gem5. In order to handle writing the timestamp values, we need to DMA the values to memory before writing the completion signal. This changes the flow of the async completion signal write to be (1) read mailbox pointer (2) if valid, read the event data, otherwise skip to 4 (3) write mailbox data if pointer is valid (4) write timestamp values (5) write completion signal. The application will process the timestamp data as soon as the completion signal is received, so we need this ordering to ensure the DMA for the timestamps has completed before the completion signal is written. HACC now runs to completion on GPUFS and has the same output as hardware. 
Change-Id: I09877cdff901d1402140f2c3bafea7605fa6554e --- src/dev/hsa/hsa_signal.hh | 6 ++ src/gpu-compute/gpu_command_processor.cc | 88 +++++++++++++++++------- src/gpu-compute/gpu_command_processor.hh | 4 ++ 3 files changed, 74 insertions(+), 24 deletions(-) diff --git a/src/dev/hsa/hsa_signal.hh b/src/dev/hsa/hsa_signal.hh index 6acbcb7e1b..7d1f316f04 100644 --- a/src/dev/hsa/hsa_signal.hh +++ b/src/dev/hsa/hsa_signal.hh @@ -69,6 +69,12 @@ typedef struct amd_signal_s uint32_t reserved3[2]; } amd_signal_t; +typedef struct +{ + uint64_t start_ts; + uint64_t end_ts; +} amd_event_t; + } // namespace gem5 #endif // DEV_HSA_HSA_SIGNAL_H diff --git a/src/gpu-compute/gpu_command_processor.cc b/src/gpu-compute/gpu_command_processor.cc index db69ad5cbd..ecc5f1d98b 100644 --- a/src/gpu-compute/gpu_command_processor.cc +++ b/src/gpu-compute/gpu_command_processor.cc @@ -248,6 +248,10 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id, initABI(task); ++dynamic_task_id; + + // The driver expects the start time to be in ns + Tick start_ts = curTick() / sim_clock::as_int::ns; + dispatchStartTime.insert({disp_pkt->completion_signal, start_ts}); } void @@ -280,16 +284,6 @@ GPUCommandProcessor::sendCompletionSignal(Addr signal_handle) void GPUCommandProcessor::updateHsaSignalAsync(Addr signal_handle, int64_t diff) { - Addr value_addr = getHsaSignalValueAddr(signal_handle); - - uint64_t *signalValue = new uint64_t; - auto cb = new DmaVirtCallback( - [ = ] (const uint64_t &) - { updateHsaSignalData(value_addr, diff, signalValue); }); - dmaReadVirt(value_addr, sizeof(uint64_t), cb, (void *)signalValue); - DPRINTF(GPUCommandProc, "updateHsaSignalAsync reading value addr %lx\n", - value_addr); - Addr mailbox_addr = getHsaSignalMailboxAddr(signal_handle); uint64_t *mailboxValue = new uint64_t; auto cb2 = new DmaVirtCallback( @@ -300,20 +294,6 @@ GPUCommandProcessor::updateHsaSignalAsync(Addr signal_handle, int64_t diff) mailbox_addr); } -void 
-GPUCommandProcessor::updateHsaSignalData(Addr value_addr, int64_t diff, - uint64_t *prev_value) -{ - // Reuse the value allocated for the read - DPRINTF(GPUCommandProc, "updateHsaSignalData read %ld, writing %ld\n", - *prev_value, *prev_value + diff); - *prev_value += diff; - auto cb = new DmaVirtCallback( - [ = ] (const uint64_t &) - { updateHsaSignalDone(prev_value); }); - dmaWriteVirt(value_addr, sizeof(uint64_t), cb, (void *)prev_value); -} - void GPUCommandProcessor::updateHsaMailboxData(Addr signal_handle, uint64_t *mailbox_value) @@ -331,6 +311,20 @@ GPUCommandProcessor::updateHsaMailboxData(Addr signal_handle, dmaReadVirt(event_addr, sizeof(uint64_t), cb, (void *)mailbox_value); } else { delete mailbox_value; + + Addr ts_addr = signal_handle + offsetof(amd_signal_t, start_ts); + + amd_event_t *event_ts = new amd_event_t; + event_ts->start_ts = dispatchStartTime[signal_handle]; + event_ts->end_ts = curTick() / sim_clock::as_int::ns; + auto cb = new DmaVirtCallback( + [ = ] (const uint64_t &) + { updateHsaEventTs(signal_handle, event_ts); }); + dmaWriteVirt(ts_addr, sizeof(amd_event_t), cb, (void *)event_ts); + DPRINTF(GPUCommandProc, "updateHsaMailboxData reading timestamp addr " + "%lx\n", ts_addr); + + dispatchStartTime.erase(signal_handle); } } @@ -346,6 +340,52 @@ GPUCommandProcessor::updateHsaEventData(Addr signal_handle, [ = ] (const uint64_t &) { updateHsaSignalDone(event_value); }, *event_value); dmaWriteVirt(mailbox_addr, sizeof(uint64_t), cb, &cb->dmaBuffer, 0); + + Addr ts_addr = signal_handle + offsetof(amd_signal_t, start_ts); + + amd_event_t *event_ts = new amd_event_t; + event_ts->start_ts = dispatchStartTime[signal_handle]; + event_ts->end_ts = curTick() / sim_clock::as_int::ns; + auto cb2 = new DmaVirtCallback( + [ = ] (const uint64_t &) + { updateHsaEventTs(signal_handle, event_ts); }); + dmaWriteVirt(ts_addr, sizeof(amd_event_t), cb2, (void *)event_ts); + DPRINTF(GPUCommandProc, "updateHsaEventData reading timestamp addr %lx\n", + 
ts_addr); + + dispatchStartTime.erase(signal_handle); +} + +void +GPUCommandProcessor::updateHsaEventTs(Addr signal_handle, + amd_event_t *ts) +{ + delete ts; + + Addr value_addr = getHsaSignalValueAddr(signal_handle); + int64_t diff = -1; + + uint64_t *signalValue = new uint64_t; + auto cb = new DmaVirtCallback( + [ = ] (const uint64_t &) + { updateHsaSignalData(value_addr, diff, signalValue); }); + dmaReadVirt(value_addr, sizeof(uint64_t), cb, (void *)signalValue); + DPRINTF(GPUCommandProc, "updateHsaSignalAsync reading value addr %lx\n", + value_addr); +} + +void +GPUCommandProcessor::updateHsaSignalData(Addr value_addr, int64_t diff, + uint64_t *prev_value) +{ + // Reuse the value allocated for the read + DPRINTF(GPUCommandProc, "updateHsaSignalData read %ld, writing %ld\n", + *prev_value, *prev_value + diff); + *prev_value += diff; + auto cb = new DmaVirtCallback( + [ = ] (const uint64_t &) + { updateHsaSignalDone(prev_value); }); + dmaWriteVirt(value_addr, sizeof(uint64_t), cb, (void *)prev_value); } void diff --git a/src/gpu-compute/gpu_command_processor.hh b/src/gpu-compute/gpu_command_processor.hh index 10407b9f93..f6783834eb 100644 --- a/src/gpu-compute/gpu_command_processor.hh +++ b/src/gpu-compute/gpu_command_processor.hh @@ -117,6 +117,7 @@ class GPUCommandProcessor : public DmaVirtDevice void updateHsaSignalDone(uint64_t *signal_value); void updateHsaMailboxData(Addr signal_handle, uint64_t *mailbox_value); void updateHsaEventData(Addr signal_handle, uint64_t *event_value); + void updateHsaEventTs(Addr signal_handle, amd_event_t *event_value); uint64_t functionalReadHsaSignal(Addr signal_handle); @@ -148,6 +149,9 @@ class GPUCommandProcessor : public DmaVirtDevice HSAPacketProcessor *hsaPP; TranslationGenPtr translate(Addr vaddr, Addr size) override; + // Keep track of start times for task dispatches. 
+ std::unordered_map dispatchStartTime; + /** * Perform a DMA read of the read_dispatch_id_field_base_byte_offset * field, which follows directly after the read_dispatch_id (the read