From 57b3d2897c77d0907016ee972655c7bbcdfe23e7 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Fri, 25 Aug 2023 09:35:55 -0500 Subject: [PATCH] gpu-compute: Use timing DMAs for GPUFS HSA signals The functional HSA signal read was a hack left in the gpu-compute code. In full system, this functional read is causing problems occasionally with the translation not yet being in the page table. The error message output by gem5 was a fatal message on the readBlob method in port proxy. Changing this to a timing DMA fixes this problem. This commit adds the various timing DMA functions to send and receive response and clean up. A helper method "sendCompletionSignal" is added to the GPUCommandProcessor because the indentation level was getting too deep. This change applies only to FS mode. Code for SE mode is equivalent to what it was before this commit. Change-Id: I1bfcaa0a52731cdf9532a7fd0eb06ab2f0e09d48 --- src/dev/hsa/hsa_packet_processor.cc | 16 ++-- src/gpu-compute/dispatcher.cc | 12 +-- src/gpu-compute/gpu_command_processor.cc | 104 +++++++++++++++++++++++ src/gpu-compute/gpu_command_processor.hh | 7 ++ 4 files changed, 118 insertions(+), 21 deletions(-) diff --git a/src/dev/hsa/hsa_packet_processor.cc b/src/dev/hsa/hsa_packet_processor.cc index d0afcf816f..2064de41ce 100644 --- a/src/dev/hsa/hsa_packet_processor.cc +++ b/src/dev/hsa/hsa_packet_processor.cc @@ -389,20 +389,16 @@ HSAPacketProcessor::processPkt(void* pkt, uint32_t rl_idx, Addr host_pkt_addr) dep_sgnl_rd_st->resetSigVals(); // The completion signal is connected if (bar_and_pkt->completion_signal != 0) { - // HACK: The semantics of the HSA signal is to - // decrement the current signal value - // I'm going to cheat here and read out - // the value from main memory using functional - // access, and then just DMA the decremented value. 
- uint64_t signal_value = gpu_device->functionalReadHsaSignal(\ - bar_and_pkt->completion_signal); - + // The semantics of the HSA signal is to decrement the current + // signal value by one. Do this asynchronously via DMAs and + // callbacks as we can safely continue with this function + // while waiting for the next packet from the host. DPRINTF(HSAPacketProcessor, "Triggering barrier packet" \ " completion signal! Addr: %x\n", bar_and_pkt->completion_signal); - gpu_device->updateHsaSignal(bar_and_pkt->completion_signal, - signal_value - 1); + gpu_device->sendCompletionSignal( + bar_and_pkt->completion_signal); } } if (dep_sgnl_rd_st->pendingReads > 0) { diff --git a/src/gpu-compute/dispatcher.cc b/src/gpu-compute/dispatcher.cc index babc938489..8a72fd73f4 100644 --- a/src/gpu-compute/dispatcher.cc +++ b/src/gpu-compute/dispatcher.cc @@ -310,20 +310,10 @@ GPUDispatcher::notifyWgCompl(Wavefront *wf) gpuCmdProc->hsaPacketProc() .finishPkt(task->dispPktPtr(), task->queueId()); if (task->completionSignal()) { - /** - * HACK: The semantics of the HSA signal is to decrement - * the current signal value. We cheat here and read out - * he value from main memory using functional access and - * then just DMA the decremented value. - */ - uint64_t signal_value = - gpuCmdProc->functionalReadHsaSignal(task->completionSignal()); - DPRINTF(GPUDisp, "HSA AQL Kernel Complete with completion " "signal! Addr: %d\n", task->completionSignal()); - gpuCmdProc->updateHsaSignal(task->completionSignal(), - signal_value - 1); + gpuCmdProc->sendCompletionSignal(task->completionSignal()); } else { DPRINTF(GPUDisp, "HSA AQL Kernel Complete! 
No completion " "signal\n"); diff --git a/src/gpu-compute/gpu_command_processor.cc b/src/gpu-compute/gpu_command_processor.cc index 8f748bdc31..db69ad5cbd 100644 --- a/src/gpu-compute/gpu_command_processor.cc +++ b/src/gpu-compute/gpu_command_processor.cc @@ -250,6 +250,110 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id, ++dynamic_task_id; } +void +GPUCommandProcessor::sendCompletionSignal(Addr signal_handle) +{ + // Originally the completion signal was read functionally and written + // with a timing DMA. This can cause issues in FullSystem mode and + // cause translation failures. Therefore, in FullSystem mode everything + // is done in timing mode. + + if (!FullSystem) { + /** + * HACK: The semantics of the HSA signal is to decrement + * the current signal value. We cheat here and read out + * the value from main memory using functional access and + * then just DMA the decremented value. + */ + uint64_t signal_value = functionalReadHsaSignal(signal_handle); + + updateHsaSignal(signal_handle, signal_value - 1); + } else { + // The semantics of the HSA signal is to decrement the current + // signal value by one. Do this asynchronously via DMAs and + // callbacks as we can safely continue with this function + // while waiting for the next packet from the host. 
+ updateHsaSignalAsync(signal_handle, -1); + } +} + +void +GPUCommandProcessor::updateHsaSignalAsync(Addr signal_handle, int64_t diff) +{ + Addr value_addr = getHsaSignalValueAddr(signal_handle); + + uint64_t *signalValue = new uint64_t; + auto cb = new DmaVirtCallback<uint64_t>( + [ = ] (const uint64_t &) + { updateHsaSignalData(value_addr, diff, signalValue); }); + dmaReadVirt(value_addr, sizeof(uint64_t), cb, (void *)signalValue); + DPRINTF(GPUCommandProc, "updateHsaSignalAsync reading value addr %lx\n", + value_addr); + + Addr mailbox_addr = getHsaSignalMailboxAddr(signal_handle); + uint64_t *mailboxValue = new uint64_t; + auto cb2 = new DmaVirtCallback<uint64_t>( + [ = ] (const uint64_t &) + { updateHsaMailboxData(signal_handle, mailboxValue); }); + dmaReadVirt(mailbox_addr, sizeof(uint64_t), cb2, (void *)mailboxValue); + DPRINTF(GPUCommandProc, "updateHsaSignalAsync reading mailbox addr %lx\n", + mailbox_addr); +} + +void +GPUCommandProcessor::updateHsaSignalData(Addr value_addr, int64_t diff, + uint64_t *prev_value) +{ + // Reuse the value allocated for the read + DPRINTF(GPUCommandProc, "updateHsaSignalData read %ld, writing %ld\n", + *prev_value, *prev_value + diff); + *prev_value += diff; + auto cb = new DmaVirtCallback<uint64_t>( + [ = ] (const uint64_t &) + { updateHsaSignalDone(prev_value); }); + dmaWriteVirt(value_addr, sizeof(uint64_t), cb, (void *)prev_value); +} + +void +GPUCommandProcessor::updateHsaMailboxData(Addr signal_handle, + uint64_t *mailbox_value) +{ + Addr event_addr = getHsaSignalEventAddr(signal_handle); + + DPRINTF(GPUCommandProc, "updateHsaMailboxData read %ld\n", *mailbox_value); + if (*mailbox_value != 0) { + // This is an interruptible signal. Now, read the + // event ID and directly communicate with the driver + // about that event notification. 
+ auto cb = new DmaVirtCallback<uint64_t>( + [ = ] (const uint64_t &) + { updateHsaEventData(signal_handle, mailbox_value); }); + dmaReadVirt(event_addr, sizeof(uint64_t), cb, (void *)mailbox_value); + } else { + delete mailbox_value; + } +} + +void +GPUCommandProcessor::updateHsaEventData(Addr signal_handle, + uint64_t *event_value) +{ + Addr mailbox_addr = getHsaSignalMailboxAddr(signal_handle); + + DPRINTF(GPUCommandProc, "updateHsaEventData read %ld\n", *event_value); + // Write *event_value to the mailbox to clear the event + auto cb = new DmaVirtCallback<uint64_t>( + [ = ] (const uint64_t &) + { updateHsaSignalDone(event_value); }, *event_value); + dmaWriteVirt(mailbox_addr, sizeof(uint64_t), cb, &cb->dmaBuffer, 0); +} + +void +GPUCommandProcessor::updateHsaSignalDone(uint64_t *signal_value) +{ + delete signal_value; +} + uint64_t GPUCommandProcessor::functionalReadHsaSignal(Addr signal_handle) { diff --git a/src/gpu-compute/gpu_command_processor.hh b/src/gpu-compute/gpu_command_processor.hh index bafe733ee1..d2ddf5c78f 100644 --- a/src/gpu-compute/gpu_command_processor.hh +++ b/src/gpu-compute/gpu_command_processor.hh @@ -106,9 +106,16 @@ class GPUCommandProcessor : public DmaVirtDevice AddrRangeList getAddrRanges() const override; System *system(); + void sendCompletionSignal(Addr signal_handle); void updateHsaSignal(Addr signal_handle, uint64_t signal_value, HsaSignalCallbackFunction function = [] (const uint64_t &) { }); + void updateHsaSignalAsync(Addr signal_handle, int64_t diff); + void updateHsaSignalData(Addr value_addr, int64_t diff, + uint64_t *prev_value); + void updateHsaSignalDone(uint64_t *signal_value); + void updateHsaMailboxData(Addr signal_handle, uint64_t *mailbox_value); + void updateHsaEventData(Addr signal_handle, uint64_t *event_value); uint64_t functionalReadHsaSignal(Addr signal_handle);