diff --git a/src/dev/hsa/hsa_packet_processor.cc b/src/dev/hsa/hsa_packet_processor.cc
index d0afcf816f..2064de41ce 100644
--- a/src/dev/hsa/hsa_packet_processor.cc
+++ b/src/dev/hsa/hsa_packet_processor.cc
@@ -389,20 +389,16 @@ HSAPacketProcessor::processPkt(void* pkt, uint32_t rl_idx, Addr host_pkt_addr)
             dep_sgnl_rd_st->resetSigVals();
             // The completion signal is connected
             if (bar_and_pkt->completion_signal != 0) {
-                // HACK: The semantics of the HSA signal is to
-                // decrement the current signal value
-                // I'm going to cheat here and read out
-                // the value from main memory using functional
-                // access, and then just DMA the decremented value.
-                uint64_t signal_value = gpu_device->functionalReadHsaSignal(\
-                    bar_and_pkt->completion_signal);
-
+                // The semantics of the HSA signal is to decrement the current
+                // signal value by one. Do this asynchronously via DMAs and
+                // callbacks as we can safely continue with this function
+                // while waiting for the next packet from the host.
                 DPRINTF(HSAPacketProcessor, "Triggering barrier packet" \
                        " completion signal! Addr: %x\n",
                        bar_and_pkt->completion_signal);
 
-                gpu_device->updateHsaSignal(bar_and_pkt->completion_signal,
-                                            signal_value - 1);
+                gpu_device->sendCompletionSignal(
+                    bar_and_pkt->completion_signal);
             }
         }
         if (dep_sgnl_rd_st->pendingReads > 0) {
diff --git a/src/gpu-compute/dispatcher.cc b/src/gpu-compute/dispatcher.cc
index babc938489..8a72fd73f4 100644
--- a/src/gpu-compute/dispatcher.cc
+++ b/src/gpu-compute/dispatcher.cc
@@ -310,20 +310,10 @@ GPUDispatcher::notifyWgCompl(Wavefront *wf)
         gpuCmdProc->hsaPacketProc()
             .finishPkt(task->dispPktPtr(), task->queueId());
         if (task->completionSignal()) {
-            /**
-             * HACK: The semantics of the HSA signal is to decrement
-             * the current signal value. We cheat here and read out
-             * he value from main memory using functional access and
-             * then just DMA the decremented value.
-             */
-            uint64_t signal_value =
-                gpuCmdProc->functionalReadHsaSignal(task->completionSignal());
-
             DPRINTF(GPUDisp, "HSA AQL Kernel Complete with completion "
                     "signal! Addr: %d\n", task->completionSignal());
 
-            gpuCmdProc->updateHsaSignal(task->completionSignal(),
-                                        signal_value - 1);
+            gpuCmdProc->sendCompletionSignal(task->completionSignal());
         } else {
             DPRINTF(GPUDisp, "HSA AQL Kernel Complete! No completion "
                     "signal\n");
diff --git a/src/gpu-compute/gpu_command_processor.cc b/src/gpu-compute/gpu_command_processor.cc
index 8f748bdc31..db69ad5cbd 100644
--- a/src/gpu-compute/gpu_command_processor.cc
+++ b/src/gpu-compute/gpu_command_processor.cc
@@ -250,6 +250,142 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
     ++dynamic_task_id;
 }
 
+/**
+ * Decrement the HSA completion signal identified by signal_handle by one.
+ * Outside FullSystem mode this is a functional read followed by a DMA write;
+ * in FullSystem mode the read and the write are both performed with DMAs.
+ */
+void
+GPUCommandProcessor::sendCompletionSignal(Addr signal_handle)
+{
+    // Originally the completion signal was read functionally and written
+    // with a timing DMA. This can cause issues in FullSystem mode and
+    // cause translation failures. Therefore, in FullSystem mode everything
+    // is done in timing mode.
+
+    if (!FullSystem) {
+        /**
+         * HACK: The semantics of the HSA signal is to decrement
+         * the current signal value. We cheat here and read out
+         * the value from main memory using functional access and
+         * then just DMA the decremented value.
+         */
+        uint64_t signal_value = functionalReadHsaSignal(signal_handle);
+
+        updateHsaSignal(signal_handle, signal_value - 1);
+    } else {
+        // The semantics of the HSA signal is to decrement the current
+        // signal value by one. Do this asynchronously via DMAs and
+        // callbacks as we can safely continue with this function
+        // while waiting for the next packet from the host.
+        updateHsaSignalAsync(signal_handle, -1);
+    }
+}
+
+/**
+ * Start an asynchronous update of an HSA signal: issue one DMA read of the
+ * signal value (continued in updateHsaSignalData) and one DMA read of the
+ * signal mailbox (continued in updateHsaMailboxData). Each read gets its own
+ * heap buffer, which the continuation chain is responsible for freeing.
+ *
+ * @param signal_handle handle of the signal to update
+ * @param diff amount added to the signal value once it has been read
+ */
+void
+GPUCommandProcessor::updateHsaSignalAsync(Addr signal_handle, int64_t diff)
+{
+    Addr value_addr = getHsaSignalValueAddr(signal_handle);
+
+    uint64_t *signalValue = new uint64_t;
+    auto cb = new DmaVirtCallback<uint64_t>(
+        [ = ] (const uint64_t &)
+            { updateHsaSignalData(value_addr, diff, signalValue); });
+    dmaReadVirt(value_addr, sizeof(uint64_t), cb, (void *)signalValue);
+    DPRINTF(GPUCommandProc, "updateHsaSignalAsync reading value addr %lx\n",
+            value_addr);
+
+    Addr mailbox_addr = getHsaSignalMailboxAddr(signal_handle);
+    uint64_t *mailboxValue = new uint64_t;
+    auto cb2 = new DmaVirtCallback<uint64_t>(
+        [ = ] (const uint64_t &)
+            { updateHsaMailboxData(signal_handle, mailboxValue); });
+    dmaReadVirt(mailbox_addr, sizeof(uint64_t), cb2, (void *)mailboxValue);
+    DPRINTF(GPUCommandProc, "updateHsaSignalAsync reading mailbox addr %lx\n",
+            mailbox_addr);
+}
+
+/**
+ * DMA-read continuation for the signal value: add diff to the value that
+ * was read and DMA-write the result back to value_addr.
+ *
+ * @param prev_value heap buffer holding the value read; reused as the write
+ *                   source and freed by updateHsaSignalDone afterwards
+ */
+void
+GPUCommandProcessor::updateHsaSignalData(Addr value_addr, int64_t diff,
+                                         uint64_t *prev_value)
+{
+    // Reuse the value allocated for the read
+    DPRINTF(GPUCommandProc, "updateHsaSignalData read %ld, writing %ld\n",
+            *prev_value, *prev_value + diff);
+    *prev_value += diff;
+    auto cb = new DmaVirtCallback<uint64_t>(
+        [ = ] (const uint64_t &)
+            { updateHsaSignalDone(prev_value); });
+    dmaWriteVirt(value_addr, sizeof(uint64_t), cb, (void *)prev_value);
+}
+
+/**
+ * DMA-read continuation for the signal mailbox. A non-zero mailbox value
+ * triggers a DMA read of the signal's event address into the same buffer
+ * (continued in updateHsaEventData); otherwise the buffer is freed here.
+ */
+void
+GPUCommandProcessor::updateHsaMailboxData(Addr signal_handle,
+                                          uint64_t *mailbox_value)
+{
+    Addr event_addr = getHsaSignalEventAddr(signal_handle);
+
+    DPRINTF(GPUCommandProc, "updateHsaMailboxData read %ld\n", *mailbox_value);
+    if (*mailbox_value != 0) {
+        // This is an interruptible signal. Now, read the
+        // event ID and directly communicate with the driver
+        // about that event notification.
+        auto cb = new DmaVirtCallback<uint64_t>(
+            [ = ] (const uint64_t &)
+                { updateHsaEventData(signal_handle, mailbox_value); });
+        dmaReadVirt(event_addr, sizeof(uint64_t), cb, (void *)mailbox_value);
+    } else {
+        delete mailbox_value;
+    }
+}
+
+/**
+ * DMA-read continuation for the event value: DMA-write that value to the
+ * signal's mailbox address. The write is sourced from the callback's
+ * dmaBuffer (the callback is constructed with *event_value); event_value
+ * itself is freed by updateHsaSignalDone when the write completes.
+ */
+void
+GPUCommandProcessor::updateHsaEventData(Addr signal_handle,
+                                        uint64_t *event_value)
+{
+    Addr mailbox_addr = getHsaSignalMailboxAddr(signal_handle);
+
+    DPRINTF(GPUCommandProc, "updateHsaEventData read %ld\n", *event_value);
+    // Write *event_value to the mailbox to clear the event
+    auto cb = new DmaVirtCallback<uint64_t>(
+        [ = ] (const uint64_t &)
+            { updateHsaSignalDone(event_value); }, *event_value);
+    dmaWriteVirt(mailbox_addr, sizeof(uint64_t), cb, &cb->dmaBuffer, 0);
+}
+
+/** Final step of a signal update chain: free the heap-allocated buffer. */
+void
+GPUCommandProcessor::updateHsaSignalDone(uint64_t *signal_value)
+{
+    delete signal_value;
+}
+
 uint64_t
 GPUCommandProcessor::functionalReadHsaSignal(Addr signal_handle)
 {
diff --git a/src/gpu-compute/gpu_command_processor.hh b/src/gpu-compute/gpu_command_processor.hh
index bafe733ee1..d2ddf5c78f 100644
--- a/src/gpu-compute/gpu_command_processor.hh
+++ b/src/gpu-compute/gpu_command_processor.hh
@@ -106,9 +106,16 @@ class GPUCommandProcessor : public DmaVirtDevice
     AddrRangeList getAddrRanges() const override;
     System *system();
 
+    void sendCompletionSignal(Addr signal_handle);
     void updateHsaSignal(Addr signal_handle, uint64_t signal_value,
                          HsaSignalCallbackFunction function =
                             [] (const uint64_t &) { });
+    void updateHsaSignalAsync(Addr signal_handle, int64_t diff);
+    void updateHsaSignalData(Addr value_addr, int64_t diff,
+                             uint64_t *prev_value);
+    void updateHsaSignalDone(uint64_t *signal_value);
+    void updateHsaMailboxData(Addr signal_handle, uint64_t *mailbox_value);
+    void updateHsaEventData(Addr signal_handle, uint64_t *event_value);
 
     uint64_t functionalReadHsaSignal(Addr signal_handle);
 