gpu-compute: Use timing DMAs for GPUFS HSA signals

The functional HSA signal read was a hack left over in the gpu-compute code. In full-system mode, this functional read occasionally fails because the translation is not yet in the page table; gem5 then reports a fatal error from the readBlob method of the port proxy. Changing the read to a timing DMA fixes this problem. This commit adds the timing DMA functions that send the requests, handle the responses, and clean up. A helper method, "sendCompletionSignal", is added to the GPUCommandProcessor because the indentation level was getting too deep. This change applies only to FS mode; the code for SE mode is equivalent to what it was before this commit. Change-Id: I1bfcaa0a52731cdf9532a7fd0eb06ab2f0e09d48
2023-08-25 09:35:55 -05:00
parent 5cb604559a
commit 57b3d2897c
4 changed files with 118 additions and 21 deletions
--- a/src/dev/hsa/hsa_packet_processor.cc
+++ b/src/dev/hsa/hsa_packet_processor.cc
@@ -389,20 +389,16 @@ HSAPacketProcessor::processPkt(void* pkt, uint32_t rl_idx, Addr host_pkt_addr)
            dep_sgnl_rd_st->resetSigVals();
            // The completion signal is connected
            if (bar_and_pkt->completion_signal != 0) {
-                // HACK: The semantics of the HSA signal is to
-                // decrement the current signal value
-                // I'm going to cheat here and read out
-                // the value from main memory using functional
-                // access, and then just DMA the decremented value.
-                uint64_t signal_value = gpu_device->functionalReadHsaSignal(\
-                                            bar_and_pkt->completion_signal);
-
+                // The semantics of the HSA signal is to decrement the current
+                // signal value by one. Do this asynchronously via DMAs and
+                // callbacks as we can safely continue with this function
+                // while waiting for the next packet from the host.
                DPRINTF(HSAPacketProcessor, "Triggering barrier packet" \
                       " completion signal! Addr: %x\n",
                       bar_and_pkt->completion_signal);

-                gpu_device->updateHsaSignal(bar_and_pkt->completion_signal,
-                                            signal_value - 1);
+                gpu_device->sendCompletionSignal(
+                    bar_and_pkt->completion_signal);
            }
        }
        if (dep_sgnl_rd_st->pendingReads > 0) {
--- a/src/gpu-compute/dispatcher.cc
+++ b/src/gpu-compute/dispatcher.cc
@@ -310,20 +310,10 @@ GPUDispatcher::notifyWgCompl(Wavefront *wf)
        gpuCmdProc->hsaPacketProc()
            .finishPkt(task->dispPktPtr(), task->queueId());
        if (task->completionSignal()) {
-            /**
-            * HACK: The semantics of the HSA signal is to decrement
-            * the current signal value. We cheat here and read out
-            * he value from main memory using functional access and
-            * then just DMA the decremented value.
-            */
-            uint64_t signal_value =
-                gpuCmdProc->functionalReadHsaSignal(task->completionSignal());
-
            DPRINTF(GPUDisp, "HSA AQL Kernel Complete with completion "
                    "signal! Addr: %d\n", task->completionSignal());

-            gpuCmdProc->updateHsaSignal(task->completionSignal(),
-                                        signal_value - 1);
+            gpuCmdProc->sendCompletionSignal(task->completionSignal());
        } else {
            DPRINTF(GPUDisp, "HSA AQL Kernel Complete! No completion "
                "signal\n");
--- a/src/gpu-compute/gpu_command_processor.cc
+++ b/src/gpu-compute/gpu_command_processor.cc
@@ -250,6 +250,110 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
    ++dynamic_task_id;
 }

+void
+GPUCommandProcessor::sendCompletionSignal(Addr signal_handle)
+{
+    // Decrement the HSA completion signal at signal_handle by one.
+    // Originally the signal was read functionally and written with a
+    // timing DMA. In FullSystem mode the functional read can hit
+    // translation failures, so there everything is done in timing mode.
+
+    if (!FullSystem) {
+        /**
+        * HACK: The semantics of the HSA signal is to decrement
+        * the current signal value. We cheat here and read out
+        * the value from main memory using functional access and
+        * then just DMA the decremented value.
+        */
+        uint64_t signal_value = functionalReadHsaSignal(signal_handle);
+
+        updateHsaSignal(signal_handle, signal_value - 1);
+    } else {
+        // The semantics of the HSA signal is to decrement the current
+        // signal value by one. Do this asynchronously: a timing DMA
+        // reads the value and a callback DMA-writes the updated value,
+        // instead of the functional read used in the non-FullSystem path.
+        updateHsaSignalAsync(signal_handle, -1);
+    }
+}
+
+void
+GPUCommandProcessor::updateHsaSignalAsync(Addr signal_handle, int64_t diff)
+{
+    Addr value_addr = getHsaSignalValueAddr(signal_handle);
+    // Read the signal value; the callback adds diff and writes it back.
+    uint64_t *signalValue = new uint64_t;  // freed in updateHsaSignalDone
+    auto cb = new DmaVirtCallback<uint64_t>(
+        [ = ] (const uint64_t &)
+            { updateHsaSignalData(value_addr, diff, signalValue); });
+    dmaReadVirt(value_addr, sizeof(uint64_t), cb, (void *)signalValue);
+    DPRINTF(GPUCommandProc, "updateHsaSignalAsync reading value addr %lx\n",
+            value_addr);
+    // Independently read the mailbox; updateHsaMailboxData handles the reply.
+    Addr mailbox_addr = getHsaSignalMailboxAddr(signal_handle);
+    uint64_t *mailboxValue = new uint64_t;  // freed when that chain finishes
+    auto cb2 = new DmaVirtCallback<uint64_t>(
+        [ = ] (const uint64_t &)
+            { updateHsaMailboxData(signal_handle, mailboxValue); });
+    dmaReadVirt(mailbox_addr, sizeof(uint64_t), cb2, (void *)mailboxValue);
+    DPRINTF(GPUCommandProc, "updateHsaSignalAsync reading mailbox addr %lx\n",
+            mailbox_addr);
+}
+
+void
+GPUCommandProcessor::updateHsaSignalData(Addr value_addr, int64_t diff,
+                                         uint64_t *prev_value)
+{
+    // Apply diff in the buffer allocated for the read and DMA it back
+    DPRINTF(GPUCommandProc, "updateHsaSignalData read %ld, writing %ld\n",
+            *prev_value, *prev_value + diff);
+    *prev_value += diff;
+    auto cb = new DmaVirtCallback<uint64_t>(
+        [ = ] (const uint64_t &)
+            { updateHsaSignalDone(prev_value); });  // frees prev_value
+    dmaWriteVirt(value_addr, sizeof(uint64_t), cb, (void *)prev_value);
+}
+
+void
+GPUCommandProcessor::updateHsaMailboxData(Addr signal_handle,
+                                          uint64_t *mailbox_value)
+{
+    Addr event_addr = getHsaSignalEventAddr(signal_handle);
+
+    DPRINTF(GPUCommandProc, "updateHsaMailboxData read %ld\n", *mailbox_value);
+    if (*mailbox_value != 0) {
+        // This is an interruptible signal. Now, read the event ID
+        // (reusing this buffer); updateHsaEventData then writes it to
+        // the mailbox to communicate the event to the driver.
+        auto cb = new DmaVirtCallback<uint64_t>(
+            [ = ] (const uint64_t &)
+                { updateHsaEventData(signal_handle, mailbox_value); });
+        dmaReadVirt(event_addr, sizeof(uint64_t), cb, (void *)mailbox_value);
+    } else {
+        delete mailbox_value;  // mailbox is 0: nothing more to do
+    }
+}
+
+void
+GPUCommandProcessor::updateHsaEventData(Addr signal_handle,
+                                        uint64_t *event_value)
+{
+    Addr mailbox_addr = getHsaSignalMailboxAddr(signal_handle);
+    // Write *event_value to the mailbox to clear the event
+    DPRINTF(GPUCommandProc, "updateHsaEventData read %ld\n", *event_value);
+    // NOTE(review): cb presumably copies *event_value into cb->dmaBuffer.
+    auto cb = new DmaVirtCallback<uint64_t>(
+        [ = ] (const uint64_t &)
+            { updateHsaSignalDone(event_value); }, *event_value);
+    dmaWriteVirt(mailbox_addr, sizeof(uint64_t), cb, &cb->dmaBuffer, 0);
+}
+
+void
+GPUCommandProcessor::updateHsaSignalDone(uint64_t *signal_value)
+{
+    delete signal_value;  // last step of a DMA chain: release its buffer
+}
+
 uint64_t
 GPUCommandProcessor::functionalReadHsaSignal(Addr signal_handle)
 {
--- a/src/gpu-compute/gpu_command_processor.hh
+++ b/src/gpu-compute/gpu_command_processor.hh
@@ -106,9 +106,16 @@ class GPUCommandProcessor : public DmaVirtDevice
    AddrRangeList getAddrRanges() const override;
    System *system();

+    void sendCompletionSignal(Addr signal_handle);
    void updateHsaSignal(Addr signal_handle, uint64_t signal_value,
                         HsaSignalCallbackFunction function =
                            [] (const uint64_t &) { });
+    void updateHsaSignalAsync(Addr signal_handle, int64_t diff);
+    void updateHsaSignalData(Addr value_addr, int64_t diff,
+                             uint64_t *prev_value);
+    void updateHsaSignalDone(uint64_t *signal_value);
+    void updateHsaMailboxData(Addr signal_handle, uint64_t *mailbox_value);
+    void updateHsaEventData(Addr signal_handle, uint64_t *event_value);

    uint64_t functionalReadHsaSignal(Addr signal_handle);