diff --git a/src/dev/hsa/hsa_packet_processor.cc b/src/dev/hsa/hsa_packet_processor.cc
index d0afcf816f..2064de41ce 100644
--- a/src/dev/hsa/hsa_packet_processor.cc
+++ b/src/dev/hsa/hsa_packet_processor.cc
@@ -389,20 +389,16 @@ HSAPacketProcessor::processPkt(void* pkt, uint32_t rl_idx, Addr host_pkt_addr)
             dep_sgnl_rd_st->resetSigVals();
             // The completion signal is connected
             if (bar_and_pkt->completion_signal != 0) {
-                // HACK: The semantics of the HSA signal is to
-                // decrement the current signal value
-                // I'm going to cheat here and read out
-                // the value from main memory using functional
-                // access, and then just DMA the decremented value.
-                uint64_t signal_value = gpu_device->functionalReadHsaSignal(\
-                                            bar_and_pkt->completion_signal);
-
+                // The semantics of the HSA signal is to decrement the current
+                // signal value by one. In FullSystem mode this is done
+                // asynchronously via DMAs and callbacks, as we can safely
+                // continue here while waiting for the next host packet.
                 DPRINTF(HSAPacketProcessor, "Triggering barrier packet" \
                        " completion signal! Addr: %x\n",
                        bar_and_pkt->completion_signal);
 
-                gpu_device->updateHsaSignal(bar_and_pkt->completion_signal,
-                                            signal_value - 1);
+                gpu_device->sendCompletionSignal(
+                    bar_and_pkt->completion_signal);
             }
         }
         if (dep_sgnl_rd_st->pendingReads > 0) {
diff --git a/src/gpu-compute/dispatcher.cc b/src/gpu-compute/dispatcher.cc
index babc938489..8a72fd73f4 100644
--- a/src/gpu-compute/dispatcher.cc
+++ b/src/gpu-compute/dispatcher.cc
@@ -310,20 +310,10 @@ GPUDispatcher::notifyWgCompl(Wavefront *wf)
         gpuCmdProc->hsaPacketProc()
             .finishPkt(task->dispPktPtr(), task->queueId());
         if (task->completionSignal()) {
-            /**
-            * HACK: The semantics of the HSA signal is to decrement
-            * the current signal value. We cheat here and read out
-            * he value from main memory using functional access and
-            * then just DMA the decremented value.
-            */
-            uint64_t signal_value =
-                gpuCmdProc->functionalReadHsaSignal(task->completionSignal());
-
             DPRINTF(GPUDisp, "HSA AQL Kernel Complete with completion "
                     "signal! Addr: %d\n", task->completionSignal());
 
-            gpuCmdProc->updateHsaSignal(task->completionSignal(),
-                                        signal_value - 1);
+            gpuCmdProc->sendCompletionSignal(task->completionSignal());
         } else {
             DPRINTF(GPUDisp, "HSA AQL Kernel Complete! No completion "
                 "signal\n");
diff --git a/src/gpu-compute/gpu_command_processor.cc b/src/gpu-compute/gpu_command_processor.cc
index 8f748bdc31..db69ad5cbd 100644
--- a/src/gpu-compute/gpu_command_processor.cc
+++ b/src/gpu-compute/gpu_command_processor.cc
@@ -250,6 +250,110 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
     ++dynamic_task_id;
 }
 
+/**
+ * Decrement the value of the HSA signal at signal_handle by one.
+ */
+void
+GPUCommandProcessor::sendCompletionSignal(Addr signal_handle)
+{
+    // Completing an HSA signal means decrementing its current value by
+    // one. How that is carried out depends on the simulation mode.
+    if (FullSystem) {
+        // Reading the value functionally and writing it with a timing DMA
+        // can cause issues such as translation failures in FullSystem
+        // mode. Therefore everything is done in timing mode here: the
+        // decrement is performed asynchronously via DMAs and callbacks
+        // while this function returns to its caller.
+        updateHsaSignalAsync(signal_handle, -1);
+        return;
+    }
+
+    /**
+     * HACK: Outside of FullSystem mode we cheat: the current value is
+     * read out of main memory using a functional access and then just
+     * the decremented value is DMA'd back.
+     */
+    const uint64_t current_value = functionalReadHsaSignal(signal_handle);
+    updateHsaSignal(signal_handle, current_value - 1);
+}
+
+void
+GPUCommandProcessor::updateHsaSignalAsync(Addr signal_handle, int64_t diff)
+{
+    // Read the current value; the callback chain writes back value + diff.
+    const Addr value_addr = getHsaSignalValueAddr(signal_handle);
+    auto *value_buf = new uint64_t;
+    auto *value_cb = new DmaVirtCallback<uint64_t>(
+        [this, value_addr, diff, value_buf] (const uint64_t &)
+            { updateHsaSignalData(value_addr, diff, value_buf); });
+    dmaReadVirt(value_addr, sizeof(uint64_t), value_cb, value_buf);
+    DPRINTF(GPUCommandProc, "updateHsaSignalAsync reading value addr %lx\n",
+            value_addr);
+    // Also read the mailbox; updateHsaMailboxData() acts on what it holds.
+    const Addr mailbox_addr = getHsaSignalMailboxAddr(signal_handle);
+    auto *mailbox_buf = new uint64_t;
+    auto *mailbox_cb = new DmaVirtCallback<uint64_t>(
+        [this, signal_handle, mailbox_buf] (const uint64_t &)
+            { updateHsaMailboxData(signal_handle, mailbox_buf); });
+    dmaReadVirt(mailbox_addr, sizeof(uint64_t), mailbox_cb, mailbox_buf);
+    DPRINTF(GPUCommandProc, "updateHsaSignalAsync reading mailbox addr %lx\n",
+            mailbox_addr);
+}
+
+void
+GPUCommandProcessor::updateHsaSignalData(Addr value_addr, int64_t diff,
+                                         uint64_t *prev_value)
+{
+    const uint64_t new_value = *prev_value + diff;
+    DPRINTF(GPUCommandProc, "updateHsaSignalData read %ld, writing %ld\n",
+            *prev_value, new_value);
+    *prev_value = new_value;  // Reuse the read buffer for the write-back.
+    auto *done_cb = new DmaVirtCallback<uint64_t>(
+        [this, prev_value] (const uint64_t &)
+            { updateHsaSignalDone(prev_value); });
+    dmaWriteVirt(value_addr, sizeof(uint64_t), done_cb, prev_value);
+}
+
+void
+GPUCommandProcessor::updateHsaMailboxData(Addr signal_handle,
+                                          uint64_t *mailbox_value)
+{
+    const Addr event_addr = getHsaSignalEventAddr(signal_handle);
+    DPRINTF(GPUCommandProc, "updateHsaMailboxData read %ld\n", *mailbox_value);
+    if (*mailbox_value == 0) {
+        // Nothing more to do for this signal, so the buffer is released.
+        delete mailbox_value;
+        return;
+    }
+
+    // A non-zero mailbox marks an interruptible signal: read the event ID
+    // next, reusing the mailbox buffer as the destination of that read.
+    auto *event_cb = new DmaVirtCallback<uint64_t>(
+        [this, signal_handle, mailbox_value] (const uint64_t &)
+            { updateHsaEventData(signal_handle, mailbox_value); });
+    dmaReadVirt(event_addr, sizeof(uint64_t), event_cb, mailbox_value);
+}
+
+void
+GPUCommandProcessor::updateHsaEventData(Addr signal_handle,
+                                        uint64_t *event_value)
+{
+    const Addr mailbox_addr = getHsaSignalMailboxAddr(signal_handle);
+    DPRINTF(GPUCommandProc, "updateHsaEventData read %ld\n", *event_value);
+    // Write *event_value to the mailbox to clear the event. The DMA source
+    // is the callback's dmaBuffer (presumably seeded by the constructor).
+    auto *wr_cb = new DmaVirtCallback<uint64_t>(
+        [this, event_value] (const uint64_t &)
+            { updateHsaSignalDone(event_value); }, *event_value);
+    dmaWriteVirt(mailbox_addr, sizeof(uint64_t), wr_cb, &wr_cb->dmaBuffer, 0);
+}
+
+void
+GPUCommandProcessor::updateHsaSignalDone(uint64_t *signal_value)
+{
+    delete signal_value;  // Releases the heap buffer used by the DMA chain.
+}
+
 uint64_t
 GPUCommandProcessor::functionalReadHsaSignal(Addr signal_handle)
 {
diff --git a/src/gpu-compute/gpu_command_processor.hh b/src/gpu-compute/gpu_command_processor.hh
index bafe733ee1..d2ddf5c78f 100644
--- a/src/gpu-compute/gpu_command_processor.hh
+++ b/src/gpu-compute/gpu_command_processor.hh
@@ -106,9 +106,16 @@ class GPUCommandProcessor : public DmaVirtDevice
     AddrRangeList getAddrRanges() const override;
     System *system();
 
+    void sendCompletionSignal(Addr signal_handle);
     void updateHsaSignal(Addr signal_handle, uint64_t signal_value,
                          HsaSignalCallbackFunction function =
                             [] (const uint64_t &) { });
+    void updateHsaSignalAsync(Addr signal_handle, int64_t diff);
+    void updateHsaSignalData(Addr value_addr, int64_t diff,
+                             uint64_t *prev_value);
+    void updateHsaSignalDone(uint64_t *signal_value);
+    void updateHsaMailboxData(Addr signal_handle, uint64_t *mailbox_value);
+    void updateHsaEventData(Addr signal_handle, uint64_t *event_value);
 
     uint64_t functionalReadHsaSignal(Addr signal_handle);