dev-amdgpu: Track outstanding chunks in mem manager

Requests sent using the GPU memory manager are not guaranteed to complete in order. As a result, the last chunk created by the chunk generator could complete before all of the previous chunks are done. When that happens, the final callback is triggered early, and an SDMA/PM4/etc. packet that is waiting for the request to complete may resume before the data is ready. To prevent this, the memory manager now tracks the outstanding chunks of each request.

This is likely a fix for verification failures in many applications. So far it has been tested on MatrixTranspose from the HIP cookbook, which now passes its verification step. It could also potentially fix other race conditions between reads and writes to memory, such as using a PTE or PDE before it has been written.

Change-Id: Id6fb342d899db6bd0b86c80056ecf91eeb3026f5
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/62714
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Reviewed-by: Jason Lowe-Power <power.jg@gmail.com>
Maintainer: Jason Lowe-Power <power.jg@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
2022-08-25 16:03:17 -07:00
parent 432329c853
commit 404aa34855
2 changed files with 73 additions and 18 deletions
--- a/src/dev/amdgpu/memory_manager.hh
+++ b/src/dev/amdgpu/memory_manager.hh
@@ -33,6 +33,7 @@
 #define __DEV_AMDGPU_MEMORY_MANAGER_HH__

 #include <deque>
+#include <unordered_map>

 #include "base/callback.hh"
 #include "mem/port.hh"
@@ -46,9 +47,9 @@ class AMDGPUMemoryManager : public ClockedObject
 {
    class GPUMemPort : public MasterPort
    {
-        public:
-        GPUMemPort(const std::string &_name, AMDGPUMemoryManager *_gpuMemMgr)
-            : MasterPort(_name, _gpuMemMgr)
+      public:
+        GPUMemPort(const std::string &_name, AMDGPUMemoryManager &_gpuMemMgr)
+            : MasterPort(_name, &_gpuMemMgr), gpu_mem(_gpuMemMgr)
        {
        }

@@ -57,21 +58,35 @@ class AMDGPUMemoryManager : public ClockedObject

        struct SenderState : public Packet::SenderState
        {
-            SenderState(Event *callback, Addr addr)
-                : _callback(callback), _addr(addr)
+            SenderState(Event *callback, Addr addr, uint64_t requestId)
+                : _callback(callback), _addr(addr), _requestId(requestId)
            {}

            Event *_callback;
            Addr _addr;
+            uint64_t _requestId;
        };

        std::deque<PacketPtr> retries;
+        AMDGPUMemoryManager &gpu_mem;
    };

    GPUMemPort _gpuMemPort;
    const int cacheLineSize;
    const RequestorID _requestorId;

+    struct RequestStatus
+    {
+        RequestStatus() : outstandingChunks(0), sentLastChunk(false)
+        { }
+
+        uint64_t outstandingChunks;
+        bool sentLastChunk;
+    };
+
+    uint64_t requestId = 0;
+    std::unordered_map<uint64_t, RequestStatus> requestStatus;
+
  public:
    AMDGPUMemoryManager(const AMDGPUMemoryManagerParams &p);
    ~AMDGPUMemoryManager() {};