gpu-compute: Adding support for LDS atomics
This changeset adds support for LDS atomics and implements the DS_OR_B32 instruction. Change-Id: I84c5cf6ce0e9494726dc7299f360551cd2a485f5 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/61791 Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com> Maintainer: Matt Sinclair <mattdsinclair@gmail.com> Tested-by: kokoro <noreply+kokoro@google.com>
This commit is contained in:
committed by
Alexandru Duțu
parent
2816598831
commit
c6b38909e1
@@ -34181,6 +34181,10 @@ namespace VegaISA
|
||||
// Constructor: decode the DS-format instruction and mark it as a
// returning atomic-OR on the LDS (group) segment.
Inst_DS__DS_OR_B32::Inst_DS__DS_OR_B32(InFmt_DS *iFmt)
    : Inst_DS(iFmt, "ds_or_b32")
{
    setFlag(MemoryRef);     // references memory (LDS)
    setFlag(GroupSegment);  // operates on the workgroup-local segment
    setFlag(AtomicOr);      // read-modify-write: bitwise OR
    setFlag(AtomicReturn);  // returns the pre-op value to VDST
} // Inst_DS__DS_OR_B32
|
||||
|
||||
Inst_DS__DS_OR_B32::~Inst_DS__DS_OR_B32()
|
||||
@@ -34195,8 +34199,60 @@ namespace VegaISA
|
||||
void
|
||||
Inst_DS__DS_OR_B32::execute(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
panicUnimplemented();
|
||||
Wavefront *wf = gpuDynInst->wavefront();
|
||||
|
||||
if (gpuDynInst->exec_mask.none()) {
|
||||
wf->decLGKMInstsIssued();
|
||||
return;
|
||||
}
|
||||
|
||||
gpuDynInst->execUnitId = wf->execUnitId;
|
||||
gpuDynInst->latency.init(gpuDynInst->computeUnit());
|
||||
gpuDynInst->latency.set(
|
||||
gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
|
||||
ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);
|
||||
ConstVecOperandU32 data(gpuDynInst, extData.DATA0);
|
||||
|
||||
addr.read();
|
||||
data.read();
|
||||
|
||||
calcAddr(gpuDynInst, addr);
|
||||
|
||||
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
|
||||
if (gpuDynInst->exec_mask[lane]) {
|
||||
(reinterpret_cast<VecElemU32*>(gpuDynInst->a_data))[lane]
|
||||
= data[lane];
|
||||
}
|
||||
}
|
||||
|
||||
gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
|
||||
} // execute
|
||||
|
||||
void
|
||||
Inst_DS__DS_OR_B32::initiateAcc(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
Addr offset0 = instData.OFFSET0;
|
||||
Addr offset1 = instData.OFFSET1;
|
||||
Addr offset = (offset1 << 8) | offset0;
|
||||
|
||||
initAtomicAccess<VecElemU32>(gpuDynInst, offset);
|
||||
} // initiateAcc
|
||||
|
||||
void
|
||||
Inst_DS__DS_OR_B32::completeAcc(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
VecOperandU32 vdst(gpuDynInst, extData.VDST);
|
||||
|
||||
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
|
||||
if (gpuDynInst->exec_mask[lane]) {
|
||||
vdst[lane] = (reinterpret_cast<VecElemU32*>(
|
||||
gpuDynInst->d_data))[lane];
|
||||
}
|
||||
}
|
||||
|
||||
vdst.write();
|
||||
} // completeAcc
|
||||
|
||||
// --- Inst_DS__DS_XOR_B32 class methods ---
|
||||
|
||||
Inst_DS__DS_XOR_B32::Inst_DS__DS_XOR_B32(InFmt_DS *iFmt)
|
||||
|
||||
@@ -31496,6 +31496,8 @@ namespace VegaISA
|
||||
} // getOperandSize
|
||||
|
||||
void execute(GPUDynInstPtr) override;
|
||||
void initiateAcc(GPUDynInstPtr gpuDynInst);
|
||||
void completeAcc(GPUDynInstPtr gpuDynInst);
|
||||
}; // Inst_DS__DS_OR_B32
|
||||
|
||||
class Inst_DS__DS_XOR_B32 : public Inst_DS
|
||||
|
||||
@@ -504,6 +504,27 @@ namespace VegaISA
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
void
|
||||
initAtomicAccess(GPUDynInstPtr gpuDynInst, Addr offset)
|
||||
{
|
||||
Wavefront *wf = gpuDynInst->wavefront();
|
||||
|
||||
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
|
||||
if (gpuDynInst->exec_mask[lane]) {
|
||||
Addr vaddr = gpuDynInst->addr[lane] + offset;
|
||||
|
||||
AtomicOpFunctorPtr amo_op =
|
||||
gpuDynInst->makeAtomicOpFunctor<T>(
|
||||
&(reinterpret_cast<T*>(gpuDynInst->a_data))[lane],
|
||||
&(reinterpret_cast<T*>(gpuDynInst->x_data))[lane]);
|
||||
|
||||
(reinterpret_cast<T*>(gpuDynInst->d_data))[lane]
|
||||
= wf->ldsChunk->atomic<T>(vaddr, std::move(amo_op));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
calcAddr(GPUDynInstPtr gpuDynInst, ConstVecOperandU32 &addr)
|
||||
{
|
||||
|
||||
@@ -101,6 +101,27 @@ class LdsChunk
|
||||
*p0 = value;
|
||||
}
|
||||
|
||||
/**
|
||||
* an atomic operation
|
||||
*/
|
||||
template<class T>
|
||||
T
|
||||
atomic(const uint32_t index, AtomicOpFunctorPtr amoOp)
|
||||
{
|
||||
/**
|
||||
* Atomics that are outside the bounds of the LDS
|
||||
* chunk allocated to this WG are dropped.
|
||||
*/
|
||||
if (index >= chunk.size()) {
|
||||
return (T)0;
|
||||
}
|
||||
T *p0 = (T *) (&(chunk.at(index)));
|
||||
T tmp = *p0;
|
||||
|
||||
(*amoOp)((uint8_t *)p0);
|
||||
return tmp;
|
||||
}
|
||||
|
||||
/**
|
||||
* get the size of this chunk
|
||||
*/
|
||||
|
||||
@@ -154,7 +154,7 @@ ScoreboardCheckStage::ready(Wavefront *w, nonrdytype_e *rdyStatus,
|
||||
if (!(ii->isBarrier() || ii->isNop() || ii->isReturn() || ii->isBranch() ||
|
||||
ii->isALU() || ii->isLoad() || ii->isStore() || ii->isAtomic() ||
|
||||
ii->isEndOfKernel() || ii->isMemSync() || ii->isFlat() ||
|
||||
ii->isFlatGlobal() || ii->isSleep())) {
|
||||
ii->isFlatGlobal() || ii->isSleep() || ii->isLocalMem())) {
|
||||
panic("next instruction: %s is of unknown type\n", ii->disassemble());
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user