arch-gcn3: Implement LDS accesses in Flat instructions

Add support for LDS accesses by allowing Flat instructions to dispatch
into the local memory pipeline if the requested address is in the group
aperture.

This requires implementing LDS accesses in the Flat initMemRead/Write
functions, in a similar fashion to the DS functions of the same name.

Because flat instructions can now potentially dispatch to the local
memory pipeline, this change also adds a check to reclaim any tokens
that were requested when the instruction was issued as a flat access.

Change-Id: Id26191f7ee43291a5e5ca5f39af06af981ec23ab
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/48343
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Reviewed-by: Matthew Poremba <matthew.poremba@amd.com>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
This commit is contained in:
Kyle Roarty
2021-07-20 14:41:17 -05:00
committed by Matt Sinclair
parent 523a92f7f0
commit 9a7fc4ff69
4 changed files with 184 additions and 32 deletions

View File

@@ -36314,7 +36314,7 @@ namespace Gcn3ISA
gpuDynInst->computeUnit()->globalMemoryPipe.
issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
}
@@ -36363,7 +36363,7 @@ namespace Gcn3ISA
gpuDynInst->computeUnit()->globalMemoryPipe.
issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
}
void
@@ -39384,8 +39384,11 @@ namespace Gcn3ISA
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe
.issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
} // execute
@@ -39448,8 +39451,11 @@ namespace Gcn3ISA
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe
.issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
}
@@ -39511,8 +39517,11 @@ namespace Gcn3ISA
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe
.issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
}
@@ -39603,8 +39612,11 @@ namespace Gcn3ISA
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe
.issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
}
@@ -39667,8 +39679,11 @@ namespace Gcn3ISA
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe
.issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
}
@@ -39731,8 +39746,11 @@ namespace Gcn3ISA
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe
.issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
}
@@ -39804,8 +39822,11 @@ namespace Gcn3ISA
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe
.issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
}
@@ -39889,8 +39910,11 @@ namespace Gcn3ISA
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe
.issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
} // execute
@@ -39952,8 +39976,11 @@ namespace Gcn3ISA
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe
.issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
}
@@ -40015,8 +40042,11 @@ namespace Gcn3ISA
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe
.issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
}
@@ -40079,8 +40109,11 @@ namespace Gcn3ISA
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe
.issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
}
@@ -40151,8 +40184,11 @@ namespace Gcn3ISA
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe
.issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
}
@@ -40227,8 +40263,11 @@ namespace Gcn3ISA
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe
.issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
}
@@ -40294,8 +40333,11 @@ namespace Gcn3ISA
"Flats to private aperture not tested yet\n");
gpuDynInst->computeUnit()->globalMemoryPipe.
issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
ConstVecOperandU32 data(gpuDynInst, extData.DATA);
@@ -40408,8 +40450,11 @@ namespace Gcn3ISA
"Flats to private aperture not tested yet\n");
gpuDynInst->computeUnit()->globalMemoryPipe.
issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
}
@@ -40492,8 +40537,11 @@ namespace Gcn3ISA
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe.
issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
}
@@ -40576,8 +40624,11 @@ namespace Gcn3ISA
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe.
issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
}
void
@@ -40834,8 +40885,11 @@ namespace Gcn3ISA
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe.
issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
}
@@ -40918,8 +40972,11 @@ namespace Gcn3ISA
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe.
issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
}
@@ -41044,8 +41101,11 @@ namespace Gcn3ISA
"Flats to private aperture not tested yet\n");
gpuDynInst->computeUnit()->globalMemoryPipe.
issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
}
@@ -41129,8 +41189,11 @@ namespace Gcn3ISA
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe.
issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
}
@@ -41215,8 +41278,11 @@ namespace Gcn3ISA
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe.
issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
}
@@ -41483,8 +41549,11 @@ namespace Gcn3ISA
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe.
issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
}
@@ -41570,8 +41639,11 @@ namespace Gcn3ISA
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe.
issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
}

View File

@@ -799,35 +799,107 @@ namespace Gcn3ISA
void
initMemRead(GPUDynInstPtr gpuDynInst)
{
initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq);
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
Wavefront *wf = gpuDynInst->wavefront();
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (gpuDynInst->exec_mask[lane]) {
Addr vaddr = gpuDynInst->addr[lane];
(reinterpret_cast<T*>(gpuDynInst->d_data))[lane]
= wf->ldsChunk->read<T>(vaddr);
}
}
}
}
template<int N>
void
initMemRead(GPUDynInstPtr gpuDynInst)
{
initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::ReadReq);
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::ReadReq);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
Wavefront *wf = gpuDynInst->wavefront();
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (gpuDynInst->exec_mask[lane]) {
Addr vaddr = gpuDynInst->addr[lane];
for (int i = 0; i < N; ++i) {
(reinterpret_cast<VecElemU32*>(
gpuDynInst->d_data))[lane * N + i]
= wf->ldsChunk->read<VecElemU32>(
vaddr + i*sizeof(VecElemU32));
}
}
}
}
}
template<typename T>
void
initMemWrite(GPUDynInstPtr gpuDynInst)
{
initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq);
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
Wavefront *wf = gpuDynInst->wavefront();
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (gpuDynInst->exec_mask[lane]) {
Addr vaddr = gpuDynInst->addr[lane];
wf->ldsChunk->write<T>(vaddr,
(reinterpret_cast<T*>(gpuDynInst->d_data))[lane]);
}
}
}
}
template<int N>
void
initMemWrite(GPUDynInstPtr gpuDynInst)
{
initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::WriteReq);
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::WriteReq);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
Wavefront *wf = gpuDynInst->wavefront();
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (gpuDynInst->exec_mask[lane]) {
Addr vaddr = gpuDynInst->addr[lane];
for (int i = 0; i < N; ++i) {
wf->ldsChunk->write<VecElemU32>(
vaddr + i*sizeof(VecElemU32),
(reinterpret_cast<VecElemU32*>(
gpuDynInst->d_data))[lane * N + i]);
}
}
}
}
}
template<typename T>
void
initAtomicAccess(GPUDynInstPtr gpuDynInst)
{
initMemReqHelper<T, 1>(gpuDynInst, MemCmd::SwapReq, true);
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
initMemReqHelper<T, 1>(gpuDynInst, MemCmd::SwapReq, true);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
Wavefront *wf = gpuDynInst->wavefront();
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (gpuDynInst->exec_mask[lane]) {
Addr vaddr = gpuDynInst->addr[lane];
AtomicOpFunctor* amo_op =
gpuDynInst->makeAtomicOpFunctor<T>(
&(reinterpret_cast<T*>(
gpuDynInst->a_data))[lane],
&(reinterpret_cast<T*>(
gpuDynInst->x_data))[lane]).get();
T tmp = wf->ldsChunk->read<T>(vaddr);
(*amo_op)(reinterpret_cast<uint8_t *>(&tmp));
wf->ldsChunk->write<T>(vaddr, tmp);
(reinterpret_cast<T*>(gpuDynInst->d_data))[lane] = tmp;
}
}
}
}
void

View File

@@ -834,7 +834,10 @@ GPUDynInst::resolveFlatSegment(const VectorMask &mask)
if (mask[lane]) {
// flat address calculation goes here.
// addr[lane] = segmented address
panic("Flat group memory operation is unimplemented!\n");
addr[lane] = addr[lane] -
wavefront()->computeUnit->shader->ldsApe().base;
assert(addr[lane] <
wavefront()->computeUnit->getLds().getAddrRange().size());
}
}
wavefront()->execUnitId = wavefront()->flatLmUnitId;

View File

@@ -76,6 +76,11 @@ LocalMemPipeline::exec()
lmReturnedRequests.pop();
w = m->wavefront();
if (m->isFlat() && !m->isMemSync() && !m->isEndOfKernel()
&& m->allLanesZero()) {
computeUnit.getTokenManager()->recvTokens(1);
}
DPRINTF(GPUMem, "CU%d: WF[%d][%d]: Completing local mem instr %s\n",
m->cu_id, m->simdId, m->wfSlotId, m->disassemble());
m->completeAcc(m);