arch-vega: Update FLAT memory access helpers to support LDS

This patch ports the changes from a similar patch for arch-gcn3:
https://gem5-review.googlesource.com/c/public/gem5/+/48343. Vega already
has a helper function to send to the correct pipe depending on the
scope; however, the initMem helpers currently always assume global scope.

In addition, the MUBUF WBINVL1 instructions are updated in the same way
as in the GCN3 patch.

Change-Id: I612b9198cb56e226721a90e72bba64395c84ebcd
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/55465
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
This commit is contained in:
Matthew Poremba
2022-01-12 15:52:56 -06:00
parent ff17ecc177
commit 3ecd28a222
2 changed files with 91 additions and 7 deletions

View File

@@ -39848,7 +39848,13 @@ namespace VegaISA
gpuDynInst->execUnitId = wf->execUnitId;
gpuDynInst->latency.init(gpuDynInst->computeUnit());
gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst);
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe.
issueRequest(gpuDynInst);
} else {
fatal("Unsupported scope for flat instruction.\n");
}
} // execute
void
@@ -39901,7 +39907,13 @@ namespace VegaISA
gpuDynInst->execUnitId = wf->execUnitId;
gpuDynInst->latency.init(gpuDynInst->computeUnit());
gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst);
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe.
issueRequest(gpuDynInst);
} else {
fatal("Unsupported scope for flat instruction.\n");
}
} // execute
void
Inst_MUBUF__BUFFER_WBINVL1_VOL::initiateAcc(GPUDynInstPtr gpuDynInst)

View File

@@ -800,35 +800,107 @@ namespace VegaISA
/**
 * Start a read of one element of type T per lane for gpuDynInst.
 * T is expected to be a template parameter declared just above this
 * point; that declaration is not part of the visible hunk.
 *
 * When executedAs() is enums::SC_GLOBAL the request is handed to
 * initMemReqHelper with MemCmd::ReadReq. When it is enums::SC_GROUP the
 * value is read directly from the wavefront's ldsChunk at each active
 * lane's address and stored into d_data[lane]. The if / else-if chain
 * has no final else, so any other scope matches neither branch.
 *
 * @param gpuDynInst dynamic instruction supplying addr, exec_mask and
 *                   d_data.
 */
void
initMemRead(GPUDynInstPtr gpuDynInst)
{
// NOTE(review): this unconditional call duplicates the SC_GLOBAL branch
// below. In this marker-less diff view it looks like the pre-change line
// that the patch removes -- confirm against the real file.
initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq);
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
Wavefront *wf = gpuDynInst->wavefront();
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
// Lanes whose exec_mask bit is clear are skipped.
if (gpuDynInst->exec_mask[lane]) {
Addr vaddr = gpuDynInst->addr[lane];
// d_data is viewed as an array of T indexed by lane.
(reinterpret_cast<T*>(gpuDynInst->d_data))[lane]
= wf->ldsChunk->read<T>(vaddr);
}
}
}
}
/**
 * Start a read of N consecutive VecElemU32 values per lane for
 * gpuDynInst.
 *
 * When executedAs() is enums::SC_GLOBAL the request is handed to
 * initMemReqHelper with MemCmd::ReadReq. When it is enums::SC_GROUP each
 * active lane reads N dwords from the wavefront's ldsChunk, starting at
 * the lane's address, into d_data. The if / else-if chain has no final
 * else, so any other scope matches neither branch.
 *
 * @tparam N number of VecElemU32 elements accessed per lane.
 * @param gpuDynInst dynamic instruction supplying addr, exec_mask and
 *                   d_data.
 */
template<int N>
void
initMemRead(GPUDynInstPtr gpuDynInst)
{
// NOTE(review): this unconditional call duplicates the SC_GLOBAL branch
// below. In this marker-less diff view it looks like the pre-change line
// that the patch removes -- confirm against the real file.
initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::ReadReq);
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::ReadReq);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
Wavefront *wf = gpuDynInst->wavefront();
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
// Lanes whose exec_mask bit is clear are skipped.
if (gpuDynInst->exec_mask[lane]) {
Addr vaddr = gpuDynInst->addr[lane];
// Element i of a lane lives at d_data index lane * N + i and is
// read from byte offset i * sizeof(VecElemU32) past vaddr.
for (int i = 0; i < N; ++i) {
(reinterpret_cast<VecElemU32*>(
gpuDynInst->d_data))[lane * N + i]
= wf->ldsChunk->read<VecElemU32>(
vaddr + i*sizeof(VecElemU32));
}
}
}
}
}
/**
 * Start a write of one element of type T per lane for gpuDynInst.
 *
 * When executedAs() is enums::SC_GLOBAL the request is handed to
 * initMemReqHelper with MemCmd::WriteReq. When it is enums::SC_GROUP each
 * active lane writes d_data[lane] to the wavefront's ldsChunk at the
 * lane's address. The if / else-if chain has no final else, so any other
 * scope matches neither branch.
 *
 * @tparam T element type written per lane.
 * @param gpuDynInst dynamic instruction supplying addr, exec_mask and
 *                   d_data.
 */
template<typename T>
void
initMemWrite(GPUDynInstPtr gpuDynInst)
{
// NOTE(review): this unconditional call duplicates the SC_GLOBAL branch
// below. In this marker-less diff view it looks like the pre-change line
// that the patch removes -- confirm against the real file.
initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq);
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
Wavefront *wf = gpuDynInst->wavefront();
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
// Lanes whose exec_mask bit is clear are skipped.
if (gpuDynInst->exec_mask[lane]) {
Addr vaddr = gpuDynInst->addr[lane];
// d_data is viewed as an array of T indexed by lane.
wf->ldsChunk->write<T>(vaddr,
(reinterpret_cast<T*>(gpuDynInst->d_data))[lane]);
}
}
}
}
/**
 * Start a write of N consecutive VecElemU32 values per lane for
 * gpuDynInst.
 *
 * When executedAs() is enums::SC_GLOBAL the request is handed to
 * initMemReqHelper with MemCmd::WriteReq. When it is enums::SC_GROUP each
 * active lane writes N dwords from d_data to the wavefront's ldsChunk,
 * starting at the lane's address. The if / else-if chain has no final
 * else, so any other scope matches neither branch.
 *
 * @tparam N number of VecElemU32 elements accessed per lane.
 * @param gpuDynInst dynamic instruction supplying addr, exec_mask and
 *                   d_data.
 */
template<int N>
void
initMemWrite(GPUDynInstPtr gpuDynInst)
{
// NOTE(review): this unconditional call duplicates the SC_GLOBAL branch
// below. In this marker-less diff view it looks like the pre-change line
// that the patch removes -- confirm against the real file.
initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::WriteReq);
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::WriteReq);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
Wavefront *wf = gpuDynInst->wavefront();
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
// Lanes whose exec_mask bit is clear are skipped.
if (gpuDynInst->exec_mask[lane]) {
Addr vaddr = gpuDynInst->addr[lane];
// Element i of a lane comes from d_data index lane * N + i and is
// written at byte offset i * sizeof(VecElemU32) past vaddr.
for (int i = 0; i < N; ++i) {
wf->ldsChunk->write<VecElemU32>(
vaddr + i*sizeof(VecElemU32),
(reinterpret_cast<VecElemU32*>(
gpuDynInst->d_data))[lane * N + i]);
}
}
}
}
}
/**
 * Start an atomic access on one element of type T per lane for
 * gpuDynInst.
 *
 * When executedAs() is enums::SC_GLOBAL the request is handed to
 * initMemReqHelper with MemCmd::SwapReq and a trailing `true` argument.
 * When it is enums::SC_GROUP each active lane does a read-modify-write
 * against the wavefront's ldsChunk: it reads the current value, applies
 * the functor built by makeAtomicOpFunctor<T> from a_data[lane] and
 * x_data[lane], and writes the result back. The if / else-if chain has
 * no final else, so any other scope matches neither branch.
 *
 * @tparam T element type operated on per lane.
 * @param gpuDynInst dynamic instruction supplying addr, exec_mask,
 *                   a_data, x_data and d_data.
 */
template<typename T>
void
initAtomicAccess(GPUDynInstPtr gpuDynInst)
{
// NOTE(review): this unconditional call duplicates the SC_GLOBAL branch
// below. In this marker-less diff view it looks like the pre-change line
// that the patch removes -- confirm against the real file.
initMemReqHelper<T, 1>(gpuDynInst, MemCmd::SwapReq, true);
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
initMemReqHelper<T, 1>(gpuDynInst, MemCmd::SwapReq, true);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
Wavefront *wf = gpuDynInst->wavefront();
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
// Lanes whose exec_mask bit is clear are skipped.
if (gpuDynInst->exec_mask[lane]) {
Addr vaddr = gpuDynInst->addr[lane];
// The functor is given pointers to this lane's a_data and x_data
// entries.
auto amo_op =
gpuDynInst->makeAtomicOpFunctor<T>(
&(reinterpret_cast<T*>(
gpuDynInst->a_data))[lane],
&(reinterpret_cast<T*>(
gpuDynInst->x_data))[lane]);
// Read, apply the functor in place on the local copy, write back.
T tmp = wf->ldsChunk->read<T>(vaddr);
(*amo_op)(reinterpret_cast<uint8_t *>(&tmp));
wf->ldsChunk->write<T>(vaddr, tmp);
// NOTE(review): d_data receives tmp after the functor has run on it
// (the value that was written back), not the value originally read --
// confirm this matches what callers expect as the atomic's return data.
(reinterpret_cast<T*>(gpuDynInst->d_data))[lane] = tmp;
}
}
}
}
void