gpu-compute: Adding support for LDS atomics

This changeset adds support for LDS atomics
and implements the DS_OR_B32 instruction.

Change-Id: I84c5cf6ce0e9494726dc7299f360551cd2a485f5
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/61791
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
This commit is contained in:
Alexandru Dutu
2022-07-15 15:58:07 -07:00
committed by Alexandru Duțu
parent 2816598831
commit c6b38909e1
5 changed files with 102 additions and 2 deletions

View File

@@ -34181,6 +34181,10 @@ namespace VegaISA
// Construct the ds_or_b32 instruction from its DS-format encoding and set
// the static flags that classify it.
Inst_DS__DS_OR_B32::Inst_DS__DS_OR_B32(InFmt_DS *iFmt)
: Inst_DS(iFmt, "ds_or_b32")
{
// NOTE(review): going by the flag names, this marks the instruction as a
// memory reference to the group segment that performs an atomic OR and
// returns a value -- confirm against the flag definitions.
setFlag(MemoryRef);
setFlag(GroupSegment);
setFlag(AtomicOr);
setFlag(AtomicReturn);
} // Inst_DS__DS_OR_B32
Inst_DS__DS_OR_B32::~Inst_DS__DS_OR_B32()
@@ -34195,8 +34199,60 @@ namespace VegaISA
/**
 * Issue ds_or_b32 to the local memory pipeline.
 *
 * Reads the ADDR and DATA0 vector operands, computes the per-lane addresses
 * with calcAddr(), copies DATA0 of every active lane into a_data and then
 * hands the instruction to the compute unit's localMemoryPipe.
 *
 * @param gpuDynInst the dynamic instruction being executed.
 */
void
Inst_DS__DS_OR_B32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();

    // With no active lanes nothing is sent to the local memory pipe, so
    // call decLGKMInstsIssued() (presumably undoing the count taken when
    // the instruction was issued) and finish early.
    if (gpuDynInst->exec_mask.none()) {
        wf->decLGKMInstsIssued();
        return;
    }

    gpuDynInst->execUnitId = wf->execUnitId;

    // Fixed latency of 24 compute-unit cycles, expressed in ticks.
    gpuDynInst->latency.init(gpuDynInst->computeUnit());
    gpuDynInst->latency.set(
        gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));

    ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);
    ConstVecOperandU32 data(gpuDynInst, extData.DATA0);

    addr.read();
    data.read();

    calcAddr(gpuDynInst, addr);

    // Stage the per-lane source operand in a_data; initAtomicAccess() later
    // builds each lane's atomic functor from this buffer.
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (gpuDynInst->exec_mask[lane]) {
            (reinterpret_cast<VecElemU32*>(gpuDynInst->a_data))[lane]
                = data[lane];
        }
    }

    gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
} // execute
/**
 * Start the LDS access for ds_or_b32: combine the two offset fields of the
 * encoding into a single offset (OFFSET1 in the bits above OFFSET0) and run
 * the 32-bit atomic access with it.
 */
void
Inst_DS__DS_OR_B32::initiateAcc(GPUDynInstPtr gpuDynInst)
{
    // Widen both fields to Addr before shifting and merging them.
    const Addr offset_lo = instData.OFFSET0;
    const Addr offset_hi = instData.OFFSET1;

    initAtomicAccess<VecElemU32>(gpuDynInst, (offset_hi << 8) | offset_lo);
} // initiateAcc
/**
 * Finish ds_or_b32: copy the per-lane values held in d_data into the VDST
 * vector operand for every active lane, then write the operand back.
 */
void
Inst_DS__DS_OR_B32::completeAcc(GPUDynInstPtr gpuDynInst)
{
    VecOperandU32 vdst(gpuDynInst, extData.VDST);

    // View the raw d_data buffer as one 32-bit element per lane.
    const VecElemU32 *lane_results =
        reinterpret_cast<VecElemU32*>(gpuDynInst->d_data);

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (!gpuDynInst->exec_mask[lane]) {
            continue;
        }
        vdst[lane] = lane_results[lane];
    }

    vdst.write();
} // completeAcc
// --- Inst_DS__DS_XOR_B32 class methods ---
Inst_DS__DS_XOR_B32::Inst_DS__DS_XOR_B32(InFmt_DS *iFmt)

View File

@@ -31496,6 +31496,8 @@ namespace VegaISA
} // getOperandSize
void execute(GPUDynInstPtr) override;
// Memory-access phases of ds_or_b32. Marked override, like execute(), so
// the compiler checks that they match the base-class virtuals.
// NOTE(review): assumes the base class declares both as virtual -- confirm.
void initiateAcc(GPUDynInstPtr gpuDynInst) override;
void completeAcc(GPUDynInstPtr gpuDynInst) override;
}; // Inst_DS__DS_OR_B32
class Inst_DS__DS_XOR_B32 : public Inst_DS

View File

@@ -504,6 +504,27 @@ namespace VegaISA
}
}
/**
 * Perform the per-lane LDS atomic for a DS instruction.
 *
 * For each active lane an atomic functor is built from that lane's elements
 * of a_data and x_data, applied to the wavefront's LDS chunk at
 * addr[lane] + offset, and the value returned by the chunk is stored in the
 * lane's element of d_data.
 *
 * @tparam T element type of the access.
 * @param gpuDynInst the dynamic instruction carrying addresses and data.
 * @param offset offset added to every lane's address.
 */
template<typename T>
void
initAtomicAccess(GPUDynInstPtr gpuDynInst, Addr offset)
{
    Wavefront *wave = gpuDynInst->wavefront();

    // Typed views of the raw per-lane buffers.
    T *const a_vals = reinterpret_cast<T*>(gpuDynInst->a_data);
    T *const x_vals = reinterpret_cast<T*>(gpuDynInst->x_data);
    T *const d_vals = reinterpret_cast<T*>(gpuDynInst->d_data);

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (!gpuDynInst->exec_mask[lane]) {
            continue;
        }

        const Addr lds_addr = gpuDynInst->addr[lane] + offset;

        AtomicOpFunctorPtr functor =
            gpuDynInst->makeAtomicOpFunctor<T>(&a_vals[lane],
                                               &x_vals[lane]);

        d_vals[lane] = wave->ldsChunk->atomic<T>(lds_addr,
                                                 std::move(functor));
    }
}
void
calcAddr(GPUDynInstPtr gpuDynInst, ConstVecOperandU32 &addr)
{

View File

@@ -101,6 +101,27 @@ class LdsChunk
*p0 = value;
}
/**
 * Perform an atomic read-modify-write on the element of type T that
 * starts at position index of this chunk.
 *
 * @tparam T type of the element being updated.
 * @param index position in the chunk where the element starts.
 * @param amoOp functor applied in place to the element's storage.
 * @return the element's value before amoOp was applied, or 0 when the
 *         access is dropped.
 */
template<class T>
T
atomic(const uint32_t index, AtomicOpFunctorPtr amoOp)
{
    /**
     * Atomics that are outside the bounds of the LDS
     * chunk allocated to this WG are dropped.
     */
    if (index >= chunk.size()) {
        return static_cast<T>(0);
    }

    /**
     * An element that starts inside the chunk but does not fit in the
     * storage remaining after index would be read and modified past the
     * end of the chunk, so it is dropped as well.
     */
    const auto bytes_left = (chunk.size() - index) * sizeof(chunk[0]);
    if (bytes_left < sizeof(T)) {
        return static_cast<T>(0);
    }

    T *p0 = reinterpret_cast<T *>(&(chunk.at(index)));
    // Capture the old value before the functor updates it in place.
    T tmp = *p0;

    (*amoOp)(reinterpret_cast<uint8_t *>(p0));

    return tmp;
}
/**
* get the size of this chunk
*/

View File

@@ -154,7 +154,7 @@ ScoreboardCheckStage::ready(Wavefront *w, nonrdytype_e *rdyStatus,
if (!(ii->isBarrier() || ii->isNop() || ii->isReturn() || ii->isBranch() ||
ii->isALU() || ii->isLoad() || ii->isStore() || ii->isAtomic() ||
ii->isEndOfKernel() || ii->isMemSync() || ii->isFlat() ||
ii->isFlatGlobal() || ii->isSleep())) {
ii->isFlatGlobal() || ii->isSleep() || ii->isLocalMem())) {
panic("next instruction: %s is of unknown type\n", ii->disassemble());
}