gpu-compute: Adding support for LDS atomics
This changeset adds support for LDS atomics and implements the DS_OR_B32 instruction. Change-Id: I84c5cf6ce0e9494726dc7299f360551cd2a485f5 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/61791 Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com> Maintainer: Matt Sinclair <mattdsinclair@gmail.com> Tested-by: kokoro <noreply+kokoro@google.com>
This commit is contained in:
committed by
Alexandru Duțu
parent
2816598831
commit
c6b38909e1
@@ -34181,6 +34181,10 @@ namespace VegaISA
|
||||
// Constructor: decode the DS-format instruction and mark it as a
// returning atomic-OR on the LDS (group) segment.
Inst_DS__DS_OR_B32::Inst_DS__DS_OR_B32(InFmt_DS *iFmt)
    : Inst_DS(iFmt, "ds_or_b32")
{
    setFlag(MemoryRef);     // references memory (LDS)
    setFlag(GroupSegment);  // operates on the workgroup-local segment
    setFlag(AtomicOr);      // read-modify-write: bitwise OR
    setFlag(AtomicReturn);  // returns the pre-op value to VDST
} // Inst_DS__DS_OR_B32
|
||||
|
||||
Inst_DS__DS_OR_B32::~Inst_DS__DS_OR_B32()
|
||||
@@ -34195,8 +34199,60 @@ namespace VegaISA
|
||||
void
|
||||
Inst_DS__DS_OR_B32::execute(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
panicUnimplemented();
|
||||
Wavefront *wf = gpuDynInst->wavefront();
|
||||
|
||||
if (gpuDynInst->exec_mask.none()) {
|
||||
wf->decLGKMInstsIssued();
|
||||
return;
|
||||
}
|
||||
|
||||
gpuDynInst->execUnitId = wf->execUnitId;
|
||||
gpuDynInst->latency.init(gpuDynInst->computeUnit());
|
||||
gpuDynInst->latency.set(
|
||||
gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
|
||||
ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);
|
||||
ConstVecOperandU32 data(gpuDynInst, extData.DATA0);
|
||||
|
||||
addr.read();
|
||||
data.read();
|
||||
|
||||
calcAddr(gpuDynInst, addr);
|
||||
|
||||
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
|
||||
if (gpuDynInst->exec_mask[lane]) {
|
||||
(reinterpret_cast<VecElemU32*>(gpuDynInst->a_data))[lane]
|
||||
= data[lane];
|
||||
}
|
||||
}
|
||||
|
||||
gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
|
||||
} // execute
|
||||
|
||||
void
|
||||
Inst_DS__DS_OR_B32::initiateAcc(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
Addr offset0 = instData.OFFSET0;
|
||||
Addr offset1 = instData.OFFSET1;
|
||||
Addr offset = (offset1 << 8) | offset0;
|
||||
|
||||
initAtomicAccess<VecElemU32>(gpuDynInst, offset);
|
||||
} // initiateAcc
|
||||
|
||||
void
|
||||
Inst_DS__DS_OR_B32::completeAcc(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
VecOperandU32 vdst(gpuDynInst, extData.VDST);
|
||||
|
||||
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
|
||||
if (gpuDynInst->exec_mask[lane]) {
|
||||
vdst[lane] = (reinterpret_cast<VecElemU32*>(
|
||||
gpuDynInst->d_data))[lane];
|
||||
}
|
||||
}
|
||||
|
||||
vdst.write();
|
||||
} // completeAcc
|
||||
|
||||
// --- Inst_DS__DS_XOR_B32 class methods ---
|
||||
|
||||
Inst_DS__DS_XOR_B32::Inst_DS__DS_XOR_B32(InFmt_DS *iFmt)
|
||||
|
||||
@@ -31496,6 +31496,8 @@ namespace VegaISA
|
||||
} // getOperandSize
|
||||
|
||||
void execute(GPUDynInstPtr) override;
|
||||
void initiateAcc(GPUDynInstPtr gpuDynInst);
|
||||
void completeAcc(GPUDynInstPtr gpuDynInst);
|
||||
}; // Inst_DS__DS_OR_B32
|
||||
|
||||
class Inst_DS__DS_XOR_B32 : public Inst_DS
|
||||
|
||||
@@ -504,6 +504,27 @@ namespace VegaISA
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
void
|
||||
initAtomicAccess(GPUDynInstPtr gpuDynInst, Addr offset)
|
||||
{
|
||||
Wavefront *wf = gpuDynInst->wavefront();
|
||||
|
||||
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
|
||||
if (gpuDynInst->exec_mask[lane]) {
|
||||
Addr vaddr = gpuDynInst->addr[lane] + offset;
|
||||
|
||||
AtomicOpFunctorPtr amo_op =
|
||||
gpuDynInst->makeAtomicOpFunctor<T>(
|
||||
&(reinterpret_cast<T*>(gpuDynInst->a_data))[lane],
|
||||
&(reinterpret_cast<T*>(gpuDynInst->x_data))[lane]);
|
||||
|
||||
(reinterpret_cast<T*>(gpuDynInst->d_data))[lane]
|
||||
= wf->ldsChunk->atomic<T>(vaddr, std::move(amo_op));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
calcAddr(GPUDynInstPtr gpuDynInst, ConstVecOperandU32 &addr)
|
||||
{
|
||||
|
||||
@@ -101,6 +101,27 @@ class LdsChunk
|
||||
*p0 = value;
|
||||
}
|
||||
|
||||
/**
|
||||
* an atomic operation
|
||||
*/
|
||||
template<class T>
|
||||
T
|
||||
atomic(const uint32_t index, AtomicOpFunctorPtr amoOp)
|
||||
{
|
||||
/**
|
||||
* Atomics that are outside the bounds of the LDS
|
||||
* chunk allocated to this WG are dropped.
|
||||
*/
|
||||
if (index >= chunk.size()) {
|
||||
return (T)0;
|
||||
}
|
||||
T *p0 = (T *) (&(chunk.at(index)));
|
||||
T tmp = *p0;
|
||||
|
||||
(*amoOp)((uint8_t *)p0);
|
||||
return tmp;
|
||||
}
|
||||
|
||||
/**
|
||||
* get the size of this chunk
|
||||
*/
|
||||
|
||||
@@ -154,7 +154,7 @@ ScoreboardCheckStage::ready(Wavefront *w, nonrdytype_e *rdyStatus,
|
||||
if (!(ii->isBarrier() || ii->isNop() || ii->isReturn() || ii->isBranch() ||
|
||||
ii->isALU() || ii->isLoad() || ii->isStore() || ii->isAtomic() ||
|
||||
ii->isEndOfKernel() || ii->isMemSync() || ii->isFlat() ||
|
||||
ii->isFlatGlobal() || ii->isSleep())) {
|
||||
ii->isFlatGlobal() || ii->isSleep() || ii->isLocalMem())) {
|
||||
panic("next instruction: %s is of unknown type\n", ii->disassemble());
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user