arch-gcn3: add support for flat atomic adds, subs, incs, decs

Add support for all missing flat atomic adds, subtracts, increments,
and decrements, including their x2 variants.

Change-Id: I37a67fcacca91a09a82be6597facaa366105d2dc
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/31974
Reviewed-by: Anthony Gutierrez <anthony.gutierrez@amd.com>
Maintainer: Anthony Gutierrez <anthony.gutierrez@amd.com>
Tested-by: kokoro <noreply+kokoro@google.com>
This commit is contained in:
Matt Sinclair
2020-07-29 15:16:53 -05:00
parent 9a250990cc
commit 4d84590dee
2 changed files with 410 additions and 6 deletions

View File

@@ -40643,8 +40643,72 @@ namespace Gcn3ISA
/**
 * Execute stage of Inst_FLAT__FLAT_ATOMIC_SUB.
 *
 * Reads the address (64-bit) and data (32-bit) vector operands, computes
 * the per-lane addresses via calcAddr(), stages the data operand of every
 * active lane in gpuDynInst->a_data, and hands the instruction to the
 * compute unit's global memory pipeline.
 *
 * Only accesses that execute as Enums::SC_GLOBAL are supported; anything
 * else terminates the simulation through fatal().
 *
 * @param gpuDynInst The dynamic instruction being executed.
 */
void
Inst_FLAT__FLAT_ATOMIC_SUB::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();

    // With no active lane there is nothing to send to memory: roll back
    // the per-wavefront counters (presumably bumped when the instruction
    // was issued -- confirm against the issue logic) and stop here.
    if (wf->execMask().none()) {
        wf->decVMemInstsIssued();
        wf->decLGKMInstsIssued();
        wf->wrGmReqsInPipe--;
        wf->rdGmReqsInPipe--;
        return;
    }

    gpuDynInst->execUnitId = wf->execUnitId;
    gpuDynInst->exec_mask = wf->execMask();
    gpuDynInst->latency.init(gpuDynInst->computeUnit());
    gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

    ConstVecOperandU64 addr(gpuDynInst, extData.ADDR);
    ConstVecOperandU32 data(gpuDynInst, extData.DATA);

    addr.read();
    data.read();

    calcAddr(gpuDynInst, addr);

    // Stage the atomic operand of each active lane in a_data; slots of
    // inactive lanes are left untouched.
    VecElemU32 *atomicSrc =
        reinterpret_cast<VecElemU32*>(gpuDynInst->a_data);
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (gpuDynInst->exec_mask[lane]) {
            atomicSrc[lane] = data[lane];
        }
    }

    if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) {
        gpuDynInst->computeUnit()->globalMemoryPipe.
            issueRequest(gpuDynInst);
        // The request leaves the in-pipe counters and is tracked as
        // outstanding from now on, on both the write and the read side.
        wf->wrGmReqsInPipe--;
        wf->outstandingReqsWrGm++;
        wf->rdGmReqsInPipe--;
        wf->outstandingReqsRdGm++;
    } else {
        fatal("Non global flat instructions not implemented yet.\n");
    }

    wf->outstandingReqs++;
    wf->validateRequestCounters();
} // execute
/**
 * Memory-access initiation for Inst_FLAT__FLAT_ATOMIC_SUB. All the work
 * is delegated to initAtomicAccess(), instantiated for VecElemU32.
 *
 * @param gpuDynInst The dynamic instruction whose access is initiated.
 */
void
Inst_FLAT__FLAT_ATOMIC_SUB::initiateAcc(GPUDynInstPtr gpuDynInst)
{
    // Element type handed to the atomic-access helper.
    using ElemType = VecElemU32;

    initAtomicAccess<ElemType>(gpuDynInst);
} // initiateAcc
/**
 * Completion stage of Inst_FLAT__FLAT_ATOMIC_SUB.
 *
 * When isAtomicRet() holds, the per-lane values found in
 * gpuDynInst->d_data are copied into the VDST vector operand for every
 * lane enabled in the exec mask, and the operand is written. Otherwise
 * this is a no-op.
 *
 * @param gpuDynInst The dynamic instruction whose access completed.
 */
void
Inst_FLAT__FLAT_ATOMIC_SUB::completeAcc(GPUDynInstPtr gpuDynInst)
{
    // Nothing to write back unless this is a returning atomic.
    if (!isAtomicRet()) {
        return;
    }

    VecOperandU32 vdst(gpuDynInst, extData.VDST);
    VecElemU32 *returned = reinterpret_cast<VecElemU32*>(gpuDynInst->d_data);

    for (int i = 0; i < NumVecElemPerVecReg; ++i) {
        if (!gpuDynInst->exec_mask[i]) {
            continue;
        }
        vdst[i] = returned[i];
    }

    vdst.write();
} // completeAcc
Inst_FLAT__FLAT_ATOMIC_SMIN::Inst_FLAT__FLAT_ATOMIC_SMIN(InFmt_FLAT *iFmt)
: Inst_FLAT(iFmt, "flat_atomic_smin")
@@ -40843,9 +40907,74 @@ namespace Gcn3ISA
/**
 * Execute stage of Inst_FLAT__FLAT_ATOMIC_INC.
 *
 * Reads the address (64-bit) and data (32-bit) vector operands, computes
 * the per-lane addresses via calcAddr(), stages the data operand of every
 * active lane in gpuDynInst->a_data, and hands the instruction to the
 * compute unit's global memory pipeline.
 *
 * Only accesses that execute as Enums::SC_GLOBAL are supported; anything
 * else terminates the simulation through fatal().
 *
 * @param gpuDynInst The dynamic instruction being executed.
 */
void
Inst_FLAT__FLAT_ATOMIC_INC::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();

    // With no active lane there is nothing to send to memory: roll back
    // the per-wavefront counters (presumably bumped when the instruction
    // was issued -- confirm against the issue logic) and stop here.
    if (wf->execMask().none()) {
        wf->decVMemInstsIssued();
        wf->decLGKMInstsIssued();
        wf->wrGmReqsInPipe--;
        wf->rdGmReqsInPipe--;
        return;
    }

    gpuDynInst->execUnitId = wf->execUnitId;
    gpuDynInst->exec_mask = wf->execMask();
    gpuDynInst->latency.init(gpuDynInst->computeUnit());
    gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

    ConstVecOperandU64 addr(gpuDynInst, extData.ADDR);
    ConstVecOperandU32 data(gpuDynInst, extData.DATA);

    addr.read();
    data.read();

    calcAddr(gpuDynInst, addr);

    // Stage the atomic operand of each active lane in a_data; slots of
    // inactive lanes are left untouched.
    VecElemU32 *atomicSrc =
        reinterpret_cast<VecElemU32*>(gpuDynInst->a_data);
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (gpuDynInst->exec_mask[lane]) {
            atomicSrc[lane] = data[lane];
        }
    }

    if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) {
        gpuDynInst->computeUnit()->globalMemoryPipe.
            issueRequest(gpuDynInst);
        // The request leaves the in-pipe counters and is tracked as
        // outstanding from now on, on both the write and the read side.
        wf->wrGmReqsInPipe--;
        wf->outstandingReqsWrGm++;
        wf->rdGmReqsInPipe--;
        wf->outstandingReqsRdGm++;
    } else {
        fatal("Non global flat instructions not implemented yet.\n");
    }

    wf->outstandingReqs++;
    wf->validateRequestCounters();
} // execute
/**
 * Memory-access initiation for Inst_FLAT__FLAT_ATOMIC_INC. All the work
 * is delegated to initAtomicAccess(), instantiated for VecElemU32.
 *
 * @param gpuDynInst The dynamic instruction whose access is initiated.
 */
void
Inst_FLAT__FLAT_ATOMIC_INC::initiateAcc(GPUDynInstPtr gpuDynInst)
{
    // Element type handed to the atomic-access helper.
    using ElemType = VecElemU32;

    initAtomicAccess<ElemType>(gpuDynInst);
} // initiateAcc
/**
 * Completion stage of Inst_FLAT__FLAT_ATOMIC_INC.
 *
 * When isAtomicRet() holds, the per-lane values found in
 * gpuDynInst->d_data are copied into the VDST vector operand for every
 * lane enabled in the exec mask, and the operand is written. Otherwise
 * this is a no-op.
 *
 * @param gpuDynInst The dynamic instruction whose access completed.
 */
void
Inst_FLAT__FLAT_ATOMIC_INC::completeAcc(GPUDynInstPtr gpuDynInst)
{
    // Nothing to write back unless this is a returning atomic.
    if (!isAtomicRet()) {
        return;
    }

    VecOperandU32 vdst(gpuDynInst, extData.VDST);
    VecElemU32 *returned = reinterpret_cast<VecElemU32*>(gpuDynInst->d_data);

    for (int i = 0; i < NumVecElemPerVecReg; ++i) {
        if (!gpuDynInst->exec_mask[i]) {
            continue;
        }
        vdst[i] = returned[i];
    }

    vdst.write();
} // completeAcc
Inst_FLAT__FLAT_ATOMIC_DEC::Inst_FLAT__FLAT_ATOMIC_DEC(InFmt_FLAT *iFmt)
: Inst_FLAT(iFmt, "flat_atomic_dec")
{
@@ -40868,9 +40997,74 @@ namespace Gcn3ISA
/**
 * Execute stage of Inst_FLAT__FLAT_ATOMIC_DEC.
 *
 * Reads the address (64-bit) and data (32-bit) vector operands, computes
 * the per-lane addresses via calcAddr(), stages the data operand of every
 * active lane in gpuDynInst->a_data, and hands the instruction to the
 * compute unit's global memory pipeline.
 *
 * Only accesses that execute as Enums::SC_GLOBAL are supported; anything
 * else terminates the simulation through fatal().
 *
 * @param gpuDynInst The dynamic instruction being executed.
 */
void
Inst_FLAT__FLAT_ATOMIC_DEC::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();

    // With no active lane there is nothing to send to memory: roll back
    // the per-wavefront counters (presumably bumped when the instruction
    // was issued -- confirm against the issue logic) and stop here.
    if (wf->execMask().none()) {
        wf->decVMemInstsIssued();
        wf->decLGKMInstsIssued();
        wf->wrGmReqsInPipe--;
        wf->rdGmReqsInPipe--;
        return;
    }

    gpuDynInst->execUnitId = wf->execUnitId;
    gpuDynInst->exec_mask = wf->execMask();
    gpuDynInst->latency.init(gpuDynInst->computeUnit());
    gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

    ConstVecOperandU64 addr(gpuDynInst, extData.ADDR);
    ConstVecOperandU32 data(gpuDynInst, extData.DATA);

    addr.read();
    data.read();

    calcAddr(gpuDynInst, addr);

    // Stage the atomic operand of each active lane in a_data; slots of
    // inactive lanes are left untouched.
    VecElemU32 *atomicSrc =
        reinterpret_cast<VecElemU32*>(gpuDynInst->a_data);
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (gpuDynInst->exec_mask[lane]) {
            atomicSrc[lane] = data[lane];
        }
    }

    if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) {
        gpuDynInst->computeUnit()->globalMemoryPipe.
            issueRequest(gpuDynInst);
        // The request leaves the in-pipe counters and is tracked as
        // outstanding from now on, on both the write and the read side.
        wf->wrGmReqsInPipe--;
        wf->outstandingReqsWrGm++;
        wf->rdGmReqsInPipe--;
        wf->outstandingReqsRdGm++;
    } else {
        fatal("Non global flat instructions not implemented yet.\n");
    }

    wf->outstandingReqs++;
    wf->validateRequestCounters();
} // execute
/**
 * Memory-access initiation for Inst_FLAT__FLAT_ATOMIC_DEC. All the work
 * is delegated to initAtomicAccess(), instantiated for VecElemU32.
 *
 * @param gpuDynInst The dynamic instruction whose access is initiated.
 */
void
Inst_FLAT__FLAT_ATOMIC_DEC::initiateAcc(GPUDynInstPtr gpuDynInst)
{
    // Element type handed to the atomic-access helper.
    using ElemType = VecElemU32;

    initAtomicAccess<ElemType>(gpuDynInst);
} // initiateAcc
/**
 * Completion stage of Inst_FLAT__FLAT_ATOMIC_DEC.
 *
 * When isAtomicRet() holds, the per-lane values found in
 * gpuDynInst->d_data are copied into the VDST vector operand for every
 * lane enabled in the exec mask, and the operand is written. Otherwise
 * this is a no-op.
 *
 * @param gpuDynInst The dynamic instruction whose access completed.
 */
void
Inst_FLAT__FLAT_ATOMIC_DEC::completeAcc(GPUDynInstPtr gpuDynInst)
{
    // Nothing to write back unless this is a returning atomic.
    if (!isAtomicRet()) {
        return;
    }

    VecOperandU32 vdst(gpuDynInst, extData.VDST);
    VecElemU32 *returned = reinterpret_cast<VecElemU32*>(gpuDynInst->d_data);

    for (int i = 0; i < NumVecElemPerVecReg; ++i) {
        if (!gpuDynInst->exec_mask[i]) {
            continue;
        }
        vdst[i] = returned[i];
    }

    vdst.write();
} // completeAcc
Inst_FLAT__FLAT_ATOMIC_SWAP_X2::Inst_FLAT__FLAT_ATOMIC_SWAP_X2(
InFmt_FLAT *iFmt)
: Inst_FLAT(iFmt, "flat_atomic_swap_x2")
@@ -41118,9 +41312,75 @@ namespace Gcn3ISA
/**
 * Execute stage of Inst_FLAT__FLAT_ATOMIC_SUB_X2.
 *
 * Reads the address and data vector operands (both 64-bit), computes the
 * per-lane addresses via calcAddr(), stages the data operand of every
 * active lane in gpuDynInst->a_data, and hands the instruction to the
 * compute unit's global memory pipeline.
 *
 * Only accesses that execute as Enums::SC_GLOBAL are supported; anything
 * else terminates the simulation through fatal().
 *
 * @param gpuDynInst The dynamic instruction being executed.
 */
void
Inst_FLAT__FLAT_ATOMIC_SUB_X2::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();

    // With no active lane there is nothing to send to memory: roll back
    // the per-wavefront counters (presumably bumped when the instruction
    // was issued -- confirm against the issue logic) and stop here.
    if (wf->execMask().none()) {
        wf->decVMemInstsIssued();
        wf->decLGKMInstsIssued();
        wf->wrGmReqsInPipe--;
        wf->rdGmReqsInPipe--;
        return;
    }

    gpuDynInst->execUnitId = wf->execUnitId;
    gpuDynInst->exec_mask = wf->execMask();
    gpuDynInst->latency.init(gpuDynInst->computeUnit());
    gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

    ConstVecOperandU64 addr(gpuDynInst, extData.ADDR);
    ConstVecOperandU64 data(gpuDynInst, extData.DATA);

    addr.read();
    data.read();

    calcAddr(gpuDynInst, addr);

    // Stage the atomic operand of each active lane in a_data; slots of
    // inactive lanes are left untouched.
    VecElemU64 *atomicSrc =
        reinterpret_cast<VecElemU64*>(gpuDynInst->a_data);
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (gpuDynInst->exec_mask[lane]) {
            atomicSrc[lane] = data[lane];
        }
    }

    if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) {
        gpuDynInst->computeUnit()->globalMemoryPipe.
            issueRequest(gpuDynInst);
        // The request leaves the in-pipe counters and is tracked as
        // outstanding from now on, on both the write and the read side.
        wf->wrGmReqsInPipe--;
        wf->outstandingReqsWrGm++;
        wf->rdGmReqsInPipe--;
        wf->outstandingReqsRdGm++;
    } else {
        fatal("Non global flat instructions not implemented yet.\n");
    }

    wf->outstandingReqs++;
    wf->validateRequestCounters();
} // execute
/**
 * Memory-access initiation for Inst_FLAT__FLAT_ATOMIC_SUB_X2. All the
 * work is delegated to initAtomicAccess(), instantiated for VecElemU64.
 *
 * @param gpuDynInst The dynamic instruction whose access is initiated.
 */
void
Inst_FLAT__FLAT_ATOMIC_SUB_X2::initiateAcc(GPUDynInstPtr gpuDynInst)
{
    // Element type handed to the atomic-access helper.
    using ElemType = VecElemU64;

    initAtomicAccess<ElemType>(gpuDynInst);
} // initiateAcc
/**
 * Completion stage of Inst_FLAT__FLAT_ATOMIC_SUB_X2.
 *
 * When isAtomicRet() holds, the per-lane values found in
 * gpuDynInst->d_data are copied into the VDST vector operand for every
 * lane enabled in the exec mask, and the operand is written. Otherwise
 * this is a no-op.
 *
 * @param gpuDynInst The dynamic instruction whose access completed.
 */
void
Inst_FLAT__FLAT_ATOMIC_SUB_X2::completeAcc(GPUDynInstPtr gpuDynInst)
{
    // Nothing to write back unless this is a returning atomic.
    if (!isAtomicRet()) {
        return;
    }

    VecOperandU64 vdst(gpuDynInst, extData.VDST);
    VecElemU64 *returned = reinterpret_cast<VecElemU64*>(gpuDynInst->d_data);

    for (int i = 0; i < NumVecElemPerVecReg; ++i) {
        if (!gpuDynInst->exec_mask[i]) {
            continue;
        }
        vdst[i] = returned[i];
    }

    vdst.write();
} // completeAcc
Inst_FLAT__FLAT_ATOMIC_SMIN_X2::Inst_FLAT__FLAT_ATOMIC_SMIN_X2(
InFmt_FLAT *iFmt)
: Inst_FLAT(iFmt, "flat_atomic_smin_x2")
@@ -41326,9 +41586,75 @@ namespace Gcn3ISA
/**
 * Execute stage of Inst_FLAT__FLAT_ATOMIC_INC_X2.
 *
 * Reads the address and data vector operands (both 64-bit), computes the
 * per-lane addresses via calcAddr(), stages the data operand of every
 * active lane in gpuDynInst->a_data, and hands the instruction to the
 * compute unit's global memory pipeline.
 *
 * Only accesses that execute as Enums::SC_GLOBAL are supported; anything
 * else terminates the simulation through fatal().
 *
 * @param gpuDynInst The dynamic instruction being executed.
 */
void
Inst_FLAT__FLAT_ATOMIC_INC_X2::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();

    // With no active lane there is nothing to send to memory: roll back
    // the per-wavefront counters (presumably bumped when the instruction
    // was issued -- confirm against the issue logic) and stop here.
    if (wf->execMask().none()) {
        wf->decVMemInstsIssued();
        wf->decLGKMInstsIssued();
        wf->wrGmReqsInPipe--;
        wf->rdGmReqsInPipe--;
        return;
    }

    gpuDynInst->execUnitId = wf->execUnitId;
    gpuDynInst->exec_mask = wf->execMask();
    gpuDynInst->latency.init(gpuDynInst->computeUnit());
    gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

    ConstVecOperandU64 addr(gpuDynInst, extData.ADDR);
    ConstVecOperandU64 data(gpuDynInst, extData.DATA);

    addr.read();
    data.read();

    calcAddr(gpuDynInst, addr);

    // Stage the atomic operand of each active lane in a_data; slots of
    // inactive lanes are left untouched.
    VecElemU64 *atomicSrc =
        reinterpret_cast<VecElemU64*>(gpuDynInst->a_data);
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (gpuDynInst->exec_mask[lane]) {
            atomicSrc[lane] = data[lane];
        }
    }

    if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) {
        gpuDynInst->computeUnit()->globalMemoryPipe.
            issueRequest(gpuDynInst);
        // The request leaves the in-pipe counters and is tracked as
        // outstanding from now on, on both the write and the read side.
        wf->wrGmReqsInPipe--;
        wf->outstandingReqsWrGm++;
        wf->rdGmReqsInPipe--;
        wf->outstandingReqsRdGm++;
    } else {
        fatal("Non global flat instructions not implemented yet.\n");
    }

    wf->outstandingReqs++;
    wf->validateRequestCounters();
} // execute
/**
 * Memory-access initiation for Inst_FLAT__FLAT_ATOMIC_INC_X2. All the
 * work is delegated to initAtomicAccess(), instantiated for VecElemU64.
 *
 * @param gpuDynInst The dynamic instruction whose access is initiated.
 */
void
Inst_FLAT__FLAT_ATOMIC_INC_X2::initiateAcc(GPUDynInstPtr gpuDynInst)
{
    // Element type handed to the atomic-access helper.
    using ElemType = VecElemU64;

    initAtomicAccess<ElemType>(gpuDynInst);
} // initiateAcc
/**
 * Completion stage of Inst_FLAT__FLAT_ATOMIC_INC_X2.
 *
 * When isAtomicRet() holds, the per-lane values found in
 * gpuDynInst->d_data are copied into the VDST vector operand for every
 * lane enabled in the exec mask, and the operand is written. Otherwise
 * this is a no-op.
 *
 * @param gpuDynInst The dynamic instruction whose access completed.
 */
void
Inst_FLAT__FLAT_ATOMIC_INC_X2::completeAcc(GPUDynInstPtr gpuDynInst)
{
    // Nothing to write back unless this is a returning atomic.
    if (!isAtomicRet()) {
        return;
    }

    VecOperandU64 vdst(gpuDynInst, extData.VDST);
    VecElemU64 *returned = reinterpret_cast<VecElemU64*>(gpuDynInst->d_data);

    for (int i = 0; i < NumVecElemPerVecReg; ++i) {
        if (!gpuDynInst->exec_mask[i]) {
            continue;
        }
        vdst[i] = returned[i];
    }

    vdst.write();
} // completeAcc
Inst_FLAT__FLAT_ATOMIC_DEC_X2::Inst_FLAT__FLAT_ATOMIC_DEC_X2(
InFmt_FLAT *iFmt)
: Inst_FLAT(iFmt, "flat_atomic_dec_x2")
@@ -41353,6 +41679,72 @@ namespace Gcn3ISA
/**
 * Execute stage of Inst_FLAT__FLAT_ATOMIC_DEC_X2.
 *
 * Reads the address and data vector operands (both 64-bit), computes the
 * per-lane addresses via calcAddr(), stages the data operand of every
 * active lane in gpuDynInst->a_data, and hands the instruction to the
 * compute unit's global memory pipeline.
 *
 * Only accesses that execute as Enums::SC_GLOBAL are supported; anything
 * else terminates the simulation through fatal().
 *
 * @param gpuDynInst The dynamic instruction being executed.
 */
void
Inst_FLAT__FLAT_ATOMIC_DEC_X2::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();

    // With no active lane there is nothing to send to memory: roll back
    // the per-wavefront counters (presumably bumped when the instruction
    // was issued -- confirm against the issue logic) and stop here.
    if (wf->execMask().none()) {
        wf->decVMemInstsIssued();
        wf->decLGKMInstsIssued();
        wf->wrGmReqsInPipe--;
        wf->rdGmReqsInPipe--;
        return;
    }

    gpuDynInst->execUnitId = wf->execUnitId;
    gpuDynInst->exec_mask = wf->execMask();
    gpuDynInst->latency.init(gpuDynInst->computeUnit());
    gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

    ConstVecOperandU64 addr(gpuDynInst, extData.ADDR);
    ConstVecOperandU64 data(gpuDynInst, extData.DATA);

    addr.read();
    data.read();

    calcAddr(gpuDynInst, addr);

    // Stage the atomic operand of each active lane in a_data; slots of
    // inactive lanes are left untouched.
    VecElemU64 *atomicSrc =
        reinterpret_cast<VecElemU64*>(gpuDynInst->a_data);
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (gpuDynInst->exec_mask[lane]) {
            atomicSrc[lane] = data[lane];
        }
    }

    if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) {
        gpuDynInst->computeUnit()->globalMemoryPipe.
            issueRequest(gpuDynInst);
        // The request leaves the in-pipe counters and is tracked as
        // outstanding from now on, on both the write and the read side.
        wf->wrGmReqsInPipe--;
        wf->outstandingReqsWrGm++;
        wf->rdGmReqsInPipe--;
        wf->outstandingReqsRdGm++;
    } else {
        fatal("Non global flat instructions not implemented yet.\n");
    }

    wf->outstandingReqs++;
    wf->validateRequestCounters();
} // execute
/**
 * Memory-access initiation for Inst_FLAT__FLAT_ATOMIC_DEC_X2. All the
 * work is delegated to initAtomicAccess(), instantiated for VecElemU64.
 *
 * @param gpuDynInst The dynamic instruction whose access is initiated.
 */
void
Inst_FLAT__FLAT_ATOMIC_DEC_X2::initiateAcc(GPUDynInstPtr gpuDynInst)
{
    // Element type handed to the atomic-access helper.
    using ElemType = VecElemU64;

    initAtomicAccess<ElemType>(gpuDynInst);
} // initiateAcc
/**
 * Completion stage of Inst_FLAT__FLAT_ATOMIC_DEC_X2.
 *
 * When isAtomicRet() holds, the per-lane values found in
 * gpuDynInst->d_data are copied into the VDST vector operand for every
 * lane enabled in the exec mask, and the operand is written. Otherwise
 * this is a no-op.
 *
 * @param gpuDynInst The dynamic instruction whose access completed.
 */
void
Inst_FLAT__FLAT_ATOMIC_DEC_X2::completeAcc(GPUDynInstPtr gpuDynInst)
{
    // Nothing to write back unless this is a returning atomic.
    if (!isAtomicRet()) {
        return;
    }

    VecOperandU64 vdst(gpuDynInst, extData.VDST);
    VecElemU64 *returned = reinterpret_cast<VecElemU64*>(gpuDynInst->d_data);

    for (int i = 0; i < NumVecElemPerVecReg; ++i) {
        if (!gpuDynInst->exec_mask[i]) {
            continue;
        }
        vdst[i] = returned[i];
    }

    vdst.write();
} // completeAcc
} // namespace Gcn3ISA

View File

@@ -80189,6 +80189,8 @@ namespace Gcn3ISA
} // isDstOperand
void execute(GPUDynInstPtr) override;
void initiateAcc(GPUDynInstPtr) override;
void completeAcc(GPUDynInstPtr) override;
}; // Inst_FLAT__FLAT_ATOMIC_SUB
class Inst_FLAT__FLAT_ATOMIC_SMIN : public Inst_FLAT
@@ -80717,6 +80719,8 @@ namespace Gcn3ISA
} // isDstOperand
void execute(GPUDynInstPtr) override;
void initiateAcc(GPUDynInstPtr) override;
void completeAcc(GPUDynInstPtr) override;
}; // Inst_FLAT__FLAT_ATOMIC_INC
class Inst_FLAT__FLAT_ATOMIC_DEC : public Inst_FLAT
@@ -80783,6 +80787,8 @@ namespace Gcn3ISA
} // isDstOperand
void execute(GPUDynInstPtr) override;
void initiateAcc(GPUDynInstPtr) override;
void completeAcc(GPUDynInstPtr) override;
}; // Inst_FLAT__FLAT_ATOMIC_DEC
class Inst_FLAT__FLAT_ATOMIC_SWAP_X2 : public Inst_FLAT
@@ -81051,6 +81057,8 @@ namespace Gcn3ISA
} // isDstOperand
void execute(GPUDynInstPtr) override;
void initiateAcc(GPUDynInstPtr) override;
void completeAcc(GPUDynInstPtr) override;
}; // Inst_FLAT__FLAT_ATOMIC_SUB_X2
class Inst_FLAT__FLAT_ATOMIC_SMIN_X2 : public Inst_FLAT
@@ -81579,6 +81587,8 @@ namespace Gcn3ISA
} // isDstOperand
void execute(GPUDynInstPtr) override;
void initiateAcc(GPUDynInstPtr) override;
void completeAcc(GPUDynInstPtr) override;
}; // Inst_FLAT__FLAT_ATOMIC_INC_X2
class Inst_FLAT__FLAT_ATOMIC_DEC_X2 : public Inst_FLAT
@@ -81645,6 +81655,8 @@ namespace Gcn3ISA
} // isDstOperand
void execute(GPUDynInstPtr) override;
void initiateAcc(GPUDynInstPtr) override;
void completeAcc(GPUDynInstPtr) override;
}; // Inst_FLAT__FLAT_ATOMIC_DEC_X2
} // namespace Gcn3ISA