arch-vega: Implement all global atomics up to gfx90a

This change adds all of the missing flat/global atomics up to and including
the new atomics in gfx90a (MI200). Adds all decodings and instruction
implementations with the exception of __half2 which does not have a
corresponding data type in gem5. This refactors the execute() and
completeAcc() methods by creating helper functions similar to what
initiateAcc() uses. This reduces redundant code for global atomic
instruction implementations.

Validated all except PK_ADD_F16, ADD_F32, and ADD_F64 which will be done
shortly. Verified the source/dest register sizes in the header are
correct and the template parameters for the new execute()/completeAcc()
methods are correct.

Change-Id: I4b3351229af401a1a4cbfb97166801aac67b74e4
This commit is contained in:
Matthew Poremba
2023-11-21 10:06:41 -06:00
parent 472c697d88
commit 7b0c47d52f
5 changed files with 798 additions and 349 deletions

View File

@@ -1679,9 +1679,9 @@ namespace VegaISA
&Decoder::decode_OP_FLAT__FLAT_ATOMIC_DEC,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_OP_FLAT__FLAT_ATOMIC_ADD_F64,
&Decoder::decode_OP_FLAT__FLAT_ATOMIC_MIN_F64,
&Decoder::decode_OP_FLAT__FLAT_ATOMIC_MAX_F64,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
@@ -1808,11 +1808,11 @@ namespace VegaISA
&Decoder::decode_OP_GLOBAL__GLOBAL_ATOMIC_XOR,
&Decoder::decode_OP_GLOBAL__GLOBAL_ATOMIC_INC,
&Decoder::decode_OP_GLOBAL__GLOBAL_ATOMIC_DEC,
&Decoder::decode_OP_GLOBAL__GLOBAL_ATOMIC_ADD_F32,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_OP_GLOBAL__GLOBAL_ATOMIC_ADD_F64,
&Decoder::decode_OP_GLOBAL__GLOBAL_ATOMIC_MIN_F64,
&Decoder::decode_OP_GLOBAL__GLOBAL_ATOMIC_MAX_F64,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
@@ -8441,6 +8441,24 @@ namespace VegaISA
return new Inst_FLAT__FLAT_ATOMIC_DEC(&iFmt->iFmt_FLAT);
} // decode_OP_FLAT__FLAT_ATOMIC_DEC
GPUStaticInst*
Decoder::decode_OP_FLAT__FLAT_ATOMIC_ADD_F64(MachInst iFmt)
{
    // Build the flat-address f64 atomic add from the FLAT-format fields.
    Inst_FLAT__FLAT_ATOMIC_ADD_F64 *inst =
        new Inst_FLAT__FLAT_ATOMIC_ADD_F64(&iFmt->iFmt_FLAT);
    return inst;
} // decode_OP_FLAT__FLAT_ATOMIC_ADD_F64
GPUStaticInst*
Decoder::decode_OP_FLAT__FLAT_ATOMIC_MIN_F64(MachInst iFmt)
{
    // Build the flat-address f64 atomic min from the FLAT-format fields.
    Inst_FLAT__FLAT_ATOMIC_MIN_F64 *inst =
        new Inst_FLAT__FLAT_ATOMIC_MIN_F64(&iFmt->iFmt_FLAT);
    return inst;
} // decode_OP_FLAT__FLAT_ATOMIC_MIN_F64
GPUStaticInst*
Decoder::decode_OP_FLAT__FLAT_ATOMIC_MAX_F64(MachInst iFmt)
{
    // Build the flat-address f64 atomic max from the FLAT-format fields.
    Inst_FLAT__FLAT_ATOMIC_MAX_F64 *inst =
        new Inst_FLAT__FLAT_ATOMIC_MAX_F64(&iFmt->iFmt_FLAT);
    return inst;
} // decode_OP_FLAT__FLAT_ATOMIC_MAX_F64
GPUStaticInst*
Decoder::decode_OP_FLAT__FLAT_ATOMIC_SWAP_X2(MachInst iFmt)
{
@@ -8738,6 +8756,40 @@ namespace VegaISA
return new Inst_FLAT__FLAT_ATOMIC_DEC(&iFmt->iFmt_FLAT);
}
GPUStaticInst*
Decoder::decode_OP_GLOBAL__GLOBAL_ATOMIC_ADD_F32(MachInst iFmt)
{
    // Note: There is no flat_atomic_add_f32 as of MI200. However, gem5
    // implements all global and scratch instructions as Inst_FLAT.
    return new Inst_FLAT__FLAT_ATOMIC_ADD_F32(&iFmt->iFmt_FLAT);
}
GPUStaticInst*
Decoder::decode_OP_GLOBAL__GLOBAL_ATOMIC_PK_ADD_F16(MachInst iFmt)
{
    // Note: There is no flat_atomic_pk_add_f16 as of MI200. However, gem5
    // implements all global and scratch instructions as Inst_FLAT.
    return new Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16(&iFmt->iFmt_FLAT);
}
GPUStaticInst*
Decoder::decode_OP_GLOBAL__GLOBAL_ATOMIC_ADD_F64(MachInst iFmt)
{
    // Global form shares the FLAT implementation class.
    Inst_FLAT__FLAT_ATOMIC_ADD_F64 *inst =
        new Inst_FLAT__FLAT_ATOMIC_ADD_F64(&iFmt->iFmt_FLAT);
    return inst;
}
GPUStaticInst*
Decoder::decode_OP_GLOBAL__GLOBAL_ATOMIC_MIN_F64(MachInst iFmt)
{
    // Global form shares the FLAT implementation class.
    Inst_FLAT__FLAT_ATOMIC_MIN_F64 *inst =
        new Inst_FLAT__FLAT_ATOMIC_MIN_F64(&iFmt->iFmt_FLAT);
    return inst;
}
GPUStaticInst*
Decoder::decode_OP_GLOBAL__GLOBAL_ATOMIC_MAX_F64(MachInst iFmt)
{
    // Global form shares the FLAT implementation class.
    Inst_FLAT__FLAT_ATOMIC_MAX_F64 *inst =
        new Inst_FLAT__FLAT_ATOMIC_MAX_F64(&iFmt->iFmt_FLAT);
    return inst;
}
GPUStaticInst*
Decoder::decode_OP_GLOBAL__GLOBAL_ATOMIC_SWAP_X2(MachInst iFmt)
{

View File

@@ -698,6 +698,9 @@ namespace VegaISA
GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_XOR(MachInst);
GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_INC(MachInst);
GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_DEC(MachInst);
GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_ADD_F64(MachInst);
GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_MIN_F64(MachInst);
GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_MAX_F64(MachInst);
GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_SWAP_X2(MachInst);
GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_CMPSWAP_X2(MachInst);
GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_ADD_X2(MachInst);
@@ -746,6 +749,11 @@ namespace VegaISA
GPUStaticInst* decode_OP_GLOBAL__GLOBAL_ATOMIC_XOR(MachInst);
GPUStaticInst* decode_OP_GLOBAL__GLOBAL_ATOMIC_INC(MachInst);
GPUStaticInst* decode_OP_GLOBAL__GLOBAL_ATOMIC_DEC(MachInst);
GPUStaticInst* decode_OP_GLOBAL__GLOBAL_ATOMIC_ADD_F32(MachInst);
GPUStaticInst* decode_OP_GLOBAL__GLOBAL_ATOMIC_PK_ADD_F16(MachInst);
GPUStaticInst* decode_OP_GLOBAL__GLOBAL_ATOMIC_ADD_F64(MachInst);
GPUStaticInst* decode_OP_GLOBAL__GLOBAL_ATOMIC_MIN_F64(MachInst);
GPUStaticInst* decode_OP_GLOBAL__GLOBAL_ATOMIC_MAX_F64(MachInst);
GPUStaticInst* decode_OP_GLOBAL__GLOBAL_ATOMIC_SWAP_X2(MachInst);
GPUStaticInst* decode_OP_GLOBAL__GLOBAL_ATOMIC_CMPSWAP_X2(MachInst);
GPUStaticInst* decode_OP_GLOBAL__GLOBAL_ATOMIC_ADD_X2(MachInst);

File diff suppressed because it is too large Load Diff

View File

@@ -42580,6 +42580,8 @@ namespace VegaISA
} // getOperandSize
void execute(GPUDynInstPtr) override;
void initiateAcc(GPUDynInstPtr) override;
void completeAcc(GPUDynInstPtr) override;
}; // Inst_FLAT__FLAT_ATOMIC_SUB
class Inst_FLAT__FLAT_ATOMIC_SMIN : public Inst_FLAT
@@ -42656,6 +42658,8 @@ namespace VegaISA
} // getOperandSize
void execute(GPUDynInstPtr) override;
void initiateAcc(GPUDynInstPtr) override;
void completeAcc(GPUDynInstPtr) override;
}; // Inst_FLAT__FLAT_ATOMIC_UMIN
class Inst_FLAT__FLAT_ATOMIC_SMAX : public Inst_FLAT
@@ -42732,6 +42736,8 @@ namespace VegaISA
} // getOperandSize
void execute(GPUDynInstPtr) override;
void initiateAcc(GPUDynInstPtr) override;
void completeAcc(GPUDynInstPtr) override;
}; // Inst_FLAT__FLAT_ATOMIC_UMAX
class Inst_FLAT__FLAT_ATOMIC_AND : public Inst_FLAT
@@ -42769,6 +42775,8 @@ namespace VegaISA
} // getOperandSize
void execute(GPUDynInstPtr) override;
void initiateAcc(GPUDynInstPtr) override;
void completeAcc(GPUDynInstPtr) override;
}; // Inst_FLAT__FLAT_ATOMIC_AND
class Inst_FLAT__FLAT_ATOMIC_OR : public Inst_FLAT
@@ -42845,6 +42853,8 @@ namespace VegaISA
} // getOperandSize
void execute(GPUDynInstPtr) override;
void initiateAcc(GPUDynInstPtr) override;
void completeAcc(GPUDynInstPtr) override;
}; // Inst_FLAT__FLAT_ATOMIC_XOR
class Inst_FLAT__FLAT_ATOMIC_INC : public Inst_FLAT
@@ -42882,6 +42892,8 @@ namespace VegaISA
} // getOperandSize
void execute(GPUDynInstPtr) override;
void initiateAcc(GPUDynInstPtr) override;
void completeAcc(GPUDynInstPtr) override;
}; // Inst_FLAT__FLAT_ATOMIC_INC
class Inst_FLAT__FLAT_ATOMIC_DEC : public Inst_FLAT
@@ -42919,6 +42931,8 @@ namespace VegaISA
} // getOperandSize
void execute(GPUDynInstPtr) override;
void initiateAcc(GPUDynInstPtr) override;
void completeAcc(GPUDynInstPtr) override;
}; // Inst_FLAT__FLAT_ATOMIC_DEC
class Inst_FLAT__FLAT_ATOMIC_SWAP_X2 : public Inst_FLAT
@@ -42956,6 +42970,8 @@ namespace VegaISA
} // getOperandSize
void execute(GPUDynInstPtr) override;
void initiateAcc(GPUDynInstPtr) override;
void completeAcc(GPUDynInstPtr) override;
}; // Inst_FLAT__FLAT_ATOMIC_SWAP_X2
class Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2 : public Inst_FLAT
@@ -43071,6 +43087,8 @@ namespace VegaISA
} // getOperandSize
void execute(GPUDynInstPtr) override;
void initiateAcc(GPUDynInstPtr) override;
void completeAcc(GPUDynInstPtr) override;
}; // Inst_FLAT__FLAT_ATOMIC_SUB_X2
class Inst_FLAT__FLAT_ATOMIC_SMIN_X2 : public Inst_FLAT
@@ -43108,6 +43126,8 @@ namespace VegaISA
} // getOperandSize
void execute(GPUDynInstPtr) override;
void initiateAcc(GPUDynInstPtr) override;
void completeAcc(GPUDynInstPtr) override;
}; // Inst_FLAT__FLAT_ATOMIC_SMIN_X2
class Inst_FLAT__FLAT_ATOMIC_UMIN_X2 : public Inst_FLAT
@@ -43145,6 +43165,8 @@ namespace VegaISA
} // getOperandSize
void execute(GPUDynInstPtr) override;
void initiateAcc(GPUDynInstPtr) override;
void completeAcc(GPUDynInstPtr) override;
}; // Inst_FLAT__FLAT_ATOMIC_UMIN_X2
class Inst_FLAT__FLAT_ATOMIC_SMAX_X2 : public Inst_FLAT
@@ -43182,6 +43204,8 @@ namespace VegaISA
} // getOperandSize
void execute(GPUDynInstPtr) override;
void initiateAcc(GPUDynInstPtr) override;
void completeAcc(GPUDynInstPtr) override;
}; // Inst_FLAT__FLAT_ATOMIC_SMAX_X2
class Inst_FLAT__FLAT_ATOMIC_UMAX_X2 : public Inst_FLAT
@@ -43219,6 +43243,8 @@ namespace VegaISA
} // getOperandSize
void execute(GPUDynInstPtr) override;
void initiateAcc(GPUDynInstPtr) override;
void completeAcc(GPUDynInstPtr) override;
}; // Inst_FLAT__FLAT_ATOMIC_UMAX_X2
class Inst_FLAT__FLAT_ATOMIC_AND_X2 : public Inst_FLAT
@@ -43256,6 +43282,8 @@ namespace VegaISA
} // getOperandSize
void execute(GPUDynInstPtr) override;
void initiateAcc(GPUDynInstPtr) override;
void completeAcc(GPUDynInstPtr) override;
}; // Inst_FLAT__FLAT_ATOMIC_AND_X2
class Inst_FLAT__FLAT_ATOMIC_OR_X2 : public Inst_FLAT
@@ -43293,6 +43321,8 @@ namespace VegaISA
} // getOperandSize
void execute(GPUDynInstPtr) override;
void initiateAcc(GPUDynInstPtr) override;
void completeAcc(GPUDynInstPtr) override;
}; // Inst_FLAT__FLAT_ATOMIC_OR_X2
class Inst_FLAT__FLAT_ATOMIC_XOR_X2 : public Inst_FLAT
@@ -43330,6 +43360,8 @@ namespace VegaISA
} // getOperandSize
void execute(GPUDynInstPtr) override;
void initiateAcc(GPUDynInstPtr) override;
void completeAcc(GPUDynInstPtr) override;
}; // Inst_FLAT__FLAT_ATOMIC_XOR_X2
class Inst_FLAT__FLAT_ATOMIC_INC_X2 : public Inst_FLAT
@@ -43367,6 +43399,8 @@ namespace VegaISA
} // getOperandSize
void execute(GPUDynInstPtr) override;
void initiateAcc(GPUDynInstPtr) override;
void completeAcc(GPUDynInstPtr) override;
}; // Inst_FLAT__FLAT_ATOMIC_INC_X2
class Inst_FLAT__FLAT_ATOMIC_DEC_X2 : public Inst_FLAT
@@ -43404,8 +43438,205 @@ namespace VegaISA
} // getOperandSize
void execute(GPUDynInstPtr) override;
void initiateAcc(GPUDynInstPtr) override;
void completeAcc(GPUDynInstPtr) override;
}; // Inst_FLAT__FLAT_ATOMIC_DEC_X2
// Single-precision floating-point atomic add (global/scratch encodings;
// there is no flat variant of this opcode — see the decoder note).
class Inst_FLAT__FLAT_ATOMIC_ADD_F32 : public Inst_FLAT
{
  public:
    Inst_FLAT__FLAT_ATOMIC_ADD_F32(InFmt_FLAT*);
    ~Inst_FLAT__FLAT_ATOMIC_ADD_F32();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    // Flat encoding: addr + data. Global/scratch adds a saddr source.
    int numSrcRegOperands() override { return isFlat() ? 2 : 3; }

    int
    getOperandSize(int opIdx) override
    {
        // Operand order: vgpr_addr, vgpr_src, (vgpr_dst | saddr), [vgpr_dst]
        if (opIdx == 0) {
            // 4 bytes when the VGPR carries only an offset, else the full
            // 8-byte address pair.
            return vgprIsOffset() ? 4 : 8;
        } else if (opIdx == 1) {
            return 4; // one f32 source value
        } else if (opIdx == 2) {
            // flat: 4-byte vgpr_dst; global/scratch: 8-byte saddr pair
            return isFlat() ? 4 : 8;
        } else if (opIdx == 3) {
            // Only the global/scratch encoding has a fourth operand.
            assert(!isFlat());
            return 4;
        }
        fatal("op idx %i out of bounds\n", opIdx);
        return -1;
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
    void initiateAcc(GPUDynInstPtr) override;
    void completeAcc(GPUDynInstPtr) override;
}; // Inst_FLAT__FLAT_ATOMIC_ADD_F32
// Packed half-precision (2 x f16 in 32 bits) atomic add (global/scratch
// encodings; there is no flat variant of this opcode — see the decoder note).
class Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16 : public Inst_FLAT
{
  public:
    Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16(InFmt_FLAT*);
    ~Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    // Flat encoding: addr + data. Global/scratch adds a saddr source.
    int numSrcRegOperands() override { return isFlat() ? 2 : 3; }

    int
    getOperandSize(int opIdx) override
    {
        // Operand order: vgpr_addr, vgpr_src, (vgpr_dst | saddr), [vgpr_dst]
        if (opIdx == 0) {
            // 4 bytes when the VGPR carries only an offset, else the full
            // 8-byte address pair.
            return vgprIsOffset() ? 4 : 8;
        } else if (opIdx == 1) {
            return 4; // packed pair of f16 values in one 32-bit register
        } else if (opIdx == 2) {
            // flat: 4-byte vgpr_dst; global/scratch: 8-byte saddr pair
            return isFlat() ? 4 : 8;
        } else if (opIdx == 3) {
            // Only the global/scratch encoding has a fourth operand.
            assert(!isFlat());
            return 4;
        }
        fatal("op idx %i out of bounds\n", opIdx);
        return -1;
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
    void initiateAcc(GPUDynInstPtr) override;
    void completeAcc(GPUDynInstPtr) override;
}; // Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16
// Double-precision floating-point atomic add (flat/global encodings).
class Inst_FLAT__FLAT_ATOMIC_ADD_F64 : public Inst_FLAT
{
  public:
    Inst_FLAT__FLAT_ATOMIC_ADD_F64(InFmt_FLAT*);
    ~Inst_FLAT__FLAT_ATOMIC_ADD_F64();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    // Flat encoding: addr + data. Global/scratch adds a saddr source.
    int numSrcRegOperands() override { return isFlat() ? 2 : 3; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //vgpr_addr
            // 4 bytes when the VGPR carries only an offset, else the full
            // 8-byte address pair.
            return vgprIsOffset() ? 4 : 8;
          case 1: //vgpr_src
            return 8; // one f64 source value
          case 2: //vgpr_dst or saddr
            // Both the f64 destination and the saddr SGPR pair are 8 bytes,
            // so no isFlat() distinction is needed here (unlike the 32-bit
            // atomics). Was "isFlat() ? 8 : 8" — same value either way.
            return 8;
          case 3: //vgpr_dst
            // Only the global/scratch encoding has a fourth operand.
            assert(!isFlat());
            return 8;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
    void initiateAcc(GPUDynInstPtr) override;
    void completeAcc(GPUDynInstPtr) override;
}; // Inst_FLAT__FLAT_ATOMIC_ADD_F64
// Double-precision floating-point atomic min (flat/global encodings).
class Inst_FLAT__FLAT_ATOMIC_MIN_F64 : public Inst_FLAT
{
  public:
    Inst_FLAT__FLAT_ATOMIC_MIN_F64(InFmt_FLAT*);
    ~Inst_FLAT__FLAT_ATOMIC_MIN_F64();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    // Flat encoding: addr + data. Global/scratch adds a saddr source.
    int numSrcRegOperands() override { return isFlat() ? 2 : 3; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //vgpr_addr
            // 4 bytes when the VGPR carries only an offset, else the full
            // 8-byte address pair.
            return vgprIsOffset() ? 4 : 8;
          case 1: //vgpr_src
            return 8; // one f64 source value
          case 2: //vgpr_dst or saddr
            // Both the f64 destination and the saddr SGPR pair are 8 bytes,
            // so no isFlat() distinction is needed here (unlike the 32-bit
            // atomics). Was "isFlat() ? 8 : 8" — same value either way.
            return 8;
          case 3: //vgpr_dst
            // Only the global/scratch encoding has a fourth operand.
            assert(!isFlat());
            return 8;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
    void initiateAcc(GPUDynInstPtr) override;
    void completeAcc(GPUDynInstPtr) override;
}; // Inst_FLAT__FLAT_ATOMIC_MIN_F64
// Double-precision floating-point atomic max (flat/global encodings).
class Inst_FLAT__FLAT_ATOMIC_MAX_F64 : public Inst_FLAT
{
  public:
    Inst_FLAT__FLAT_ATOMIC_MAX_F64(InFmt_FLAT*);
    ~Inst_FLAT__FLAT_ATOMIC_MAX_F64();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    // Flat encoding: addr + data. Global/scratch adds a saddr source.
    int numSrcRegOperands() override { return isFlat() ? 2 : 3; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //vgpr_addr
            // 4 bytes when the VGPR carries only an offset, else the full
            // 8-byte address pair.
            return vgprIsOffset() ? 4 : 8;
          case 1: //vgpr_src
            return 8; // one f64 source value
          case 2: //vgpr_dst or saddr
            // Both the f64 destination and the saddr SGPR pair are 8 bytes,
            // so no isFlat() distinction is needed here (unlike the 32-bit
            // atomics). Was "isFlat() ? 8 : 8" — same value either way.
            return 8;
          case 3: //vgpr_dst
            // Only the global/scratch encoding has a fourth operand.
            assert(!isFlat());
            return 8;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
    void initiateAcc(GPUDynInstPtr) override;
    void completeAcc(GPUDynInstPtr) override;
}; // Inst_FLAT__FLAT_ATOMIC_MAX_F64
class Inst_VOP3P__V_PK_MOV_B32 : public Inst_VOP3P
{
public:

View File

@@ -1256,6 +1256,77 @@ namespace VegaISA
}
}
// Execute for atomics is identical besides the flag set in the
// constructor, except cmpswap. For cmpswap, the offset to the "cmp"
// register is needed. For all other operations this offset is zero
// and implies the atomic is not a cmpswap.
//
// RegT defines the type of GPU register (e.g., ConstVecOperandU32)
// LaneT defines the type of the register elements (e.g., VecElemU32)
template<typename RegT, typename LaneT, int CmpRegOffset = 0>
void
atomicExecute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();

    // No active lanes: nothing to issue. Undo the issue-time counter
    // increments so the wavefront bookkeeping stays balanced.
    if (gpuDynInst->exec_mask.none()) {
        wf->decVMemInstsIssued();
        if (isFlat()) {
            wf->decLGKMInstsIssued();
        }
        return;
    }

    gpuDynInst->execUnitId = wf->execUnitId;
    gpuDynInst->latency.init(gpuDynInst->computeUnit());
    gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

    RegT data(gpuDynInst, extData.DATA);
    // For cmpswap the "cmp" register follows DATA by CmpRegOffset; for
    // all other atomics the offset is zero and cmp aliases data (unread).
    RegT cmp(gpuDynInst, extData.DATA + CmpRegOffset);

    data.read();
    if constexpr (CmpRegOffset) {
        cmp.read();
    }

    calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET);

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (gpuDynInst->exec_mask[lane]) {
            if constexpr (CmpRegOffset) {
                // Stage with LaneT (not a hard-coded 32-bit element) so
                // the 64-bit cmpswap variants write full-width values;
                // the previous VecElemU32 casts truncated X2 operands.
                (reinterpret_cast<LaneT*>(
                    gpuDynInst->x_data))[lane] = data[lane];
                (reinterpret_cast<LaneT*>(
                    gpuDynInst->a_data))[lane] = cmp[lane];
            } else {
                (reinterpret_cast<LaneT*>(gpuDynInst->a_data))[lane]
                    = data[lane];
            }
        }
    }

    issueRequestHelper(gpuDynInst);
}
// RegT defines the type of GPU register (e.g., ConstVecOperandU32)
// LaneT defines the type of the register elements (e.g., VecElemU32)
template<typename RegT, typename LaneT>
void
atomicComplete(GPUDynInstPtr gpuDynInst)
{
    // Only the returning flavor of an atomic writes the loaded memory
    // value back to the destination register.
    if (!isAtomicRet()) {
        return;
    }

    RegT vdst(gpuDynInst, extData.VDST);

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (!gpuDynInst->exec_mask[lane]) {
            continue;
        }
        vdst[lane] =
            (reinterpret_cast<LaneT*>(gpuDynInst->d_data))[lane];
    }

    vdst.write();
}
bool
vgprIsOffset()
{