arch-vega: Load/stores commonly used with 16b MFMA

This implements some missing loads and stores that are commonly used in
applications with MFMA instructions to load 16-bit data types into
specific register locations: DS_READ_U16_D16, DS_READ_U16_D16_HI,
BUFFER_LOAD_SHORT_D16, BUFFER_LOAD_SHORT_D16_HI.

Change-Id: Ie22d81ef010328f4541553a9a674764dc16a9f4d
This commit is contained in:
Matthew Poremba
2024-05-15 16:28:36 -07:00
parent a4f0d9e6be
commit b91c9be102
4 changed files with 483 additions and 8 deletions

View File

@@ -7808,15 +7808,13 @@ namespace VegaISA
GPUStaticInst*
Decoder::decode_OP_DS__DS_READ_U16_D16(MachInst iFmt)
{
    // Decode into the now-implemented instruction class. The old
    // "no class" fatal() stub (and its unreachable return nullptr)
    // is removed; leaving it in place would abort the simulation
    // before ever reaching the real decode below.
    return new Inst_DS__DS_READ_U16_D16(&iFmt->iFmt_DS);
}
GPUStaticInst*
Decoder::decode_OP_DS__DS_READ_U16_D16_HI(MachInst iFmt)
{
    // Decode into the now-implemented instruction class. The old
    // fatal() stub and unreachable return nullptr are removed so the
    // instruction actually decodes instead of aborting.
    return new Inst_DS__DS_READ_U16_D16_HI(&iFmt->iFmt_DS);
}
GPUStaticInst*
@@ -10207,14 +10205,12 @@ namespace VegaISA
GPUStaticInst*
Decoder::decode_OP_MUBUF__BUFFER_LOAD_SHORT_D16(MachInst iFmt)
{
    // Decode into the now-implemented instruction class. The old
    // fatal() stub and unreachable return nullptr are removed so the
    // instruction actually decodes instead of aborting.
    return new Inst_MUBUF__BUFFER_LOAD_SHORT_D16(&iFmt->iFmt_MUBUF);
}
GPUStaticInst*
Decoder::decode_OP_MUBUF__BUFFER_LOAD_SHORT_D16_HI(MachInst iFmt)
{
    // Decode into the now-implemented instruction class. The old
    // fatal() stub and unreachable return nullptr are removed so the
    // instruction actually decodes instead of aborting.
    return new Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI(&iFmt->iFmt_MUBUF);
}
GPUStaticInst*
Decoder::decode_OP_MUBUF__BUFFER_LOAD_FORMAT_D16_HI_X(MachInst iFmt)

View File

@@ -1858,6 +1858,134 @@ namespace VegaISA
vdst.write();
} // completeAcc
// --- Inst_DS__DS_READ_U16_D16 class methods ---
Inst_DS__DS_READ_U16_D16::
Inst_DS__DS_READ_U16_D16(InFmt_DS *iFmt)
    // Fixed: mnemonic was "ds_read_u16_d16_hi", a copy-paste from the
    // _HI variant. The wrong name would show up in disassembly/traces
    // for this (non-HI) instruction.
    : Inst_DS(iFmt, "ds_read_u16_d16")
{
    // LDS (shared memory) load instruction.
    setFlag(MemoryRef);
    setFlag(Load);
} // Inst_DS__DS_READ_U16_D16

Inst_DS__DS_READ_U16_D16::~Inst_DS__DS_READ_U16_D16()
{
} // ~Inst_DS__DS_READ_U16_D16
// --- description from .arch file ---
// RETURN_DATA[15 : 0].u16 = MEM[ADDR].u16;
// // RETURN_DATA[31:16] is preserved.
// Issue stage: computes per-lane LDS addresses and hands the request
// to the local memory pipeline; the actual read happens in
// initiateAcc()/completeAcc().
void
Inst_DS__DS_READ_U16_D16::execute(GPUDynInstPtr gpuDynInst)
{
Wavefront *wf = gpuDynInst->wavefront();
// No active lanes: nothing to load, so undo the LGKM issue count
// bumped at issue time and finish immediately.
if (gpuDynInst->exec_mask.none()) {
wf->decLGKMInstsIssued();
return;
}
gpuDynInst->execUnitId = wf->execUnitId;
gpuDynInst->latency.init(gpuDynInst->computeUnit());
// Fixed 24-cycle latency for this LDS read.
gpuDynInst->latency.set(
gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
// Per-lane LDS byte address comes from the VGPR selected by ADDR.
ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);
addr.read();
calcAddr(gpuDynInst, addr);
gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
} // execute
void
Inst_DS__DS_READ_U16_D16::initiateAcc(GPUDynInstPtr gpuDynInst)
{
    // The instruction's 16-bit immediate offset is split across two
    // 8-bit fields; OFFSET1 holds the high byte.
    Addr offset = (static_cast<Addr>(instData.OFFSET1) << 8)
                | instData.OFFSET0;

    // Start the 16-bit-per-lane LDS read at the computed addresses.
    initMemRead<VecElemU16>(gpuDynInst, offset);
} // initiateAcc
void
Inst_DS__DS_READ_U16_D16::completeAcc(GPUDynInstPtr gpuDynInst)
{
    VecOperandU32 vdst(gpuDynInst, extData.VDST);

    // Fixed: read the destination's current value before merging. This
    // instruction only replaces bits 15:0 and must preserve bits 31:16
    // (per the .arch description); without this read, write() below
    // stores uninitialized data into the preserved half.
    vdst.read();

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (gpuDynInst->exec_mask[lane]) {
            // Loaded 16-bit value for this lane from initiateAcc().
            VecElemU16 ds_val = reinterpret_cast<VecElemU16*>(
                gpuDynInst->d_data)[lane];
            replaceBits(vdst[lane], 15, 0, ds_val);
        }
    }

    vdst.write();
} // completeAcc
// --- Inst_DS__DS_READ_U16_D16_HI class methods ---
// LDS load of an unsigned 16-bit value into the high half of a 32-bit
// VGPR (see the .arch description on execute()).
Inst_DS__DS_READ_U16_D16_HI::
Inst_DS__DS_READ_U16_D16_HI(InFmt_DS *iFmt)
: Inst_DS(iFmt, "ds_read_u16_d16_hi")
{
// LDS (shared memory) load instruction.
setFlag(MemoryRef);
setFlag(Load);
} // Inst_DS__DS_READ_U16_D16_HI
Inst_DS__DS_READ_U16_D16_HI::~Inst_DS__DS_READ_U16_D16_HI()
{
} // ~Inst_DS__DS_READ_U16_D16_HI
// --- description from .arch file ---
// RETURN_DATA[31 : 16].u16 = MEM[ADDR].u16;
// // RETURN_DATA[15:0] is preserved.
// Issue stage: computes per-lane LDS addresses and hands the request
// to the local memory pipeline; the actual read happens in
// initiateAcc()/completeAcc().
void
Inst_DS__DS_READ_U16_D16_HI::execute(GPUDynInstPtr gpuDynInst)
{
Wavefront *wf = gpuDynInst->wavefront();
// No active lanes: nothing to load, so undo the LGKM issue count
// bumped at issue time and finish immediately.
if (gpuDynInst->exec_mask.none()) {
wf->decLGKMInstsIssued();
return;
}
gpuDynInst->execUnitId = wf->execUnitId;
gpuDynInst->latency.init(gpuDynInst->computeUnit());
// Fixed 24-cycle latency for this LDS read.
gpuDynInst->latency.set(
gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
// Per-lane LDS byte address comes from the VGPR selected by ADDR.
ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);
addr.read();
calcAddr(gpuDynInst, addr);
gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
} // execute
void
Inst_DS__DS_READ_U16_D16_HI::initiateAcc(GPUDynInstPtr gpuDynInst)
{
    // The instruction's 16-bit immediate offset is split across two
    // 8-bit fields; OFFSET1 holds the high byte.
    Addr offset = (static_cast<Addr>(instData.OFFSET1) << 8)
                | instData.OFFSET0;

    // Start the 16-bit-per-lane LDS read at the computed addresses.
    initMemRead<VecElemU16>(gpuDynInst, offset);
} // initiateAcc
void
Inst_DS__DS_READ_U16_D16_HI::completeAcc(GPUDynInstPtr gpuDynInst)
{
    VecOperandU32 vdst(gpuDynInst, extData.VDST);

    // Fixed: read the destination's current value before merging. This
    // instruction only replaces bits 31:16 and must preserve bits 15:0
    // (per the .arch description); without this read, write() below
    // stores uninitialized data into the preserved half.
    vdst.read();

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (gpuDynInst->exec_mask[lane]) {
            // Loaded 16-bit value for this lane from initiateAcc().
            VecElemU16 ds_val = reinterpret_cast<VecElemU16*>(
                gpuDynInst->d_data)[lane];
            replaceBits(vdst[lane], 31, 16, ds_val);
        }
    }

    vdst.write();
} // completeAcc
// --- Inst_DS__DS_SWIZZLE_B32 class methods ---
Inst_DS__DS_SWIZZLE_B32::Inst_DS__DS_SWIZZLE_B32(InFmt_DS *iFmt)

View File

@@ -33126,6 +33126,74 @@ namespace VegaISA
void completeAcc(GPUDynInstPtr) override;
}; // Inst_DS__DS_READ_U16
// DS_READ_U16_D16: LDS load of an unsigned 16-bit value into the low
// half (bits 15:0) of a 32-bit VGPR; the upper half is preserved.
class Inst_DS__DS_READ_U16_D16 : public Inst_DS
{
public:
Inst_DS__DS_READ_U16_D16(InFmt_DS*);
~Inst_DS__DS_READ_U16_D16();
int
getNumOperands() override
{
return numDstRegOperands() + numSrcRegOperands();
} // getNumOperands
int numDstRegOperands() override { return 1; }
int numSrcRegOperands() override { return 1; }
// Operand sizes in bytes for the given operand index.
int
getOperandSize(int opIdx) override
{
switch (opIdx) {
case 0: //vgpr_a -- 32-bit LDS address VGPR
return 4;
case 1: //vgpr_rtn -- 2 bytes: the width of the data actually
// loaded (the destination VGPR itself is 32-bit)
return 2;
default:
fatal("op idx %i out of bounds\n", opIdx);
return -1;
}
} // getOperandSize
void execute(GPUDynInstPtr) override;
void initiateAcc(GPUDynInstPtr) override;
void completeAcc(GPUDynInstPtr) override;
}; // Inst_DS__DS_READ_U16_D16
// DS_READ_U16_D16_HI: LDS load of an unsigned 16-bit value into the
// high half (bits 31:16) of a 32-bit VGPR; the lower half is preserved.
class Inst_DS__DS_READ_U16_D16_HI : public Inst_DS
{
public:
Inst_DS__DS_READ_U16_D16_HI(InFmt_DS*);
~Inst_DS__DS_READ_U16_D16_HI();
int
getNumOperands() override
{
return numDstRegOperands() + numSrcRegOperands();
} // getNumOperands
int numDstRegOperands() override { return 1; }
int numSrcRegOperands() override { return 1; }
// Operand sizes in bytes for the given operand index.
int
getOperandSize(int opIdx) override
{
switch (opIdx) {
case 0: //vgpr_a -- 32-bit LDS address VGPR
return 4;
case 1: //vgpr_rtn -- 2 bytes: the width of the data actually
// loaded (the destination VGPR itself is 32-bit)
return 2;
default:
fatal("op idx %i out of bounds\n", opIdx);
return -1;
}
} // getOperandSize
void execute(GPUDynInstPtr) override;
void initiateAcc(GPUDynInstPtr) override;
void completeAcc(GPUDynInstPtr) override;
}; // Inst_DS__DS_READ_U16_D16_HI
class Inst_DS__DS_SWIZZLE_B32 : public Inst_DS
{
public:
@@ -36810,6 +36878,82 @@ namespace VegaISA
void completeAcc(GPUDynInstPtr) override;
}; // Inst_MUBUF__BUFFER_LOAD_SSHORT
// BUFFER_LOAD_SHORT_D16: buffer load of a 16-bit value into the low
// half (bits 15:0) of a 32-bit VGPR; the upper half is preserved.
class Inst_MUBUF__BUFFER_LOAD_SHORT_D16 : public Inst_MUBUF
{
public:
Inst_MUBUF__BUFFER_LOAD_SHORT_D16(InFmt_MUBUF*);
~Inst_MUBUF__BUFFER_LOAD_SHORT_D16();
int
getNumOperands() override
{
return numDstRegOperands() + numSrcRegOperands();
} // getNumOperands
int numDstRegOperands() override { return 1; }
int numSrcRegOperands() override { return 3; }
// Operand sizes in bytes for the given operand index.
int
getOperandSize(int opIdx) override
{
switch (opIdx) {
case 0: //vgpr_a -- up to two 32-bit address VGPRs (index/offset)
return 8;
case 1: //sgpr_r -- 128-bit buffer resource descriptor
return 16;
case 2: //sgpr_o -- 32-bit scalar offset (SOFFSET)
return 4;
case 3: //vgpr_d -- 32-bit destination VGPR (D16 merge target)
return 4;
default:
fatal("op idx %i out of bounds\n", opIdx);
return -1;
}
} // getOperandSize
void execute(GPUDynInstPtr) override;
void initiateAcc(GPUDynInstPtr) override;
void completeAcc(GPUDynInstPtr) override;
}; // Inst_MUBUF__BUFFER_LOAD_SHORT_D16
// BUFFER_LOAD_SHORT_D16_HI: buffer load of a 16-bit value into the high
// half (bits 31:16) of a 32-bit VGPR; the lower half is preserved.
class Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI : public Inst_MUBUF
{
public:
Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI(InFmt_MUBUF*);
~Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI();
int
getNumOperands() override
{
return numDstRegOperands() + numSrcRegOperands();
} // getNumOperands
int numDstRegOperands() override { return 1; }
int numSrcRegOperands() override { return 3; }
// Operand sizes in bytes for the given operand index.
int
getOperandSize(int opIdx) override
{
switch (opIdx) {
case 0: //vgpr_a -- up to two 32-bit address VGPRs (index/offset)
return 8;
case 1: //sgpr_r -- 128-bit buffer resource descriptor
return 16;
case 2: //sgpr_o -- 32-bit scalar offset (SOFFSET)
return 4;
case 3: //vgpr_d -- 32-bit destination VGPR (D16 merge target)
return 4;
default:
fatal("op idx %i out of bounds\n", opIdx);
return -1;
}
} // getOperandSize
void execute(GPUDynInstPtr) override;
void initiateAcc(GPUDynInstPtr) override;
void completeAcc(GPUDynInstPtr) override;
}; // Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI
class Inst_MUBUF__BUFFER_LOAD_DWORD : public Inst_MUBUF
{
public:

View File

@@ -823,6 +823,209 @@ namespace VegaISA
Inst_MUBUF__BUFFER_LOAD_SSHORT::completeAcc(GPUDynInstPtr gpuDynInst)
{
} // execute
// --- Inst_MUBUF__BUFFER_LOAD_SHORT_D16 class methods ---
// Buffer load of a 16-bit value into the low half of a 32-bit VGPR
// (see the .arch description on execute()).
Inst_MUBUF__BUFFER_LOAD_SHORT_D16
::Inst_MUBUF__BUFFER_LOAD_SHORT_D16(InFmt_MUBUF *iFmt)
: Inst_MUBUF(iFmt, "buffer_load_short_d16")
{
setFlag(MemoryRef);
setFlag(Load);
// The LDS bit redirects the loaded data to LDS instead of a VGPR;
// that mode is not modeled, so only warn and tag the segment.
if (instData.LDS) {
setFlag(GroupSegment);
warn("BUFFER.LDS not implemented!");
} else {
setFlag(GlobalSegment);
}
} // Inst_MUBUF__BUFFER_LOAD_SHORT_D16
Inst_MUBUF__BUFFER_LOAD_SHORT_D16::~Inst_MUBUF__BUFFER_LOAD_SHORT_D16()
{
} // ~Inst_MUBUF__BUFFER_LOAD_SHORT_D16
// --- description from .arch file ---
// RETURN_DATA[15 : 0].u16 = MEM[ADDR].u16;
// // RETURN_DATA[31:16] is preserved.
// Issue stage: gathers the buffer resource descriptor, scalar offset
// and (depending on IDXEN/OFFEN) address VGPRs, computes per-lane
// addresses, and hands the request to the global memory pipeline.
void
Inst_MUBUF__BUFFER_LOAD_SHORT_D16::execute(GPUDynInstPtr gpuDynInst)
{
Wavefront *wf = gpuDynInst->wavefront();
// No active lanes: nothing to load, so undo the VMEM issue count
// bumped at issue time and finish immediately.
if (gpuDynInst->exec_mask.none()) {
wf->decVMemInstsIssued();
return;
}
gpuDynInst->execUnitId = wf->execUnitId;
gpuDynInst->latency.init(gpuDynInst->computeUnit());
gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
// addr0/addr1: consecutive VADDR VGPRs (whether each is read, and
// which is treated as index vs. offset, depends on IDXEN/OFFEN).
ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR);
ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1);
// 128-bit resource descriptor from four SGPRs (SRSRC is in units of
// four registers), plus the 32-bit scalar offset.
ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4);
ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET);
rsrcDesc.read();
offset.read();
int inst_offset = instData.OFFSET;
// For explanation of buffer addressing, see section 9.1.5 in:
// https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/
// instruction-set-architectures/
// amd-instinct-mi300-cdna3-instruction-set-architecture.pdf
if (!instData.IDXEN && !instData.OFFEN) {
// Neither index nor offset VGPR is used: no VADDR read needed.
calcAddr<ConstVecOperandU32, ConstVecOperandU32,
ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
addr0, addr1, rsrcDesc, offset, inst_offset);
} else if (!instData.IDXEN && instData.OFFEN) {
// Offset VGPR only: the first VADDR register is read.
addr0.read();
calcAddr<ConstVecOperandU32, ConstVecOperandU32,
ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
addr0, addr1, rsrcDesc, offset, inst_offset);
} else if (instData.IDXEN && !instData.OFFEN) {
// Index VGPR only: the first VADDR register is read and passed
// in the swapped (index) argument position.
addr0.read();
calcAddr<ConstVecOperandU32, ConstVecOperandU32,
ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
addr1, addr0, rsrcDesc, offset, inst_offset);
} else {
// Both: VADDR holds index then offset (per the manual cited
// above); both registers are read and passed swapped.
addr0.read();
addr1.read();
calcAddr<ConstVecOperandU32, ConstVecOperandU32,
ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
addr1, addr0, rsrcDesc, offset, inst_offset);
}
gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst);
} // execute
void
Inst_MUBUF__BUFFER_LOAD_SHORT_D16::initiateAcc(GPUDynInstPtr gpuDynInst)
{
// Issue a 16-bit read per active lane at the addresses computed in
// execute(); results land in gpuDynInst->d_data for completeAcc().
initMemRead<VecElemU16>(gpuDynInst);
} // initiateAcc
void
Inst_MUBUF__BUFFER_LOAD_SHORT_D16::completeAcc(GPUDynInstPtr gpuDynInst)
{
    VecOperandU32 vdst(gpuDynInst, extData.VDATA);

    // Fixed: read the destination's current value before merging. This
    // instruction only replaces bits 15:0 and must preserve bits 31:16
    // (per the .arch description); without this read, write() below
    // stores uninitialized data into the preserved half.
    vdst.read();

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (gpuDynInst->exec_mask[lane]) {
            if (!oobMask[lane]) {
                VecElemU16 buf_val = (reinterpret_cast<VecElemU16*>(
                    gpuDynInst->d_data))[lane];
                replaceBits(vdst[lane], 15, 0, buf_val);
            } else {
                // Out-of-bounds lanes return zero. NOTE(review): this
                // zeroes the full 32-bit register; confirm whether OOB
                // D16 loads should instead preserve bits 31:16.
                vdst[lane] = 0;
            }
        }
    }

    vdst.write();
} // completeAcc
// --- Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI class methods ---
// Buffer load of a 16-bit value into the high half of a 32-bit VGPR
// (see the .arch description on execute()).
Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI
::Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI(InFmt_MUBUF *iFmt)
: Inst_MUBUF(iFmt, "buffer_load_short_d16_hi")
{
setFlag(MemoryRef);
setFlag(Load);
// The LDS bit redirects the loaded data to LDS instead of a VGPR;
// that mode is not modeled, so only warn and tag the segment.
if (instData.LDS) {
setFlag(GroupSegment);
warn("BUFFER.LDS not implemented!");
} else {
setFlag(GlobalSegment);
}
} // Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI
Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI::
~Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI()
{
} // ~Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI
// --- description from .arch file ---
// VDATA[31 : 16].b16 = MEM[ADDR].b16;
// // VDATA[15:0] is preserved.
// Issue stage: gathers the buffer resource descriptor, scalar offset
// and (depending on IDXEN/OFFEN) address VGPRs, computes per-lane
// addresses, and hands the request to the global memory pipeline.
void
Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI::execute(GPUDynInstPtr gpuDynInst)
{
Wavefront *wf = gpuDynInst->wavefront();
// No active lanes: nothing to load, so undo the VMEM issue count
// bumped at issue time and finish immediately.
if (gpuDynInst->exec_mask.none()) {
wf->decVMemInstsIssued();
return;
}
gpuDynInst->execUnitId = wf->execUnitId;
gpuDynInst->latency.init(gpuDynInst->computeUnit());
gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
// addr0/addr1: consecutive VADDR VGPRs (whether each is read, and
// which is treated as index vs. offset, depends on IDXEN/OFFEN).
ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR);
ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1);
// 128-bit resource descriptor from four SGPRs (SRSRC is in units of
// four registers), plus the 32-bit scalar offset.
ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4);
ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET);
rsrcDesc.read();
offset.read();
int inst_offset = instData.OFFSET;
// For explanation of buffer addressing, see section 9.1.5 in:
// https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/
// instruction-set-architectures/
// amd-instinct-mi300-cdna3-instruction-set-architecture.pdf
if (!instData.IDXEN && !instData.OFFEN) {
// Neither index nor offset VGPR is used: no VADDR read needed.
calcAddr<ConstVecOperandU32, ConstVecOperandU32,
ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
addr0, addr1, rsrcDesc, offset, inst_offset);
} else if (!instData.IDXEN && instData.OFFEN) {
// Offset VGPR only: the first VADDR register is read.
addr0.read();
calcAddr<ConstVecOperandU32, ConstVecOperandU32,
ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
addr0, addr1, rsrcDesc, offset, inst_offset);
} else if (instData.IDXEN && !instData.OFFEN) {
// Index VGPR only: the first VADDR register is read and passed
// in the swapped (index) argument position.
addr0.read();
calcAddr<ConstVecOperandU32, ConstVecOperandU32,
ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
addr1, addr0, rsrcDesc, offset, inst_offset);
} else {
// Both: VADDR holds index then offset (per the manual cited
// above); both registers are read and passed swapped.
addr0.read();
addr1.read();
calcAddr<ConstVecOperandU32, ConstVecOperandU32,
ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
addr1, addr0, rsrcDesc, offset, inst_offset);
}
gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst);
} // execute
void
Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI::initiateAcc(GPUDynInstPtr gpuDynInst)
{
// Issue a 16-bit read per active lane at the addresses computed in
// execute(); results land in gpuDynInst->d_data for completeAcc().
initMemRead<VecElemU16>(gpuDynInst);
} // initiateAcc
void
Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI::completeAcc(GPUDynInstPtr gpuDynInst)
{
    VecOperandU32 vdst(gpuDynInst, extData.VDATA);

    // Fixed: read the destination's current value before merging. This
    // instruction only replaces bits 31:16 and must preserve bits 15:0
    // (per the .arch description); without this read, write() below
    // stores uninitialized data into the preserved half.
    vdst.read();

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (gpuDynInst->exec_mask[lane]) {
            if (!oobMask[lane]) {
                VecElemU16 buf_val = (reinterpret_cast<VecElemU16*>(
                    gpuDynInst->d_data))[lane];
                replaceBits(vdst[lane], 31, 16, buf_val);
            } else {
                // Out-of-bounds lanes return zero. NOTE(review): this
                // zeroes the full 32-bit register; confirm whether OOB
                // D16 loads should instead preserve bits 15:0.
                vdst[lane] = 0;
            }
        }
    }

    vdst.write();
} // completeAcc
// --- Inst_MUBUF__BUFFER_LOAD_DWORD class methods ---
Inst_MUBUF__BUFFER_LOAD_DWORD
@@ -868,6 +1071,10 @@ namespace VegaISA
int inst_offset = instData.OFFSET;
// For explanation of buffer addressing, see section 9.1.5 in:
// https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/
// instruction-set-architectures/
// amd-instinct-mi300-cdna3-instruction-set-architecture.pdf
if (!instData.IDXEN && !instData.OFFEN) {
calcAddr<ConstVecOperandU32, ConstVecOperandU32,
ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,