arch-vega: Load/stores commonly used with 16b MFMA
This implements some missing loads and stores that are commonly used in applications with MFMA instructions to load 16-bit data types into specific register locations: DS_READ_U16_D16, DS_READ_U16_D16_HI, BUFFER_LOAD_SHORT_D16, and BUFFER_LOAD_SHORT_D16_HI. (Change-Id: Ie22d81ef010328f4541553a9a674764dc16a9f4d)
This commit is contained in:
@@ -7808,15 +7808,13 @@ namespace VegaISA
|
||||
/**
 * Decode a DS-format DS_READ_U16_D16 instruction.
 *
 * The earlier placeholder body (fatal() followed by "return nullptr")
 * made the instruction undecodable; the decoder now builds the
 * instruction object from the DS view of the machine instruction.
 *
 * @param iFmt machine instruction being decoded.
 * @return newly allocated Inst_DS__DS_READ_U16_D16.
 */
GPUStaticInst*
Decoder::decode_OP_DS__DS_READ_U16_D16(MachInst iFmt)
{
    return new Inst_DS__DS_READ_U16_D16(&iFmt->iFmt_DS);
}
|
||||
|
||||
/**
 * Decode a DS-format DS_READ_U16_D16_HI instruction.
 *
 * The earlier placeholder body (fatal() followed by "return nullptr")
 * made the instruction undecodable; the decoder now builds the
 * instruction object from the DS view of the machine instruction.
 *
 * @param iFmt machine instruction being decoded.
 * @return newly allocated Inst_DS__DS_READ_U16_D16_HI.
 */
GPUStaticInst*
Decoder::decode_OP_DS__DS_READ_U16_D16_HI(MachInst iFmt)
{
    return new Inst_DS__DS_READ_U16_D16_HI(&iFmt->iFmt_DS);
}
|
||||
|
||||
GPUStaticInst*
|
||||
@@ -10207,14 +10205,12 @@ namespace VegaISA
|
||||
/**
 * Decode a MUBUF-format BUFFER_LOAD_SHORT_D16 instruction.
 *
 * The earlier placeholder body (fatal() followed by "return nullptr")
 * made the instruction undecodable; the decoder now builds the
 * instruction object from the MUBUF view of the machine instruction.
 *
 * @param iFmt machine instruction being decoded.
 * @return newly allocated Inst_MUBUF__BUFFER_LOAD_SHORT_D16.
 */
GPUStaticInst*
Decoder::decode_OP_MUBUF__BUFFER_LOAD_SHORT_D16(MachInst iFmt)
{
    return new Inst_MUBUF__BUFFER_LOAD_SHORT_D16(&iFmt->iFmt_MUBUF);
}
|
||||
/**
 * Decode a MUBUF-format BUFFER_LOAD_SHORT_D16_HI instruction.
 *
 * The earlier placeholder body (fatal() followed by "return nullptr")
 * made the instruction undecodable; the decoder now builds the
 * instruction object from the MUBUF view of the machine instruction.
 *
 * @param iFmt machine instruction being decoded.
 * @return newly allocated Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI.
 */
GPUStaticInst*
Decoder::decode_OP_MUBUF__BUFFER_LOAD_SHORT_D16_HI(MachInst iFmt)
{
    return new Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI(&iFmt->iFmt_MUBUF);
}
|
||||
GPUStaticInst*
|
||||
Decoder::decode_OP_MUBUF__BUFFER_LOAD_FORMAT_D16_HI_X(MachInst iFmt)
|
||||
|
||||
@@ -1858,6 +1858,134 @@ namespace VegaISA
|
||||
|
||||
vdst.write();
|
||||
} // completeAcc
|
||||
// --- Inst_DS__DS_READ_U16_D16 class methods ---
|
||||
|
||||
/**
 * Construct the DS_READ_U16_D16 instruction and flag it as a memory
 * reference that performs a load.
 *
 * The mnemonic handed to Inst_DS is "ds_read_u16_d16"; it previously
 * duplicated the "_hi" variant's name, which made the two instructions
 * indistinguishable by opcode string.
 *
 * @param iFmt DS-format encoding of the instruction.
 */
Inst_DS__DS_READ_U16_D16::
    Inst_DS__DS_READ_U16_D16(InFmt_DS *iFmt)
    : Inst_DS(iFmt, "ds_read_u16_d16")
{
    setFlag(MemoryRef);
    setFlag(Load);
} // Inst_DS__DS_READ_U16_D16
|
||||
|
||||
// The destructor has no work of its own to perform.
Inst_DS__DS_READ_U16_D16::~Inst_DS__DS_READ_U16_D16() = default;
|
||||
|
||||
// --- description from .arch file ---
|
||||
// RETURN_DATA[15 : 0].u16 = MEM[ADDR].u16;
|
||||
// // RETURN_DATA[31:16] is preserved.
|
||||
void
|
||||
Inst_DS__DS_READ_U16_D16::execute(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
Wavefront *wf = gpuDynInst->wavefront();
|
||||
|
||||
if (gpuDynInst->exec_mask.none()) {
|
||||
wf->decLGKMInstsIssued();
|
||||
return;
|
||||
}
|
||||
|
||||
gpuDynInst->execUnitId = wf->execUnitId;
|
||||
gpuDynInst->latency.init(gpuDynInst->computeUnit());
|
||||
gpuDynInst->latency.set(
|
||||
gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
|
||||
ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);
|
||||
|
||||
addr.read();
|
||||
|
||||
calcAddr(gpuDynInst, addr);
|
||||
|
||||
gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
|
||||
} // execute
|
||||
void
|
||||
Inst_DS__DS_READ_U16_D16::initiateAcc(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
Addr offset0 = instData.OFFSET0;
|
||||
Addr offset1 = instData.OFFSET1;
|
||||
Addr offset = (offset1 << 8) | offset0;
|
||||
|
||||
initMemRead<VecElemU16>(gpuDynInst, offset);
|
||||
} // initiateAcc
|
||||
|
||||
void
|
||||
Inst_DS__DS_READ_U16_D16::completeAcc(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
VecOperandU32 vdst(gpuDynInst, extData.VDST);
|
||||
|
||||
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
|
||||
if (gpuDynInst->exec_mask[lane]) {
|
||||
VecElemU16 ds_val = reinterpret_cast<VecElemU16*>(
|
||||
gpuDynInst->d_data)[lane];
|
||||
replaceBits(vdst[lane], 15, 0, ds_val);
|
||||
}
|
||||
}
|
||||
|
||||
vdst.write();
|
||||
} // completeAcc
|
||||
// --- Inst_DS__DS_READ_U16_D16_HI class methods ---
|
||||
|
||||
// Build the DS_READ_U16_D16_HI instruction and mark it as a load that
// references memory.
Inst_DS__DS_READ_U16_D16_HI::Inst_DS__DS_READ_U16_D16_HI(InFmt_DS *iFmt)
    : Inst_DS(iFmt, "ds_read_u16_d16_hi")
{
    setFlag(Load);
    setFlag(MemoryRef);
} // Inst_DS__DS_READ_U16_D16_HI
|
||||
|
||||
// The destructor has no work of its own to perform.
Inst_DS__DS_READ_U16_D16_HI::~Inst_DS__DS_READ_U16_D16_HI() = default;
|
||||
|
||||
// --- description from .arch file ---
|
||||
// RETURN_DATA[31 : 16].u16 = MEM[ADDR].u16;
|
||||
// // RETURN_DATA[15:0] is preserved.
|
||||
void
|
||||
Inst_DS__DS_READ_U16_D16_HI::execute(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
Wavefront *wf = gpuDynInst->wavefront();
|
||||
|
||||
if (gpuDynInst->exec_mask.none()) {
|
||||
wf->decLGKMInstsIssued();
|
||||
return;
|
||||
}
|
||||
|
||||
gpuDynInst->execUnitId = wf->execUnitId;
|
||||
gpuDynInst->latency.init(gpuDynInst->computeUnit());
|
||||
gpuDynInst->latency.set(
|
||||
gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
|
||||
ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);
|
||||
|
||||
addr.read();
|
||||
|
||||
calcAddr(gpuDynInst, addr);
|
||||
|
||||
gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
|
||||
} // execute
|
||||
void
|
||||
Inst_DS__DS_READ_U16_D16_HI::initiateAcc(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
Addr offset0 = instData.OFFSET0;
|
||||
Addr offset1 = instData.OFFSET1;
|
||||
Addr offset = (offset1 << 8) | offset0;
|
||||
|
||||
initMemRead<VecElemU16>(gpuDynInst, offset);
|
||||
} // initiateAcc
|
||||
|
||||
void
|
||||
Inst_DS__DS_READ_U16_D16_HI::completeAcc(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
VecOperandU32 vdst(gpuDynInst, extData.VDST);
|
||||
|
||||
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
|
||||
if (gpuDynInst->exec_mask[lane]) {
|
||||
VecElemU16 ds_val = reinterpret_cast<VecElemU16*>(
|
||||
gpuDynInst->d_data)[lane];
|
||||
replaceBits(vdst[lane], 31, 16, ds_val);
|
||||
}
|
||||
}
|
||||
|
||||
vdst.write();
|
||||
} // completeAcc
|
||||
// --- Inst_DS__DS_SWIZZLE_B32 class methods ---
|
||||
|
||||
Inst_DS__DS_SWIZZLE_B32::Inst_DS__DS_SWIZZLE_B32(InFmt_DS *iFmt)
|
||||
|
||||
@@ -33126,6 +33126,74 @@ namespace VegaISA
|
||||
void completeAcc(GPUDynInstPtr) override;
|
||||
}; // Inst_DS__DS_READ_U16
|
||||
|
||||
class Inst_DS__DS_READ_U16_D16 : public Inst_DS
|
||||
{
|
||||
public:
|
||||
Inst_DS__DS_READ_U16_D16(InFmt_DS*);
|
||||
~Inst_DS__DS_READ_U16_D16();
|
||||
|
||||
int
|
||||
getNumOperands() override
|
||||
{
|
||||
return numDstRegOperands() + numSrcRegOperands();
|
||||
} // getNumOperands
|
||||
|
||||
int numDstRegOperands() override { return 1; }
|
||||
int numSrcRegOperands() override { return 1; }
|
||||
|
||||
int
|
||||
getOperandSize(int opIdx) override
|
||||
{
|
||||
switch (opIdx) {
|
||||
case 0: //vgpr_a
|
||||
return 4;
|
||||
case 1: //vgpr_rtn
|
||||
return 2;
|
||||
default:
|
||||
fatal("op idx %i out of bounds\n", opIdx);
|
||||
return -1;
|
||||
}
|
||||
} // getOperandSize
|
||||
|
||||
void execute(GPUDynInstPtr) override;
|
||||
void initiateAcc(GPUDynInstPtr) override;
|
||||
void completeAcc(GPUDynInstPtr) override;
|
||||
}; // Inst_DS__DS_READ_U16_D16
|
||||
|
||||
class Inst_DS__DS_READ_U16_D16_HI : public Inst_DS
|
||||
{
|
||||
public:
|
||||
Inst_DS__DS_READ_U16_D16_HI(InFmt_DS*);
|
||||
~Inst_DS__DS_READ_U16_D16_HI();
|
||||
|
||||
int
|
||||
getNumOperands() override
|
||||
{
|
||||
return numDstRegOperands() + numSrcRegOperands();
|
||||
} // getNumOperands
|
||||
|
||||
int numDstRegOperands() override { return 1; }
|
||||
int numSrcRegOperands() override { return 1; }
|
||||
|
||||
int
|
||||
getOperandSize(int opIdx) override
|
||||
{
|
||||
switch (opIdx) {
|
||||
case 0: //vgpr_a
|
||||
return 4;
|
||||
case 1: //vgpr_rtn
|
||||
return 2;
|
||||
default:
|
||||
fatal("op idx %i out of bounds\n", opIdx);
|
||||
return -1;
|
||||
}
|
||||
} // getOperandSize
|
||||
|
||||
void execute(GPUDynInstPtr) override;
|
||||
void initiateAcc(GPUDynInstPtr) override;
|
||||
void completeAcc(GPUDynInstPtr) override;
|
||||
}; // Inst_DS__DS_READ_U16_D16_HI
|
||||
|
||||
class Inst_DS__DS_SWIZZLE_B32 : public Inst_DS
|
||||
{
|
||||
public:
|
||||
@@ -36810,6 +36878,82 @@ namespace VegaISA
|
||||
void completeAcc(GPUDynInstPtr) override;
|
||||
}; // Inst_MUBUF__BUFFER_LOAD_SSHORT
|
||||
|
||||
class Inst_MUBUF__BUFFER_LOAD_SHORT_D16 : public Inst_MUBUF
|
||||
{
|
||||
public:
|
||||
Inst_MUBUF__BUFFER_LOAD_SHORT_D16(InFmt_MUBUF*);
|
||||
~Inst_MUBUF__BUFFER_LOAD_SHORT_D16();
|
||||
|
||||
int
|
||||
getNumOperands() override
|
||||
{
|
||||
return numDstRegOperands() + numSrcRegOperands();
|
||||
} // getNumOperands
|
||||
|
||||
int numDstRegOperands() override { return 1; }
|
||||
int numSrcRegOperands() override { return 3; }
|
||||
|
||||
int
|
||||
getOperandSize(int opIdx) override
|
||||
{
|
||||
switch (opIdx) {
|
||||
case 0: //vgpr_a
|
||||
return 8;
|
||||
case 1: //sgpr_r
|
||||
return 16;
|
||||
case 2: //sgpr_o
|
||||
return 4;
|
||||
case 3: //vgpr_d
|
||||
return 4;
|
||||
default:
|
||||
fatal("op idx %i out of bounds\n", opIdx);
|
||||
return -1;
|
||||
}
|
||||
} // getOperandSize
|
||||
|
||||
void execute(GPUDynInstPtr) override;
|
||||
void initiateAcc(GPUDynInstPtr) override;
|
||||
void completeAcc(GPUDynInstPtr) override;
|
||||
}; // Inst_MUBUF__BUFFER_LOAD_SHORT_D16
|
||||
|
||||
class Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI : public Inst_MUBUF
|
||||
{
|
||||
public:
|
||||
Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI(InFmt_MUBUF*);
|
||||
~Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI();
|
||||
|
||||
int
|
||||
getNumOperands() override
|
||||
{
|
||||
return numDstRegOperands() + numSrcRegOperands();
|
||||
} // getNumOperands
|
||||
|
||||
int numDstRegOperands() override { return 1; }
|
||||
int numSrcRegOperands() override { return 3; }
|
||||
|
||||
int
|
||||
getOperandSize(int opIdx) override
|
||||
{
|
||||
switch (opIdx) {
|
||||
case 0: //vgpr_a
|
||||
return 8;
|
||||
case 1: //sgpr_r
|
||||
return 16;
|
||||
case 2: //sgpr_o
|
||||
return 4;
|
||||
case 3: //vgpr_d
|
||||
return 4;
|
||||
default:
|
||||
fatal("op idx %i out of bounds\n", opIdx);
|
||||
return -1;
|
||||
}
|
||||
} // getOperandSize
|
||||
|
||||
void execute(GPUDynInstPtr) override;
|
||||
void initiateAcc(GPUDynInstPtr) override;
|
||||
void completeAcc(GPUDynInstPtr) override;
|
||||
}; // Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI
|
||||
|
||||
class Inst_MUBUF__BUFFER_LOAD_DWORD : public Inst_MUBUF
|
||||
{
|
||||
public:
|
||||
|
||||
@@ -823,6 +823,209 @@ namespace VegaISA
|
||||
Inst_MUBUF__BUFFER_LOAD_SSHORT::completeAcc(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
} // execute
|
||||
// --- Inst_MUBUF__BUFFER_LOAD_SHORT_D16 class methods ---
|
||||
|
||||
// Build the BUFFER_LOAD_SHORT_D16 instruction: a load that references
// memory, tagged with the global segment unless the LDS bit is set.
Inst_MUBUF__BUFFER_LOAD_SHORT_D16::Inst_MUBUF__BUFFER_LOAD_SHORT_D16(
        InFmt_MUBUF *iFmt)
    : Inst_MUBUF(iFmt, "buffer_load_short_d16")
{
    setFlag(MemoryRef);
    setFlag(Load);

    if (!instData.LDS) {
        setFlag(GlobalSegment);
        return;
    }

    // LDS variant: tagged as group segment, with a warning that the
    // mode is not implemented.
    setFlag(GroupSegment);
    warn("BUFFER.LDS not implemented!");
} // Inst_MUBUF__BUFFER_LOAD_SHORT_D16
|
||||
|
||||
// The destructor has no work of its own to perform.
Inst_MUBUF__BUFFER_LOAD_SHORT_D16::~Inst_MUBUF__BUFFER_LOAD_SHORT_D16()
    = default;
|
||||
|
||||
// --- description from .arch file ---
|
||||
// RETURN_DATA[15 : 0].u16 = MEM[ADDR].u16;
|
||||
// // RETURN_DATA[31:16] is preserved.
|
||||
void
|
||||
Inst_MUBUF__BUFFER_LOAD_SHORT_D16::execute(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
Wavefront *wf = gpuDynInst->wavefront();
|
||||
|
||||
if (gpuDynInst->exec_mask.none()) {
|
||||
wf->decVMemInstsIssued();
|
||||
return;
|
||||
}
|
||||
|
||||
gpuDynInst->execUnitId = wf->execUnitId;
|
||||
gpuDynInst->latency.init(gpuDynInst->computeUnit());
|
||||
gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
|
||||
|
||||
ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR);
|
||||
ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1);
|
||||
ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4);
|
||||
ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET);
|
||||
|
||||
rsrcDesc.read();
|
||||
offset.read();
|
||||
|
||||
int inst_offset = instData.OFFSET;
|
||||
|
||||
// For explanation of buffer addressing, see section 9.1.5 in:
|
||||
// https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/
|
||||
// instruction-set-architectures/
|
||||
// amd-instinct-mi300-cdna3-instruction-set-architecture.pdf
|
||||
if (!instData.IDXEN && !instData.OFFEN) {
|
||||
calcAddr<ConstVecOperandU32, ConstVecOperandU32,
|
||||
ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
|
||||
addr0, addr1, rsrcDesc, offset, inst_offset);
|
||||
} else if (!instData.IDXEN && instData.OFFEN) {
|
||||
addr0.read();
|
||||
calcAddr<ConstVecOperandU32, ConstVecOperandU32,
|
||||
ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
|
||||
addr0, addr1, rsrcDesc, offset, inst_offset);
|
||||
} else if (instData.IDXEN && !instData.OFFEN) {
|
||||
addr0.read();
|
||||
calcAddr<ConstVecOperandU32, ConstVecOperandU32,
|
||||
ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
|
||||
addr1, addr0, rsrcDesc, offset, inst_offset);
|
||||
} else {
|
||||
addr0.read();
|
||||
addr1.read();
|
||||
calcAddr<ConstVecOperandU32, ConstVecOperandU32,
|
||||
ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
|
||||
addr1, addr0, rsrcDesc, offset, inst_offset);
|
||||
}
|
||||
|
||||
gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst);
|
||||
} // execute
|
||||
|
||||
void
|
||||
Inst_MUBUF__BUFFER_LOAD_SHORT_D16::initiateAcc(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
initMemRead<VecElemU16>(gpuDynInst);
|
||||
} // initiateAcc
|
||||
|
||||
void
|
||||
Inst_MUBUF__BUFFER_LOAD_SHORT_D16::completeAcc(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
VecOperandU32 vdst(gpuDynInst, extData.VDATA);
|
||||
|
||||
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
|
||||
if (gpuDynInst->exec_mask[lane]) {
|
||||
if (!oobMask[lane]) {
|
||||
VecElemU16 buf_val = (reinterpret_cast<VecElemU16*>(
|
||||
gpuDynInst->d_data))[lane];
|
||||
replaceBits(vdst[lane], 15, 0, buf_val);
|
||||
} else {
|
||||
vdst[lane] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
vdst.write();
|
||||
} // completeAcc
|
||||
// --- Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI class methods ---
|
||||
|
||||
// Build the BUFFER_LOAD_SHORT_D16_HI instruction: a load that
// references memory, tagged with the global segment unless the LDS bit
// is set.
Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI::Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI(
        InFmt_MUBUF *iFmt)
    : Inst_MUBUF(iFmt, "buffer_load_short_d16_hi")
{
    setFlag(MemoryRef);
    setFlag(Load);

    if (!instData.LDS) {
        setFlag(GlobalSegment);
        return;
    }

    // LDS variant: tagged as group segment, with a warning that the
    // mode is not implemented.
    setFlag(GroupSegment);
    warn("BUFFER.LDS not implemented!");
} // Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI
|
||||
|
||||
// The destructor has no work of its own to perform.
Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI::~Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI()
    = default;
|
||||
|
||||
// --- description from .arch file ---
|
||||
// VDATA[31 : 16].b16 = MEM[ADDR].b16;
|
||||
// // VDATA[15:0] is preserved.
|
||||
void
|
||||
Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI::execute(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
Wavefront *wf = gpuDynInst->wavefront();
|
||||
|
||||
if (gpuDynInst->exec_mask.none()) {
|
||||
wf->decVMemInstsIssued();
|
||||
return;
|
||||
}
|
||||
|
||||
gpuDynInst->execUnitId = wf->execUnitId;
|
||||
gpuDynInst->latency.init(gpuDynInst->computeUnit());
|
||||
gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
|
||||
|
||||
ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR);
|
||||
ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1);
|
||||
ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4);
|
||||
ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET);
|
||||
|
||||
rsrcDesc.read();
|
||||
offset.read();
|
||||
|
||||
int inst_offset = instData.OFFSET;
|
||||
|
||||
// For explanation of buffer addressing, see section 9.1.5 in:
|
||||
// https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/
|
||||
// instruction-set-architectures/
|
||||
// amd-instinct-mi300-cdna3-instruction-set-architecture.pdf
|
||||
if (!instData.IDXEN && !instData.OFFEN) {
|
||||
calcAddr<ConstVecOperandU32, ConstVecOperandU32,
|
||||
ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
|
||||
addr0, addr1, rsrcDesc, offset, inst_offset);
|
||||
} else if (!instData.IDXEN && instData.OFFEN) {
|
||||
addr0.read();
|
||||
calcAddr<ConstVecOperandU32, ConstVecOperandU32,
|
||||
ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
|
||||
addr0, addr1, rsrcDesc, offset, inst_offset);
|
||||
} else if (instData.IDXEN && !instData.OFFEN) {
|
||||
addr0.read();
|
||||
calcAddr<ConstVecOperandU32, ConstVecOperandU32,
|
||||
ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
|
||||
addr1, addr0, rsrcDesc, offset, inst_offset);
|
||||
} else {
|
||||
addr0.read();
|
||||
addr1.read();
|
||||
calcAddr<ConstVecOperandU32, ConstVecOperandU32,
|
||||
ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
|
||||
addr1, addr0, rsrcDesc, offset, inst_offset);
|
||||
}
|
||||
|
||||
gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst);
|
||||
} // execute
|
||||
|
||||
void
|
||||
Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI::initiateAcc(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
initMemRead<VecElemU16>(gpuDynInst);
|
||||
} // initiateAcc
|
||||
|
||||
void
|
||||
Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI::completeAcc(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
VecOperandU32 vdst(gpuDynInst, extData.VDATA);
|
||||
|
||||
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
|
||||
if (gpuDynInst->exec_mask[lane]) {
|
||||
if (!oobMask[lane]) {
|
||||
VecElemU16 buf_val = (reinterpret_cast<VecElemU16*>(
|
||||
gpuDynInst->d_data))[lane];
|
||||
replaceBits(vdst[lane], 31, 16, buf_val);
|
||||
} else {
|
||||
vdst[lane] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
vdst.write();
|
||||
} // completeAcc
|
||||
// --- Inst_MUBUF__BUFFER_LOAD_DWORD class methods ---
|
||||
|
||||
Inst_MUBUF__BUFFER_LOAD_DWORD
|
||||
@@ -868,6 +1071,10 @@ namespace VegaISA
|
||||
|
||||
int inst_offset = instData.OFFSET;
|
||||
|
||||
// For explanation of buffer addressing, see section 9.1.5 in:
|
||||
// https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/
|
||||
// instruction-set-architectures/
|
||||
// amd-instinct-mi300-cdna3-instruction-set-architecture.pdf
|
||||
if (!instData.IDXEN && !instData.OFFEN) {
|
||||
calcAddr<ConstVecOperandU32, ConstVecOperandU32,
|
||||
ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
|
||||
|
||||
Reference in New Issue
Block a user