arch-gcn3: Implement large ds_read/write instructions
This implements the 96b and 128b ds_read/write instructions in a similar fashion to the 3- and 4-dword flat_load/store instructions. These instructions are treated as reads/writes of 3 or 4 dwords, instead of as a single 96b/128b memory transaction, due to the limitations of the VecOperand class used in the amdgpu code. To handle the memory transaction as multiple dwords, the patch also adds new initMemRead/initMemWrite functions for ds instructions. These are similar to the functions used by flat instructions for the same purpose. Change-Id: I0f2ba3cb7cf040abb876e6eae55a6d38149ee960 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/48342 Tested-by: kokoro <noreply+kokoro@google.com> Reviewed-by: Alex Dutu <alexandru.dutu@amd.com> Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com> Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
This commit is contained in:
committed by
Matt Sinclair
parent
1415308d10
commit
523a92f7f0
@@ -34335,9 +34335,52 @@ namespace Gcn3ISA
|
||||
void
|
||||
Inst_DS__DS_WRITE_B96::execute(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
panicUnimplemented();
|
||||
Wavefront *wf = gpuDynInst->wavefront();
|
||||
gpuDynInst->execUnitId = wf->execUnitId;
|
||||
gpuDynInst->latency.init(gpuDynInst->computeUnit());
|
||||
gpuDynInst->latency.set(
|
||||
gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
|
||||
ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);
|
||||
ConstVecOperandU32 data0(gpuDynInst, extData.DATA0);
|
||||
ConstVecOperandU32 data1(gpuDynInst, extData.DATA0 + 1);
|
||||
ConstVecOperandU32 data2(gpuDynInst, extData.DATA0 + 2);
|
||||
|
||||
addr.read();
|
||||
data0.read();
|
||||
data1.read();
|
||||
data2.read();
|
||||
|
||||
calcAddr(gpuDynInst, addr);
|
||||
|
||||
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
|
||||
if (gpuDynInst->exec_mask[lane]) {
|
||||
(reinterpret_cast<VecElemU32*>(
|
||||
gpuDynInst->d_data))[lane * 4] = data0[lane];
|
||||
(reinterpret_cast<VecElemU32*>(
|
||||
gpuDynInst->d_data))[lane * 4 + 1] = data1[lane];
|
||||
(reinterpret_cast<VecElemU32*>(
|
||||
gpuDynInst->d_data))[lane * 4 + 2] = data2[lane];
|
||||
}
|
||||
}
|
||||
|
||||
gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
|
||||
}
|
||||
|
||||
void
|
||||
Inst_DS__DS_WRITE_B96::initiateAcc(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
Addr offset0 = instData.OFFSET0;
|
||||
Addr offset1 = instData.OFFSET1;
|
||||
Addr offset = (offset1 << 8) | offset0;
|
||||
|
||||
initMemWrite<3>(gpuDynInst, offset);
|
||||
} // initiateAcc
|
||||
|
||||
void
|
||||
Inst_DS__DS_WRITE_B96::completeAcc(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
} // completeAcc
|
||||
|
||||
Inst_DS__DS_WRITE_B128::Inst_DS__DS_WRITE_B128(InFmt_DS *iFmt)
|
||||
: Inst_DS(iFmt, "ds_write_b128")
|
||||
{
|
||||
@@ -34354,9 +34397,56 @@ namespace Gcn3ISA
|
||||
void
|
||||
Inst_DS__DS_WRITE_B128::execute(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
panicUnimplemented();
|
||||
Wavefront *wf = gpuDynInst->wavefront();
|
||||
gpuDynInst->execUnitId = wf->execUnitId;
|
||||
gpuDynInst->latency.init(gpuDynInst->computeUnit());
|
||||
gpuDynInst->latency.set(
|
||||
gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
|
||||
ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);
|
||||
ConstVecOperandU32 data0(gpuDynInst, extData.DATA0);
|
||||
ConstVecOperandU32 data1(gpuDynInst, extData.DATA0 + 1);
|
||||
ConstVecOperandU32 data2(gpuDynInst, extData.DATA0 + 2);
|
||||
ConstVecOperandU32 data3(gpuDynInst, extData.DATA0 + 3);
|
||||
|
||||
addr.read();
|
||||
data0.read();
|
||||
data1.read();
|
||||
data2.read();
|
||||
data3.read();
|
||||
|
||||
calcAddr(gpuDynInst, addr);
|
||||
|
||||
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
|
||||
if (gpuDynInst->exec_mask[lane]) {
|
||||
(reinterpret_cast<VecElemU32*>(
|
||||
gpuDynInst->d_data))[lane * 4] = data0[lane];
|
||||
(reinterpret_cast<VecElemU32*>(
|
||||
gpuDynInst->d_data))[lane * 4 + 1] = data1[lane];
|
||||
(reinterpret_cast<VecElemU32*>(
|
||||
gpuDynInst->d_data))[lane * 4 + 2] = data2[lane];
|
||||
(reinterpret_cast<VecElemU32*>(
|
||||
gpuDynInst->d_data))[lane * 4 + 3] = data3[lane];
|
||||
}
|
||||
}
|
||||
|
||||
gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
|
||||
}
|
||||
|
||||
void
|
||||
Inst_DS__DS_WRITE_B128::initiateAcc(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
Addr offset0 = instData.OFFSET0;
|
||||
Addr offset1 = instData.OFFSET1;
|
||||
Addr offset = (offset1 << 8) | offset0;
|
||||
|
||||
initMemWrite<4>(gpuDynInst, offset);
|
||||
} // initiateAcc
|
||||
|
||||
void
|
||||
Inst_DS__DS_WRITE_B128::completeAcc(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
} // completeAcc
|
||||
|
||||
Inst_DS__DS_READ_B96::Inst_DS__DS_READ_B96(InFmt_DS *iFmt)
|
||||
: Inst_DS(iFmt, "ds_read_b96")
|
||||
{
|
||||
@@ -34372,7 +34462,51 @@ namespace Gcn3ISA
|
||||
void
|
||||
Inst_DS__DS_READ_B96::execute(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
panicUnimplemented();
|
||||
Wavefront *wf = gpuDynInst->wavefront();
|
||||
gpuDynInst->execUnitId = wf->execUnitId;
|
||||
gpuDynInst->latency.init(gpuDynInst->computeUnit());
|
||||
gpuDynInst->latency.set(
|
||||
gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
|
||||
ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);
|
||||
|
||||
addr.read();
|
||||
|
||||
calcAddr(gpuDynInst, addr);
|
||||
|
||||
gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
|
||||
}
|
||||
|
||||
void
|
||||
Inst_DS__DS_READ_B96::initiateAcc(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
Addr offset0 = instData.OFFSET0;
|
||||
Addr offset1 = instData.OFFSET1;
|
||||
Addr offset = (offset1 << 8) | offset0;
|
||||
|
||||
initMemRead<3>(gpuDynInst, offset);
|
||||
}
|
||||
|
||||
void
|
||||
Inst_DS__DS_READ_B96::completeAcc(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
VecOperandU32 vdst0(gpuDynInst, extData.VDST);
|
||||
VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1);
|
||||
VecOperandU32 vdst2(gpuDynInst, extData.VDST + 2);
|
||||
|
||||
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
|
||||
if (gpuDynInst->exec_mask[lane]) {
|
||||
vdst0[lane] = (reinterpret_cast<VecElemU32*>(
|
||||
gpuDynInst->d_data))[lane * 4];
|
||||
vdst1[lane] = (reinterpret_cast<VecElemU32*>(
|
||||
gpuDynInst->d_data))[lane * 4 + 1];
|
||||
vdst2[lane] = (reinterpret_cast<VecElemU32*>(
|
||||
gpuDynInst->d_data))[lane * 4 + 2];
|
||||
}
|
||||
}
|
||||
|
||||
vdst0.write();
|
||||
vdst1.write();
|
||||
vdst2.write();
|
||||
}
|
||||
|
||||
Inst_DS__DS_READ_B128::Inst_DS__DS_READ_B128(InFmt_DS *iFmt)
|
||||
@@ -34390,9 +34524,57 @@ namespace Gcn3ISA
|
||||
void
|
||||
Inst_DS__DS_READ_B128::execute(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
panicUnimplemented();
|
||||
Wavefront *wf = gpuDynInst->wavefront();
|
||||
gpuDynInst->execUnitId = wf->execUnitId;
|
||||
gpuDynInst->latency.init(gpuDynInst->computeUnit());
|
||||
gpuDynInst->latency.set(
|
||||
gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
|
||||
ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);
|
||||
|
||||
addr.read();
|
||||
|
||||
calcAddr(gpuDynInst, addr);
|
||||
|
||||
gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
|
||||
}
|
||||
|
||||
void
|
||||
Inst_DS__DS_READ_B128::initiateAcc(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
Addr offset0 = instData.OFFSET0;
|
||||
Addr offset1 = instData.OFFSET1;
|
||||
Addr offset = (offset1 << 8) | offset0;
|
||||
|
||||
initMemRead<4>(gpuDynInst, offset);
|
||||
} // initiateAcc
|
||||
|
||||
void
|
||||
Inst_DS__DS_READ_B128::completeAcc(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
VecOperandU32 vdst0(gpuDynInst, extData.VDST);
|
||||
VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1);
|
||||
VecOperandU32 vdst2(gpuDynInst, extData.VDST + 2);
|
||||
VecOperandU32 vdst3(gpuDynInst, extData.VDST + 3);
|
||||
|
||||
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
|
||||
if (gpuDynInst->exec_mask[lane]) {
|
||||
vdst0[lane] = (reinterpret_cast<VecElemU32*>(
|
||||
gpuDynInst->d_data))[lane * 4];
|
||||
vdst1[lane] = (reinterpret_cast<VecElemU32*>(
|
||||
gpuDynInst->d_data))[lane * 4 + 1];
|
||||
vdst2[lane] = (reinterpret_cast<VecElemU32*>(
|
||||
gpuDynInst->d_data))[lane * 4 + 2];
|
||||
vdst3[lane] = (reinterpret_cast<VecElemU32*>(
|
||||
gpuDynInst->d_data))[lane * 4 + 3];
|
||||
}
|
||||
}
|
||||
|
||||
vdst0.write();
|
||||
vdst1.write();
|
||||
vdst2.write();
|
||||
vdst3.write();
|
||||
} // completeAcc
|
||||
|
||||
Inst_MUBUF__BUFFER_LOAD_FORMAT_X
|
||||
::Inst_MUBUF__BUFFER_LOAD_FORMAT_X(InFmt_MUBUF *iFmt)
|
||||
: Inst_MUBUF(iFmt, "buffer_load_format_x")
|
||||
|
||||
@@ -35226,6 +35226,8 @@ namespace Gcn3ISA
|
||||
} // getOperandSize
|
||||
|
||||
void execute(GPUDynInstPtr) override;
|
||||
void initiateAcc(GPUDynInstPtr) override;
|
||||
void completeAcc(GPUDynInstPtr) override;
|
||||
}; // Inst_DS__DS_WRITE_B96
|
||||
|
||||
class Inst_DS__DS_WRITE_B128 : public Inst_DS
|
||||
@@ -35258,6 +35260,8 @@ namespace Gcn3ISA
|
||||
} // getOperandSize
|
||||
|
||||
void execute(GPUDynInstPtr) override;
|
||||
void initiateAcc(GPUDynInstPtr) override;
|
||||
void completeAcc(GPUDynInstPtr) override;
|
||||
}; // Inst_DS__DS_WRITE_B128
|
||||
|
||||
class Inst_DS__DS_READ_B96 : public Inst_DS
|
||||
@@ -35290,6 +35294,8 @@ namespace Gcn3ISA
|
||||
} // getOperandSize
|
||||
|
||||
void execute(GPUDynInstPtr) override;
|
||||
void initiateAcc(GPUDynInstPtr) override;
|
||||
void completeAcc(GPUDynInstPtr) override;
|
||||
}; // Inst_DS__DS_READ_B96
|
||||
|
||||
class Inst_DS__DS_READ_B128 : public Inst_DS
|
||||
@@ -35322,6 +35328,8 @@ namespace Gcn3ISA
|
||||
} // getOperandSize
|
||||
|
||||
void execute(GPUDynInstPtr) override;
|
||||
void initiateAcc(GPUDynInstPtr) override;
|
||||
void completeAcc(GPUDynInstPtr) override;
|
||||
}; // Inst_DS__DS_READ_B128
|
||||
|
||||
class Inst_MUBUF__BUFFER_LOAD_FORMAT_X : public Inst_MUBUF
|
||||
|
||||
@@ -416,6 +416,25 @@ namespace Gcn3ISA
|
||||
}
|
||||
}
|
||||
|
||||
template<int N>
|
||||
void
|
||||
initMemRead(GPUDynInstPtr gpuDynInst, Addr offset)
|
||||
{
|
||||
Wavefront *wf = gpuDynInst->wavefront();
|
||||
|
||||
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
|
||||
if (gpuDynInst->exec_mask[lane]) {
|
||||
Addr vaddr = gpuDynInst->addr[lane] + offset;
|
||||
for (int i = 0; i < N; ++i) {
|
||||
(reinterpret_cast<VecElemU32*>(
|
||||
gpuDynInst->d_data))[lane * N + i]
|
||||
= wf->ldsChunk->read<VecElemU32>(
|
||||
vaddr + i*sizeof(VecElemU32));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
void
|
||||
initDualMemRead(GPUDynInstPtr gpuDynInst, Addr offset0, Addr offset1)
|
||||
@@ -450,6 +469,25 @@ namespace Gcn3ISA
|
||||
}
|
||||
}
|
||||
|
||||
template<int N>
|
||||
void
|
||||
initMemWrite(GPUDynInstPtr gpuDynInst, Addr offset)
|
||||
{
|
||||
Wavefront *wf = gpuDynInst->wavefront();
|
||||
|
||||
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
|
||||
if (gpuDynInst->exec_mask[lane]) {
|
||||
Addr vaddr = gpuDynInst->addr[lane] + offset;
|
||||
for (int i = 0; i < N; ++i) {
|
||||
wf->ldsChunk->write<VecElemU32>(
|
||||
vaddr + i*sizeof(VecElemU32),
|
||||
(reinterpret_cast<VecElemU32*>(
|
||||
gpuDynInst->d_data))[lane * N + i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
void
|
||||
initDualMemWrite(GPUDynInstPtr gpuDynInst, Addr offset0, Addr offset1)
|
||||
|
||||
Reference in New Issue
Block a user