arch-gcn3: implement multi-dword buffer loads and stores
Add support for all multi-dword buffer loads and stores:
buffer_load_dword x2, x3, and x4, and buffer_store_dword x2, x3, and x4.

Change-Id: I4017b6b4f625fc92002ce8ade695ae29700fa55e
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29946
Maintainer: Anthony Gutierrez <anthony.gutierrez@amd.com>
Tested-by: kokoro <noreply+kokoro@google.com>
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
This commit is contained in:
committed by
Anthony Gutierrez
parent
0c5d671ea1
commit
ed3135ea6a
@@ -34777,7 +34777,11 @@ namespace Gcn3ISA
|
||||
{
|
||||
setFlag(MemoryRef);
|
||||
setFlag(Load);
|
||||
setFlag(GlobalSegment);
|
||||
if (instData.LDS) {
|
||||
setFlag(GroupSegment);
|
||||
} else {
|
||||
setFlag(GlobalSegment);
|
||||
}
|
||||
} // Inst_MUBUF__BUFFER_LOAD_DWORDX2
|
||||
|
||||
Inst_MUBUF__BUFFER_LOAD_DWORDX2::~Inst_MUBUF__BUFFER_LOAD_DWORDX2()
|
||||
@@ -34788,17 +34792,88 @@ namespace Gcn3ISA
|
||||
void
|
||||
Inst_MUBUF__BUFFER_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
panicUnimplemented();
|
||||
}
|
||||
Wavefront *wf = gpuDynInst->wavefront();
|
||||
gpuDynInst->execUnitId = wf->execUnitId;
|
||||
gpuDynInst->exec_mask = wf->execMask();
|
||||
gpuDynInst->latency.init(gpuDynInst->computeUnit());
|
||||
gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
|
||||
|
||||
ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR);
|
||||
ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1);
|
||||
ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4);
|
||||
ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET);
|
||||
|
||||
rsrcDesc.read();
|
||||
offset.read();
|
||||
|
||||
int inst_offset = instData.OFFSET;
|
||||
|
||||
if (!instData.IDXEN && !instData.OFFEN) {
|
||||
calcAddr<ConstVecOperandU32, ConstVecOperandU32,
|
||||
ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
|
||||
addr0, addr1, rsrcDesc, offset, inst_offset);
|
||||
} else if (!instData.IDXEN && instData.OFFEN) {
|
||||
addr0.read();
|
||||
calcAddr<ConstVecOperandU32, ConstVecOperandU32,
|
||||
ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
|
||||
addr0, addr1, rsrcDesc, offset, inst_offset);
|
||||
} else if (instData.IDXEN && !instData.OFFEN) {
|
||||
addr0.read();
|
||||
calcAddr<ConstVecOperandU32, ConstVecOperandU32,
|
||||
ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
|
||||
addr1, addr0, rsrcDesc, offset, inst_offset);
|
||||
} else {
|
||||
addr0.read();
|
||||
addr1.read();
|
||||
calcAddr<ConstVecOperandU32, ConstVecOperandU32,
|
||||
ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
|
||||
addr1, addr0, rsrcDesc, offset, inst_offset);
|
||||
}
|
||||
|
||||
if (isLocalMem()) {
|
||||
gpuDynInst->computeUnit()->localMemoryPipe
|
||||
.issueRequest(gpuDynInst);
|
||||
wf->rdLmReqsInPipe--;
|
||||
wf->outstandingReqsRdLm++;
|
||||
} else {
|
||||
gpuDynInst->computeUnit()->globalMemoryPipe
|
||||
.issueRequest(gpuDynInst);
|
||||
wf->rdGmReqsInPipe--;
|
||||
wf->outstandingReqsRdGm++;
|
||||
}
|
||||
|
||||
wf->outstandingReqs++;
|
||||
wf->validateRequestCounters();
|
||||
} // execute
|
||||
|
||||
void
|
||||
Inst_MUBUF__BUFFER_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
initMemRead<2>(gpuDynInst);
|
||||
} // initiateAcc
|
||||
|
||||
void
|
||||
Inst_MUBUF__BUFFER_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
VecOperandU32 vdst0(gpuDynInst, extData.VDATA);
|
||||
VecOperandU32 vdst1(gpuDynInst, extData.VDATA + 1);
|
||||
|
||||
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
|
||||
if (gpuDynInst->exec_mask[lane]) {
|
||||
if (!oobMask[lane]) {
|
||||
vdst0[lane] = (reinterpret_cast<VecElemU32*>(
|
||||
gpuDynInst->d_data))[lane * 2];
|
||||
vdst1[lane] = (reinterpret_cast<VecElemU32*>(
|
||||
gpuDynInst->d_data))[lane * 2 + 1];
|
||||
} else {
|
||||
vdst0[lane] = 0;
|
||||
vdst1[lane] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
vdst0.write();
|
||||
vdst1.write();
|
||||
} // completeAcc
|
||||
|
||||
Inst_MUBUF__BUFFER_LOAD_DWORDX3
|
||||
@@ -34807,7 +34882,11 @@ namespace Gcn3ISA
|
||||
{
|
||||
setFlag(MemoryRef);
|
||||
setFlag(Load);
|
||||
setFlag(GlobalSegment);
|
||||
if (instData.LDS) {
|
||||
setFlag(GroupSegment);
|
||||
} else {
|
||||
setFlag(GlobalSegment);
|
||||
}
|
||||
} // Inst_MUBUF__BUFFER_LOAD_DWORDX3
|
||||
|
||||
Inst_MUBUF__BUFFER_LOAD_DWORDX3::~Inst_MUBUF__BUFFER_LOAD_DWORDX3()
|
||||
@@ -34818,17 +34897,93 @@ namespace Gcn3ISA
|
||||
void
|
||||
Inst_MUBUF__BUFFER_LOAD_DWORDX3::execute(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
panicUnimplemented();
|
||||
}
|
||||
Wavefront *wf = gpuDynInst->wavefront();
|
||||
gpuDynInst->execUnitId = wf->execUnitId;
|
||||
gpuDynInst->exec_mask = wf->execMask();
|
||||
gpuDynInst->latency.init(gpuDynInst->computeUnit());
|
||||
gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
|
||||
|
||||
ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR);
|
||||
ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1);
|
||||
ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4);
|
||||
ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET);
|
||||
|
||||
rsrcDesc.read();
|
||||
offset.read();
|
||||
|
||||
int inst_offset = instData.OFFSET;
|
||||
|
||||
if (!instData.IDXEN && !instData.OFFEN) {
|
||||
calcAddr<ConstVecOperandU32, ConstVecOperandU32,
|
||||
ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
|
||||
addr0, addr1, rsrcDesc, offset, inst_offset);
|
||||
} else if (!instData.IDXEN && instData.OFFEN) {
|
||||
addr0.read();
|
||||
calcAddr<ConstVecOperandU32, ConstVecOperandU32,
|
||||
ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
|
||||
addr0, addr1, rsrcDesc, offset, inst_offset);
|
||||
} else if (instData.IDXEN && !instData.OFFEN) {
|
||||
addr0.read();
|
||||
calcAddr<ConstVecOperandU32, ConstVecOperandU32,
|
||||
ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
|
||||
addr1, addr0, rsrcDesc, offset, inst_offset);
|
||||
} else {
|
||||
addr0.read();
|
||||
addr1.read();
|
||||
calcAddr<ConstVecOperandU32, ConstVecOperandU32,
|
||||
ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
|
||||
addr1, addr0, rsrcDesc, offset, inst_offset);
|
||||
}
|
||||
|
||||
if (isLocalMem()) {
|
||||
gpuDynInst->computeUnit()->localMemoryPipe
|
||||
.issueRequest(gpuDynInst);
|
||||
wf->rdLmReqsInPipe--;
|
||||
wf->outstandingReqsRdLm++;
|
||||
} else {
|
||||
gpuDynInst->computeUnit()->globalMemoryPipe
|
||||
.issueRequest(gpuDynInst);
|
||||
wf->rdGmReqsInPipe--;
|
||||
wf->outstandingReqsRdGm++;
|
||||
}
|
||||
|
||||
wf->outstandingReqs++;
|
||||
wf->validateRequestCounters();
|
||||
} // execute
|
||||
|
||||
void
|
||||
Inst_MUBUF__BUFFER_LOAD_DWORDX3::initiateAcc(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
initMemRead<3>(gpuDynInst);
|
||||
} // initiateAcc
|
||||
|
||||
void
|
||||
Inst_MUBUF__BUFFER_LOAD_DWORDX3::completeAcc(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
VecOperandU32 vdst0(gpuDynInst, extData.VDATA);
|
||||
VecOperandU32 vdst1(gpuDynInst, extData.VDATA + 1);
|
||||
VecOperandU32 vdst2(gpuDynInst, extData.VDATA + 2);
|
||||
|
||||
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
|
||||
if (gpuDynInst->exec_mask[lane]) {
|
||||
if (!oobMask[lane]) {
|
||||
vdst0[lane] = (reinterpret_cast<VecElemU32*>(
|
||||
gpuDynInst->d_data))[lane * 3];
|
||||
vdst1[lane] = (reinterpret_cast<VecElemU32*>(
|
||||
gpuDynInst->d_data))[lane * 3 + 1];
|
||||
vdst2[lane] = (reinterpret_cast<VecElemU32*>(
|
||||
gpuDynInst->d_data))[lane * 3 + 2];
|
||||
} else {
|
||||
vdst0[lane] = 0;
|
||||
vdst1[lane] = 0;
|
||||
vdst2[lane] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
vdst0.write();
|
||||
vdst1.write();
|
||||
vdst2.write();
|
||||
} // completeAcc
|
||||
|
||||
Inst_MUBUF__BUFFER_LOAD_DWORDX4
|
||||
@@ -34837,7 +34992,11 @@ namespace Gcn3ISA
|
||||
{
|
||||
setFlag(MemoryRef);
|
||||
setFlag(Load);
|
||||
setFlag(GlobalSegment);
|
||||
if (instData.LDS) {
|
||||
setFlag(GroupSegment);
|
||||
} else {
|
||||
setFlag(GlobalSegment);
|
||||
}
|
||||
} // Inst_MUBUF__BUFFER_LOAD_DWORDX4
|
||||
|
||||
Inst_MUBUF__BUFFER_LOAD_DWORDX4::~Inst_MUBUF__BUFFER_LOAD_DWORDX4()
|
||||
@@ -34848,17 +35007,98 @@ namespace Gcn3ISA
|
||||
void
|
||||
Inst_MUBUF__BUFFER_LOAD_DWORDX4::execute(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
panicUnimplemented();
|
||||
}
|
||||
Wavefront *wf = gpuDynInst->wavefront();
|
||||
gpuDynInst->execUnitId = wf->execUnitId;
|
||||
gpuDynInst->exec_mask = wf->execMask();
|
||||
gpuDynInst->latency.init(gpuDynInst->computeUnit());
|
||||
gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
|
||||
|
||||
ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR);
|
||||
ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1);
|
||||
ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4);
|
||||
ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET);
|
||||
|
||||
rsrcDesc.read();
|
||||
offset.read();
|
||||
|
||||
int inst_offset = instData.OFFSET;
|
||||
|
||||
if (!instData.IDXEN && !instData.OFFEN) {
|
||||
calcAddr<ConstVecOperandU32, ConstVecOperandU32,
|
||||
ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
|
||||
addr0, addr1, rsrcDesc, offset, inst_offset);
|
||||
} else if (!instData.IDXEN && instData.OFFEN) {
|
||||
addr0.read();
|
||||
calcAddr<ConstVecOperandU32, ConstVecOperandU32,
|
||||
ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
|
||||
addr0, addr1, rsrcDesc, offset, inst_offset);
|
||||
} else if (instData.IDXEN && !instData.OFFEN) {
|
||||
addr0.read();
|
||||
calcAddr<ConstVecOperandU32, ConstVecOperandU32,
|
||||
ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
|
||||
addr1, addr0, rsrcDesc, offset, inst_offset);
|
||||
} else {
|
||||
addr0.read();
|
||||
addr1.read();
|
||||
calcAddr<ConstVecOperandU32, ConstVecOperandU32,
|
||||
ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
|
||||
addr1, addr0, rsrcDesc, offset, inst_offset);
|
||||
}
|
||||
|
||||
if (isLocalMem()) {
|
||||
gpuDynInst->computeUnit()->localMemoryPipe
|
||||
.issueRequest(gpuDynInst);
|
||||
wf->rdLmReqsInPipe--;
|
||||
wf->outstandingReqsRdLm++;
|
||||
} else {
|
||||
gpuDynInst->computeUnit()->globalMemoryPipe
|
||||
.issueRequest(gpuDynInst);
|
||||
wf->rdGmReqsInPipe--;
|
||||
wf->outstandingReqsRdGm++;
|
||||
}
|
||||
|
||||
wf->outstandingReqs++;
|
||||
wf->validateRequestCounters();
|
||||
} // execute
|
||||
|
||||
void
|
||||
Inst_MUBUF__BUFFER_LOAD_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
initMemRead<4>(gpuDynInst);
|
||||
} // initiateAcc
|
||||
|
||||
void
|
||||
Inst_MUBUF__BUFFER_LOAD_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
VecOperandU32 vdst0(gpuDynInst, extData.VDATA);
|
||||
VecOperandU32 vdst1(gpuDynInst, extData.VDATA + 1);
|
||||
VecOperandU32 vdst2(gpuDynInst, extData.VDATA + 2);
|
||||
VecOperandU32 vdst3(gpuDynInst, extData.VDATA + 3);
|
||||
|
||||
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
|
||||
if (gpuDynInst->exec_mask[lane]) {
|
||||
if (!oobMask[lane]) {
|
||||
vdst0[lane] = (reinterpret_cast<VecElemU32*>(
|
||||
gpuDynInst->d_data))[lane * 4];
|
||||
vdst1[lane] = (reinterpret_cast<VecElemU32*>(
|
||||
gpuDynInst->d_data))[lane * 4 + 1];
|
||||
vdst2[lane] = (reinterpret_cast<VecElemU32*>(
|
||||
gpuDynInst->d_data))[lane * 4 + 2];
|
||||
vdst3[lane] = (reinterpret_cast<VecElemU32*>(
|
||||
gpuDynInst->d_data))[lane * 4 + 3];
|
||||
} else {
|
||||
vdst0[lane] = 0;
|
||||
vdst1[lane] = 0;
|
||||
vdst2[lane] = 0;
|
||||
vdst3[lane] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
vdst0.write();
|
||||
vdst1.write();
|
||||
vdst2.write();
|
||||
vdst3.write();
|
||||
} // completeAcc
|
||||
|
||||
Inst_MUBUF__BUFFER_STORE_BYTE
|
||||
@@ -35155,7 +35395,11 @@ namespace Gcn3ISA
|
||||
{
|
||||
setFlag(MemoryRef);
|
||||
setFlag(Store);
|
||||
setFlag(GlobalSegment);
|
||||
if (instData.LDS) {
|
||||
setFlag(GroupSegment);
|
||||
} else {
|
||||
setFlag(GlobalSegment);
|
||||
}
|
||||
} // Inst_MUBUF__BUFFER_STORE_DWORDX2
|
||||
|
||||
Inst_MUBUF__BUFFER_STORE_DWORDX2::~Inst_MUBUF__BUFFER_STORE_DWORDX2()
|
||||
@@ -35166,12 +35410,77 @@ namespace Gcn3ISA
|
||||
void
|
||||
Inst_MUBUF__BUFFER_STORE_DWORDX2::execute(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
panicUnimplemented();
|
||||
}
|
||||
Wavefront *wf = gpuDynInst->wavefront();
|
||||
gpuDynInst->execUnitId = wf->execUnitId;
|
||||
gpuDynInst->exec_mask = wf->execMask();
|
||||
gpuDynInst->latency.init(gpuDynInst->computeUnit());
|
||||
gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
|
||||
|
||||
ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR);
|
||||
ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1);
|
||||
ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4);
|
||||
ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET);
|
||||
ConstVecOperandU32 data0(gpuDynInst, extData.VDATA);
|
||||
ConstVecOperandU32 data1(gpuDynInst, extData.VDATA + 1);
|
||||
|
||||
rsrcDesc.read();
|
||||
offset.read();
|
||||
data0.read();
|
||||
data1.read();
|
||||
|
||||
int inst_offset = instData.OFFSET;
|
||||
|
||||
if (!instData.IDXEN && !instData.OFFEN) {
|
||||
calcAddr<ConstVecOperandU32, ConstVecOperandU32,
|
||||
ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
|
||||
addr0, addr1, rsrcDesc, offset, inst_offset);
|
||||
} else if (!instData.IDXEN && instData.OFFEN) {
|
||||
addr0.read();
|
||||
calcAddr<ConstVecOperandU32, ConstVecOperandU32,
|
||||
ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
|
||||
addr0, addr1, rsrcDesc, offset, inst_offset);
|
||||
} else if (instData.IDXEN && !instData.OFFEN) {
|
||||
addr0.read();
|
||||
calcAddr<ConstVecOperandU32, ConstVecOperandU32,
|
||||
ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
|
||||
addr1, addr0, rsrcDesc, offset, inst_offset);
|
||||
} else {
|
||||
addr0.read();
|
||||
addr1.read();
|
||||
calcAddr<ConstVecOperandU32, ConstVecOperandU32,
|
||||
ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
|
||||
addr1, addr0, rsrcDesc, offset, inst_offset);
|
||||
}
|
||||
|
||||
if (isLocalMem()) {
|
||||
gpuDynInst->computeUnit()->localMemoryPipe
|
||||
.issueRequest(gpuDynInst);
|
||||
wf->wrLmReqsInPipe--;
|
||||
wf->outstandingReqsWrLm++;
|
||||
} else {
|
||||
gpuDynInst->computeUnit()->globalMemoryPipe
|
||||
.issueRequest(gpuDynInst);
|
||||
wf->wrGmReqsInPipe--;
|
||||
wf->outstandingReqsWrGm++;
|
||||
}
|
||||
|
||||
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
|
||||
if (gpuDynInst->exec_mask[lane]) {
|
||||
(reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane * 4]
|
||||
= data0[lane];
|
||||
(reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane*4 + 1]
|
||||
= data1[lane];
|
||||
}
|
||||
}
|
||||
|
||||
wf->outstandingReqs++;
|
||||
wf->validateRequestCounters();
|
||||
} // execute
|
||||
|
||||
void
|
||||
Inst_MUBUF__BUFFER_STORE_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
initMemWrite<2>(gpuDynInst);
|
||||
} // initiateAcc
|
||||
|
||||
void
|
||||
@@ -35185,7 +35494,11 @@ namespace Gcn3ISA
|
||||
{
|
||||
setFlag(MemoryRef);
|
||||
setFlag(Store);
|
||||
setFlag(GlobalSegment);
|
||||
if (instData.LDS) {
|
||||
setFlag(GroupSegment);
|
||||
} else {
|
||||
setFlag(GlobalSegment);
|
||||
}
|
||||
} // Inst_MUBUF__BUFFER_STORE_DWORDX3
|
||||
|
||||
Inst_MUBUF__BUFFER_STORE_DWORDX3::~Inst_MUBUF__BUFFER_STORE_DWORDX3()
|
||||
@@ -35196,12 +35509,81 @@ namespace Gcn3ISA
|
||||
void
|
||||
Inst_MUBUF__BUFFER_STORE_DWORDX3::execute(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
panicUnimplemented();
|
||||
}
|
||||
Wavefront *wf = gpuDynInst->wavefront();
|
||||
gpuDynInst->execUnitId = wf->execUnitId;
|
||||
gpuDynInst->exec_mask = wf->execMask();
|
||||
gpuDynInst->latency.init(gpuDynInst->computeUnit());
|
||||
gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
|
||||
|
||||
ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR);
|
||||
ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1);
|
||||
ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4);
|
||||
ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET);
|
||||
ConstVecOperandU32 data0(gpuDynInst, extData.VDATA);
|
||||
ConstVecOperandU32 data1(gpuDynInst, extData.VDATA + 1);
|
||||
ConstVecOperandU32 data2(gpuDynInst, extData.VDATA + 2);
|
||||
|
||||
rsrcDesc.read();
|
||||
offset.read();
|
||||
data0.read();
|
||||
data1.read();
|
||||
data2.read();
|
||||
|
||||
int inst_offset = instData.OFFSET;
|
||||
|
||||
if (!instData.IDXEN && !instData.OFFEN) {
|
||||
calcAddr<ConstVecOperandU32, ConstVecOperandU32,
|
||||
ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
|
||||
addr0, addr1, rsrcDesc, offset, inst_offset);
|
||||
} else if (!instData.IDXEN && instData.OFFEN) {
|
||||
addr0.read();
|
||||
calcAddr<ConstVecOperandU32, ConstVecOperandU32,
|
||||
ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
|
||||
addr0, addr1, rsrcDesc, offset, inst_offset);
|
||||
} else if (instData.IDXEN && !instData.OFFEN) {
|
||||
addr0.read();
|
||||
calcAddr<ConstVecOperandU32, ConstVecOperandU32,
|
||||
ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
|
||||
addr1, addr0, rsrcDesc, offset, inst_offset);
|
||||
} else {
|
||||
addr0.read();
|
||||
addr1.read();
|
||||
calcAddr<ConstVecOperandU32, ConstVecOperandU32,
|
||||
ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
|
||||
addr1, addr0, rsrcDesc, offset, inst_offset);
|
||||
}
|
||||
|
||||
if (isLocalMem()) {
|
||||
gpuDynInst->computeUnit()->localMemoryPipe
|
||||
.issueRequest(gpuDynInst);
|
||||
wf->wrLmReqsInPipe--;
|
||||
wf->outstandingReqsWrLm++;
|
||||
} else {
|
||||
gpuDynInst->computeUnit()->globalMemoryPipe
|
||||
.issueRequest(gpuDynInst);
|
||||
wf->wrGmReqsInPipe--;
|
||||
wf->outstandingReqsWrGm++;
|
||||
}
|
||||
|
||||
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
|
||||
if (gpuDynInst->exec_mask[lane]) {
|
||||
(reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane * 4]
|
||||
= data0[lane];
|
||||
(reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane*4 + 1]
|
||||
= data1[lane];
|
||||
(reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane*4 + 2]
|
||||
= data2[lane];
|
||||
}
|
||||
}
|
||||
|
||||
wf->outstandingReqs++;
|
||||
wf->validateRequestCounters();
|
||||
} // execute
|
||||
|
||||
void
|
||||
Inst_MUBUF__BUFFER_STORE_DWORDX3::initiateAcc(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
initMemWrite<3>(gpuDynInst);
|
||||
} // initiateAcc
|
||||
|
||||
void
|
||||
@@ -35215,7 +35597,11 @@ namespace Gcn3ISA
|
||||
{
|
||||
setFlag(MemoryRef);
|
||||
setFlag(Store);
|
||||
setFlag(GlobalSegment);
|
||||
if (instData.LDS) {
|
||||
setFlag(GroupSegment);
|
||||
} else {
|
||||
setFlag(GlobalSegment);
|
||||
}
|
||||
} // Inst_MUBUF__BUFFER_STORE_DWORDX4
|
||||
|
||||
Inst_MUBUF__BUFFER_STORE_DWORDX4::~Inst_MUBUF__BUFFER_STORE_DWORDX4()
|
||||
@@ -35226,12 +35612,85 @@ namespace Gcn3ISA
|
||||
void
|
||||
Inst_MUBUF__BUFFER_STORE_DWORDX4::execute(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
panicUnimplemented();
|
||||
}
|
||||
Wavefront *wf = gpuDynInst->wavefront();
|
||||
gpuDynInst->execUnitId = wf->execUnitId;
|
||||
gpuDynInst->exec_mask = wf->execMask();
|
||||
gpuDynInst->latency.init(gpuDynInst->computeUnit());
|
||||
gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
|
||||
|
||||
ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR);
|
||||
ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1);
|
||||
ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4);
|
||||
ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET);
|
||||
ConstVecOperandU32 data0(gpuDynInst, extData.VDATA);
|
||||
ConstVecOperandU32 data1(gpuDynInst, extData.VDATA + 1);
|
||||
ConstVecOperandU32 data2(gpuDynInst, extData.VDATA + 2);
|
||||
ConstVecOperandU32 data3(gpuDynInst, extData.VDATA + 3);
|
||||
|
||||
rsrcDesc.read();
|
||||
offset.read();
|
||||
data0.read();
|
||||
data1.read();
|
||||
data2.read();
|
||||
data3.read();
|
||||
|
||||
int inst_offset = instData.OFFSET;
|
||||
|
||||
if (!instData.IDXEN && !instData.OFFEN) {
|
||||
calcAddr<ConstVecOperandU32, ConstVecOperandU32,
|
||||
ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
|
||||
addr0, addr1, rsrcDesc, offset, inst_offset);
|
||||
} else if (!instData.IDXEN && instData.OFFEN) {
|
||||
addr0.read();
|
||||
calcAddr<ConstVecOperandU32, ConstVecOperandU32,
|
||||
ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
|
||||
addr0, addr1, rsrcDesc, offset, inst_offset);
|
||||
} else if (instData.IDXEN && !instData.OFFEN) {
|
||||
addr0.read();
|
||||
calcAddr<ConstVecOperandU32, ConstVecOperandU32,
|
||||
ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
|
||||
addr1, addr0, rsrcDesc, offset, inst_offset);
|
||||
} else {
|
||||
addr0.read();
|
||||
addr1.read();
|
||||
calcAddr<ConstVecOperandU32, ConstVecOperandU32,
|
||||
ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
|
||||
addr1, addr0, rsrcDesc, offset, inst_offset);
|
||||
}
|
||||
|
||||
if (isLocalMem()) {
|
||||
gpuDynInst->computeUnit()->localMemoryPipe
|
||||
.issueRequest(gpuDynInst);
|
||||
wf->wrLmReqsInPipe--;
|
||||
wf->outstandingReqsWrLm++;
|
||||
} else {
|
||||
gpuDynInst->computeUnit()->globalMemoryPipe
|
||||
.issueRequest(gpuDynInst);
|
||||
wf->wrGmReqsInPipe--;
|
||||
wf->outstandingReqsWrGm++;
|
||||
}
|
||||
|
||||
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
|
||||
if (gpuDynInst->exec_mask[lane]) {
|
||||
(reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane * 4]
|
||||
= data0[lane];
|
||||
(reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane*4 + 1]
|
||||
= data1[lane];
|
||||
(reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane*4 + 2]
|
||||
= data2[lane];
|
||||
(reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane*4 + 3]
|
||||
= data3[lane];
|
||||
}
|
||||
}
|
||||
|
||||
wf->outstandingReqs++;
|
||||
wf->validateRequestCounters();
|
||||
} // execute
|
||||
|
||||
void
|
||||
Inst_MUBUF__BUFFER_STORE_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
initMemWrite<4>(gpuDynInst);
|
||||
} // initiateAcc
|
||||
|
||||
void
|
||||
|
||||
@@ -505,6 +505,20 @@ namespace Gcn3ISA
|
||||
gpuDynInst->exec_mask = old_exec_mask;
|
||||
}
|
||||
|
||||
|
||||
template<int N>
|
||||
void
|
||||
initMemRead(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
// temporarily modify exec_mask to supress memory accesses to oob
|
||||
// regions. Only issue memory requests for lanes that have their
|
||||
// exec_mask set and are not out of bounds.
|
||||
VectorMask old_exec_mask = gpuDynInst->exec_mask;
|
||||
gpuDynInst->exec_mask &= ~oobMask;
|
||||
initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::ReadReq);
|
||||
gpuDynInst->exec_mask = old_exec_mask;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
void
|
||||
initMemWrite(GPUDynInstPtr gpuDynInst)
|
||||
@@ -518,6 +532,19 @@ namespace Gcn3ISA
|
||||
gpuDynInst->exec_mask = old_exec_mask;
|
||||
}
|
||||
|
||||
template<int N>
|
||||
void
|
||||
initMemWrite(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
// temporarily modify exec_mask to supress memory accesses to oob
|
||||
// regions. Only issue memory requests for lanes that have their
|
||||
// exec_mask set and are not out of bounds.
|
||||
VectorMask old_exec_mask = gpuDynInst->exec_mask;
|
||||
gpuDynInst->exec_mask &= ~oobMask;
|
||||
initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::WriteReq);
|
||||
gpuDynInst->exec_mask = old_exec_mask;
|
||||
}
|
||||
|
||||
void
|
||||
injectGlobalMemFence(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user