gpu-compute: Invalidate Scalar cache when SQC invalidates (#1093)

The scalar cache is not being invalidated, which causes stale data to be
left in the scalar cache between GPU kernels. This commit sends
invalidates to the scalar cache when the SQC is invalidated. This is a
sufficient baseline for simulation.

Since the number of invalidates might be larger than the mandatory queue
can hold and no flash invalidate mechanism exists in the VIPER protocol,
the command line option for the mandatory queue size is removed, which
is the same behavior as the SQC.

Change-Id: I1723f224711b04caa4c88beccfa8fb73ccf56572
This commit is contained in:
Matthew Poremba
2024-05-06 07:35:38 -07:00
committed by GitHub
parent 36c1ea9c61
commit 0d3d456894
3 changed files with 50 additions and 16 deletions

View File

@@ -497,13 +497,6 @@ def define_options(parser):
parser.add_argument(
"--noL1", action="store_true", default=False, help="bypassL1"
)
parser.add_argument(
"--scalar-buffer-size",
type=int,
default=128,
help="Size of the mandatory queue in the GPU scalar "
"cache controller",
)
parser.add_argument(
"--glc-atomic-latency", type=int, default=1, help="GLC Atomic Latency"
)
@@ -841,9 +834,7 @@ def construct_scalars(options, system, ruby_system, network):
scalar_cntrl.responseToSQC = MessageBuffer(ordered=True)
scalar_cntrl.responseToSQC.in_port = network.out_port
scalar_cntrl.mandatoryQueue = MessageBuffer(
buffer_size=options.scalar_buffer_size
)
scalar_cntrl.mandatoryQueue = MessageBuffer()
return (scalar_sequencers, scalar_cntrl_nodes)

View File

@@ -974,6 +974,14 @@ ComputeUnit::ScalarDataPort::recvTimingResp(PacketPtr pkt)
bool
ComputeUnit::ScalarDataPort::handleResponse(PacketPtr pkt)
{
// From scalar cache invalidate that was issued at kernel start.
if (pkt->req->isKernel()) {
delete pkt->senderState;
delete pkt;
return true;
}
assert(!pkt->req->isKernel());
// retrieve sender state
@@ -1058,6 +1066,9 @@ ComputeUnit::SQCPort::recvTimingResp(PacketPtr pkt)
*/
if (sender_state->wavefront != nullptr) {
computeUnit->handleSQCReturn(pkt);
} else {
delete pkt->senderState;
delete pkt;
}
return true;

View File

@@ -180,7 +180,7 @@ ScalarMemPipeline::injectScalarMemFence(GPUDynInstPtr gpuDynInst,
// invalidation request to 0 and handle it in the sequencer
req->setPaddr(0);
PacketPtr pkt = nullptr;
PacketPtr sqc_pkt = nullptr;
// If kernelMemSync is true, then the invalidation request is from
// kernel launch and is an implicit invalidation. If false, then it is
@@ -189,8 +189,8 @@ ScalarMemPipeline::injectScalarMemFence(GPUDynInstPtr gpuDynInst,
req->setCacheCoherenceFlags(Request::INV_L1);
req->setReqInstSeqNum(gpuDynInst->seqNum());
req->setFlags(Request::KERNEL);
pkt = new Packet(req, MemCmd::MemSyncReq);
pkt->pushSenderState(
sqc_pkt = new Packet(req, MemCmd::MemSyncReq);
sqc_pkt->pushSenderState(
new ComputeUnit::SQCPort::SenderState(
gpuDynInst->wavefront(), nullptr));
} else {
@@ -198,17 +198,49 @@ ScalarMemPipeline::injectScalarMemFence(GPUDynInstPtr gpuDynInst,
req->setReqInstSeqNum(gpuDynInst->seqNum());
pkt = new Packet(req, MemCmd::MemSyncReq);
pkt->pushSenderState(
sqc_pkt = new Packet(req, MemCmd::MemSyncReq);
sqc_pkt->pushSenderState(
new ComputeUnit::SQCPort::SenderState(
gpuDynInst->wavefront(), nullptr));
}
ComputeUnit::SQCPort::MemReqEvent *sqc_event =
new ComputeUnit::SQCPort::MemReqEvent
(computeUnit.sqcPort, pkt);
(computeUnit.sqcPort, sqc_pkt);
computeUnit.schedule(
sqc_event, curTick() + computeUnit.scalar_req_tick_latency);
// When the SQC is invalidated, perform a scalar cache invalidate as well.
// The SQC and Scalar cache are implemented using the same SLICC SM, so this
// invalidate is identical to the SQC invalidate, however we need to make
// a new packet and request as they have different cache destinations.
PacketPtr scalar_pkt = nullptr;
RequestPtr scalar_req(req);
if (kernelMemSync) {
scalar_req->setCacheCoherenceFlags(Request::INV_L1);
scalar_req->setReqInstSeqNum(gpuDynInst->seqNum());
scalar_req->setFlags(Request::KERNEL);
scalar_pkt = new Packet(scalar_req, MemCmd::MemSyncReq);
scalar_pkt->pushSenderState(
new ComputeUnit::ScalarDataPort::SenderState(
gpuDynInst));
} else {
gpuDynInst->setRequestFlags(scalar_req);
scalar_req->setReqInstSeqNum(gpuDynInst->seqNum());
scalar_pkt = new Packet(scalar_req, MemCmd::MemSyncReq);
scalar_pkt->pushSenderState(
new ComputeUnit::ScalarDataPort::SenderState(
gpuDynInst));
}
ComputeUnit::ScalarDataPort::MemReqEvent *scalar_event =
new ComputeUnit::ScalarDataPort::MemReqEvent
(computeUnit.scalarDataPort, scalar_pkt);
computeUnit.schedule(
scalar_event, curTick() + computeUnit.scalar_req_tick_latency);
}
} // namespace gem5