diff --git a/configs/ruby/GPU_VIPER.py b/configs/ruby/GPU_VIPER.py index 0818b7f0eb..d0a0c61083 100644 --- a/configs/ruby/GPU_VIPER.py +++ b/configs/ruby/GPU_VIPER.py @@ -497,13 +497,6 @@ def define_options(parser): parser.add_argument( "--noL1", action="store_true", default=False, help="bypassL1" ) - parser.add_argument( - "--scalar-buffer-size", - type=int, - default=128, - help="Size of the mandatory queue in the GPU scalar " - "cache controller", - ) parser.add_argument( "--glc-atomic-latency", type=int, default=1, help="GLC Atomic Latency" ) @@ -841,9 +834,7 @@ def construct_scalars(options, system, ruby_system, network): scalar_cntrl.responseToSQC = MessageBuffer(ordered=True) scalar_cntrl.responseToSQC.in_port = network.out_port - scalar_cntrl.mandatoryQueue = MessageBuffer( - buffer_size=options.scalar_buffer_size - ) + scalar_cntrl.mandatoryQueue = MessageBuffer() return (scalar_sequencers, scalar_cntrl_nodes) diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index daad5e9b40..1589e4564e 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -974,6 +974,14 @@ ComputeUnit::ScalarDataPort::recvTimingResp(PacketPtr pkt) bool ComputeUnit::ScalarDataPort::handleResponse(PacketPtr pkt) { + // From scalar cache invalidate that was issued at kernel start. 
+ if (pkt->req->isKernel()) { + delete pkt->senderState; + delete pkt; + + return true; + } + assert(!pkt->req->isKernel()); // retrieve sender state @@ -1058,6 +1066,9 @@ ComputeUnit::SQCPort::recvTimingResp(PacketPtr pkt) */ if (sender_state->wavefront != nullptr) { computeUnit->handleSQCReturn(pkt); + } else { + delete pkt->senderState; + delete pkt; } return true; diff --git a/src/gpu-compute/scalar_memory_pipeline.cc b/src/gpu-compute/scalar_memory_pipeline.cc index 54819e7d3f..cd8dd30b00 100644 --- a/src/gpu-compute/scalar_memory_pipeline.cc +++ b/src/gpu-compute/scalar_memory_pipeline.cc @@ -180,7 +180,7 @@ ScalarMemPipeline::injectScalarMemFence(GPUDynInstPtr gpuDynInst, // invalidation request to 0 and handle it in the sequencer req->setPaddr(0); - PacketPtr pkt = nullptr; + PacketPtr sqc_pkt = nullptr; // If kernelMemSync is true, then the invalidation request is from // kernel launch and is an implicit invalidation.If false, then it is @@ -189,8 +189,8 @@ ScalarMemPipeline::injectScalarMemFence(GPUDynInstPtr gpuDynInst, req->setCacheCoherenceFlags(Request::INV_L1); req->setReqInstSeqNum(gpuDynInst->seqNum()); req->setFlags(Request::KERNEL); - pkt = new Packet(req, MemCmd::MemSyncReq); - pkt->pushSenderState( + sqc_pkt = new Packet(req, MemCmd::MemSyncReq); + sqc_pkt->pushSenderState( new ComputeUnit::SQCPort::SenderState( gpuDynInst->wavefront(), nullptr)); } else { @@ -198,17 +198,49 @@ ScalarMemPipeline::injectScalarMemFence(GPUDynInstPtr gpuDynInst, req->setReqInstSeqNum(gpuDynInst->seqNum()); - pkt = new Packet(req, MemCmd::MemSyncReq); - pkt->pushSenderState( + sqc_pkt = new Packet(req, MemCmd::MemSyncReq); + sqc_pkt->pushSenderState( new ComputeUnit::SQCPort::SenderState( gpuDynInst->wavefront(), nullptr)); } ComputeUnit::SQCPort::MemReqEvent *sqc_event = new ComputeUnit::SQCPort::MemReqEvent - (computeUnit.sqcPort, pkt); + (computeUnit.sqcPort, sqc_pkt); computeUnit.schedule( sqc_event, curTick() + computeUnit.scalar_req_tick_latency); + + // When 
the SQC is invalidated, perform a scalar cache invalidate as well. + // The SQC and Scalar cache are implemented using the same SLICC SM, so this + // invalidate is identical to the SQC invalidate, however we need to make + // a new packet and request as they have different cache destinations. + PacketPtr scalar_pkt = nullptr; + RequestPtr scalar_req = std::make_shared<Request>(*req); + + if (kernelMemSync) { + scalar_req->setCacheCoherenceFlags(Request::INV_L1); + scalar_req->setReqInstSeqNum(gpuDynInst->seqNum()); + scalar_req->setFlags(Request::KERNEL); + scalar_pkt = new Packet(scalar_req, MemCmd::MemSyncReq); + scalar_pkt->pushSenderState( + new ComputeUnit::ScalarDataPort::SenderState( + gpuDynInst)); + } else { + gpuDynInst->setRequestFlags(scalar_req); + + scalar_req->setReqInstSeqNum(gpuDynInst->seqNum()); + + scalar_pkt = new Packet(scalar_req, MemCmd::MemSyncReq); + scalar_pkt->pushSenderState( + new ComputeUnit::ScalarDataPort::SenderState( + gpuDynInst)); + } + + ComputeUnit::ScalarDataPort::MemReqEvent *scalar_event = + new ComputeUnit::ScalarDataPort::MemReqEvent + (computeUnit.scalarDataPort, scalar_pkt); + computeUnit.schedule( + scalar_event, curTick() + computeUnit.scalar_req_tick_latency); } } // namespace gem5