diff --git a/src/dev/amdgpu/amdgpu_device.cc b/src/dev/amdgpu/amdgpu_device.cc index 5ddd7756ba..f5bf0192bc 100644 --- a/src/dev/amdgpu/amdgpu_device.cc +++ b/src/dev/amdgpu/amdgpu_device.cc @@ -420,6 +420,12 @@ AMDGPUDevice::writeFrame(PacketPtr pkt, Addr offset) { DPRINTF(AMDGPUDevice, "Wrote framebuffer address %#lx\n", offset); + for (auto& cu: CP()->shader()->cuList) { + auto system = CP()->shader()->gpuCmdProc.system(); + Addr aligned_addr = offset & ~(system->cacheLineSize() - 1); + cu->sendInvL2(aligned_addr); + } + Addr aperture = gpuvm.getFrameAperture(offset); Addr aperture_offset = offset - aperture; diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index e485aa6161..5daa82e576 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -864,6 +864,25 @@ ComputeUnit::DataPort::handleResponse(PacketPtr pkt) // - kernel end // - non-kernel mem sync + // Non-kernel mem sync not from an instruction + if (!gpuDynInst) { + // If there is no dynamic instruction, a CU must be present. + ComputeUnit *cu = sender_state->computeUnit; + assert(cu != nullptr); + + if (pkt->req->isInvL2()) { + cu->shader->decNumOutstandingInvL2s(); + assert(cu->shader->getNumOutstandingInvL2s() >= 0); + } else { + panic("Unknown MemSyncResp not from an instruction"); + } + + // Cleanup and return, no other response events needed. 
+ delete pkt->senderState; + delete pkt; + return true; + } + // Kernel Launch // wavefront was nullptr when launching kernel, so it is meaningless // here (simdId=-1, wfSlotId=-1) @@ -1403,6 +1422,23 @@ ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst, } } +void +ComputeUnit::sendInvL2(Addr paddr) +{ + auto req = std::make_shared<Request>(paddr, 64, 0, vramRequestorId()); + req->setCacheCoherenceFlags(Request::GL2_CACHE_INV); + + auto pkt = new Packet(req, MemCmd::MemSyncReq); + pkt->pushSenderState( + new ComputeUnit::DataPort::SenderState(this, 0, nullptr)); + + EventFunctionWrapper *mem_req_event = memPort[0].createMemReqEvent(pkt); + + schedule(mem_req_event, curTick() + req_tick_latency); + + shader->incNumOutstandingInvL2s(); +} + void ComputeUnit::DataPort::processMemRespEvent(PacketPtr pkt) { @@ -1701,16 +1737,20 @@ ComputeUnit::DataPort::processMemReqEvent(PacketPtr pkt) } else if (!(sendTimingReq(pkt))) { retries.push_back(std::make_pair(pkt, gpuDynInst)); - DPRINTF(GPUPort, - "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n", - compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, - id, pkt->req->getPaddr()); + if (gpuDynInst) { + DPRINTF(GPUPort, + "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n", + compute_unit->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, id, pkt->req->getPaddr()); + } } else { - DPRINTF(GPUPort, - "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data " - "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId, - gpuDynInst->wfSlotId, gpuDynInst->seqNum(), id, - pkt->req->getPaddr()); + if (gpuDynInst) { + DPRINTF(GPUPort, + "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data" + " req sent!\n", compute_unit->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, gpuDynInst->seqNum(), id, + pkt->req->getPaddr()); + } } } diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh index cfa145551f..6cdc22ea57 100644 --- a/src/gpu-compute/compute_unit.hh +++ 
b/src/gpu-compute/compute_unit.hh @@ -474,6 +474,8 @@ class ComputeUnit : public ClockedObject void handleSQCReturn(PacketPtr pkt); + void sendInvL2(Addr paddr); + protected: RequestorID _requestorId; @@ -527,6 +529,7 @@ class ComputeUnit : public ClockedObject struct SenderState : public Packet::SenderState { + ComputeUnit *computeUnit = nullptr; GPUDynInstPtr _gpuDynInst; PortID port_index; Packet::SenderState *saved; @@ -536,6 +539,12 @@ class ComputeUnit : public ClockedObject : _gpuDynInst(gpuDynInst), port_index(_port_index), saved(sender_state) { } + + SenderState(ComputeUnit *cu, PortID _port_index, + Packet::SenderState *sender_state=nullptr) + : computeUnit(cu), + port_index(_port_index), + saved(sender_state) { } }; class SystemHubEvent : public Event diff --git a/src/gpu-compute/gpu_command_processor.cc b/src/gpu-compute/gpu_command_processor.cc index 5093cc4ff2..2af54a262e 100644 --- a/src/gpu-compute/gpu_command_processor.cc +++ b/src/gpu-compute/gpu_command_processor.cc @@ -41,6 +41,7 @@ #include "debug/GPUKernelInfo.hh" #include "dev/amdgpu/amdgpu_device.hh" #include "gpu-compute/dispatcher.hh" +#include "gpu-compute/shader.hh" #include "mem/abstract_mem.hh" #include "mem/packet_access.hh" #include "mem/se_translating_port_proxy.hh" @@ -126,6 +127,21 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id, unsigned akc_alignment_granularity = 64; assert(!(disp_pkt->kernel_object & (akc_alignment_granularity - 1))); + /** + * Make sure there is not a race condition with invalidates in the L2 + * cache. The full system driver may write directly to memory using + * large BAR while the L2 cache is allowed to keep data in the valid + * state between kernel launches. This is a rare event but is required + * for correctness. 
+ */ + if (shader()->getNumOutstandingInvL2s() > 0) { + DPRINTF(GPUCommandProc, + "Deferring kernel launch due to outstanding L2 invalidates\n"); + shader()->addDeferredDispatch(raw_pkt, queue_id, host_pkt_addr); + + return; + } + /** * Need to use a raw pointer for DmaVirtDevice API. This is deleted * in the dispatchKernelObject method. diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc index 6e3d556026..437d590b70 100644 --- a/src/gpu-compute/shader.cc +++ b/src/gpu-compute/shader.cc @@ -552,6 +552,29 @@ Shader::notifyCuSleep() { } } +void +Shader::decNumOutstandingInvL2s() +{ + num_outstanding_invl2s--; + + if (num_outstanding_invl2s == 0 && !deferred_dispatches.empty()) { + for (auto &dispatch : deferred_dispatches) { + gpuCmdProc.submitDispatchPkt(std::get<0>(dispatch), + std::get<1>(dispatch), + std::get<2>(dispatch)); + } + deferred_dispatches.clear(); + } +} + +void +Shader::addDeferredDispatch(void *raw_pkt, uint32_t queue_id, + Addr host_pkt_addr) +{ + deferred_dispatches.push_back( + std::make_tuple(raw_pkt, queue_id, host_pkt_addr)); +} + /** * Forward the VRAM requestor ID needed for device memory from CP. */ diff --git a/src/gpu-compute/shader.hh b/src/gpu-compute/shader.hh index 0287ddc169..c68f4d15b6 100644 --- a/src/gpu-compute/shader.hh +++ b/src/gpu-compute/shader.hh @@ -104,6 +104,11 @@ class Shader : public ClockedObject // Set to true by the dispatcher if the current kernel is a blit kernel bool blitKernel = false; + // Number of pending non-instruction invalidates outstanding. The shader + // should wait for these to be done to ensure correctness. 
+ int num_outstanding_invl2s = 0; + std::vector<std::tuple<void *, uint32_t, Addr>> deferred_dispatches; + public: typedef ShaderParams Params; enum hsail_mode_e {SIMT,VECTOR_SCALAR}; @@ -330,6 +335,13 @@ class Shader : public ClockedObject blitKernel = is_blit_kernel; } + void decNumOutstandingInvL2s(); + void incNumOutstandingInvL2s() { num_outstanding_invl2s++; }; + int getNumOutstandingInvL2s() const { return num_outstanding_invl2s; }; + + void addDeferredDispatch(void *raw_pkt, uint32_t queue_id, + Addr host_pkt_addr); + protected: struct ShaderStats : public statistics::Group { diff --git a/src/mem/request.hh b/src/mem/request.hh index 783e4212ab..80bd4c817a 100644 --- a/src/mem/request.hh +++ b/src/mem/request.hh @@ -1096,6 +1096,7 @@ class Request : public Extensible<Request> * setting extraFlags should be done via setCacheCoherenceFlags(). */ bool isInvL1() const { return _cacheCoherenceFlags.isSet(INV_L1); } + bool isInvL2() const { return _cacheCoherenceFlags.isSet(GL2_CACHE_INV); } bool isGL2CacheFlush() const diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm index 5812eef577..f6ac25be36 100644 --- a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm +++ b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm @@ -72,6 +72,7 @@ machine(MachineType:TCC, "TCC Cache") L2_Repl, desc="L2 Replacement"; // Probes PrbInv, desc="Invalidating probe"; + InvCache, desc="Invalidating probe from TCP"; // Coming from Memory Controller WBAck, desc="writethrough ack from memory"; Bypass, desc="Bypass the entire L2 cache"; @@ -413,6 +414,8 @@ machine(MachineType:TCC, "TCC Cache") } } else if (in_msg.Type == CoherenceRequestType:WriteFlush) { trigger(Event:Flush, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:InvCache) { + trigger(Event:InvCache, in_msg.addr, cache_entry, tbe); } else { DPRINTF(RubySlicc, "%s\n", in_msg); error("Unexpected Response Message to Core"); } @@ -429,6 +432,19 @@ machine(MachineType:TCC, "TCC Cache") unset_cache_entry(); } + 
action(ir_invL2Resp, "ir", desc="send L2 invalidate ack") { + peek(coreRequestNetwork_in, CPURequestMsg) { + enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:InvL2Resp; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.MessageSize := MessageSizeType:Response_Control; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + action(sd_sendData, "sd", desc="send Shared response") { peek(coreRequestNetwork_in, CPURequestMsg) { enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { @@ -1188,6 +1204,12 @@ machine(MachineType:TCC, "TCC Cache") i_invL2; } + transition({I, V}, InvCache, I) {TagArrayRead, TagArrayWrite} { + i_invL2; + ir_invL2Resp; + p_popRequestQueue; + } + transition({I, V}, PrbInv, I) {TagArrayRead, TagArrayWrite} { pi_sendProbeResponseInv; pp_popProbeQueue; diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm index 97997a12b5..1ad935324c 100644 --- a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm +++ b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm @@ -75,6 +75,8 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") Evict, desc="Evict if clean(invL1 for Load Acquire)"; // Mem sys initiated Repl, desc="Replacing block from cache"; + InvL2, desc="Invalidate to L2"; + InvL2Resp, desc="Invalidate L2 completed"; // TCC initiated TCC_Ack, desc="TCC Ack to Core Request"; @@ -286,9 +288,12 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") in_msg.Type == CoherenceResponseType:NBSysWBAck) { trigger(Event:TCC_AckWB, in_msg.addr, cache_entry, tbe); DPRINTF(RubySlicc, "Issuing TCC_AckWB\n"); - } else { - error("Unexpected Response Message to Core"); - } + } else if (in_msg.Type == CoherenceResponseType:InvL2Resp) { + DPRINTF(RubySlicc, "Issuing InvL2Resp\n"); + trigger(Event:InvL2Resp, in_msg.addr, cache_entry, tbe); + } else { + error("Unexpected Response Message to Core"); + } } } } @@ -333,6 +338,8 @@ 
machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") trigger(Event:Flush, in_msg.LineAddress, cache_entry, tbe); } else if (in_msg.Type == RubyRequestType:REPLACEMENT){ trigger(Event:Evict, in_msg.LineAddress, cache_entry, tbe); + } else if (in_msg.Type == RubyRequestType:InvL2){ + trigger(Event:InvL2, in_msg.LineAddress, cache_entry, tbe); } else { error("Unexpected Request Message from VIC"); } @@ -609,6 +616,31 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") } } + action(il2_invL2, "il2", desc="Invalidate address in L2") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:InvCache; + out_msg.Requestor := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := curCycle(); + peek(mandatoryQueue_in, RubyRequest) { + out_msg.isGLCSet := in_msg.isGLCSet; + out_msg.isSLCSet := in_msg.isSLCSet; + } + } + } + + action(i2r_invL2Resp, "i2r", desc="Invalidate L2 completed") { + if (use_seq_not_coal) { + DPRINTF(RubySlicc, "Sequencer does not define invTCCCallback!\n"); + assert(false); + } else { + coalescer.invTCCCallback(address); + } + } + action(wd_wtDone, "wd", desc="writethrough done") { if (use_seq_not_coal) { DPRINTF(RubySlicc, "Sequencer does not define writeCompleteCallback!\n"); @@ -830,6 +862,22 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") p_popMandatoryQueue; } + transition(I, InvL2) { + il2_invL2; + p_popMandatoryQueue; + } + + transition(V, InvL2, I) { + ic_invCache; + il2_invL2; + p_popMandatoryQueue; + } + + transition(I, InvL2Resp) { + i2r_invL2Resp; + pr_popResponseQueue; + } + // if a line is in IV and a TCC_AckWB comes back, we must have had a WT // store followed by a load. Thus, complete the store without affecting // TBE or line state. 
diff --git a/src/mem/ruby/protocol/GPU_VIPER-msg.sm b/src/mem/ruby/protocol/GPU_VIPER-msg.sm index 9074a86b52..106433f2c5 100644 --- a/src/mem/ruby/protocol/GPU_VIPER-msg.sm +++ b/src/mem/ruby/protocol/GPU_VIPER-msg.sm @@ -62,6 +62,7 @@ structure (VIPERCoalescer, external = "yes") { Cycles, Cycles, Cycles, bool); void atomicCallback(Addr, MachineType, DataBlock); void invTCPCallback(Addr); + void invTCCCallback(Addr); void writeCompleteCallback(Addr, uint64_t); void evictionCallback(Addr); } diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm b/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm index b860ff1681..cb5a8c3a95 100644 --- a/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm +++ b/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm @@ -46,6 +46,7 @@ enumeration(CoherenceRequestType, desc="Coherence Request Types") { WriteThroughFifo, desc="WriteThrough with no data"; WriteThroughDummy, desc="WriteThrough with no data for atomic operation"; WriteFlush, desc="Release Flush"; + InvCache, desc="Invalidate Cache"; WrCancel, desc="want to cancel WB to Memory"; // should this be here? 
@@ -95,6 +96,7 @@ enumeration(CoherenceResponseType, desc="Coherence Response Types") { StaleNotif, desc="Notification of Stale WBAck, No data to writeback"; CPUCancelWB, desc="want to cancel WB to Memory"; MemData, desc="Data from Memory"; + InvL2Resp, desc="Invalidate L2 response"; // for regions PrivateAck, desc="Ack that r-buf received private notify"; diff --git a/src/mem/ruby/protocol/RubySlicc_Exports.sm b/src/mem/ruby/protocol/RubySlicc_Exports.sm index ca44fd3780..5a7324cb72 100644 --- a/src/mem/ruby/protocol/RubySlicc_Exports.sm +++ b/src/mem/ruby/protocol/RubySlicc_Exports.sm @@ -181,6 +181,7 @@ enumeration(RubyRequestType, desc="...", default="RubyRequestType_NULL") { COMMIT, desc="Commit version"; NULL, desc="Invalid request type"; FLUSH, desc="Flush request type"; + InvL2, desc="Invalidate L2"; Release, desc="Release operation"; Acquire, desc="Acquire opertion"; AcquireRelease, desc="Acquire and Release opertion"; diff --git a/src/mem/ruby/system/GPUCoalescer.cc b/src/mem/ruby/system/GPUCoalescer.cc index 90d6031c6e..5ee4105597 100644 --- a/src/mem/ruby/system/GPUCoalescer.cc +++ b/src/mem/ruby/system/GPUCoalescer.cc @@ -669,14 +669,14 @@ GPUCoalescer::getRequestType(PacketPtr pkt) RequestStatus GPUCoalescer::makeRequest(PacketPtr pkt) { - // all packets must have valid instruction sequence numbers - assert(pkt->req->hasInstSeqNum()); - if (pkt->cmd == MemCmd::MemSyncReq) { // issue mem_sync requests immediately to the cache system without // going through uncoalescedTable like normal LD/ST/Atomic requests issueMemSyncRequest(pkt); } else { + // all packets must have valid instruction sequence numbers + assert(pkt->req->hasInstSeqNum()); + // otherwise, this must be either read or write command assert(pkt->isRead() || pkt->isWrite() || pkt->isFlush()); diff --git a/src/mem/ruby/system/VIPERCoalescer.cc b/src/mem/ruby/system/VIPERCoalescer.cc index 2adc41b578..47ceced3a7 100644 --- a/src/mem/ruby/system/VIPERCoalescer.cc +++ 
b/src/mem/ruby/system/VIPERCoalescer.cc @@ -80,6 +80,7 @@ VIPERCoalescer::makeRequest(PacketPtr pkt) // VIPER does not expect MemSyncReq & Release since compute unit // does not specify an equivalent type of memory request. assert((pkt->cmd == MemCmd::MemSyncReq && pkt->req->isInvL1()) || + (pkt->cmd == MemCmd::MemSyncReq && pkt->req->isInvL2()) || pkt->cmd == MemCmd::ReadReq || pkt->cmd == MemCmd::WriteReq || pkt->cmd == MemCmd::FlushReq || @@ -106,6 +107,10 @@ VIPERCoalescer::makeRequest(PacketPtr pkt) invTCP(); } + if (pkt->req->isInvL2()) { + invTCC(pkt); + } + return RequestStatus_Issued; } @@ -306,5 +311,51 @@ VIPERCoalescer::invTCP() m_num_pending_invs); } +void +VIPERCoalescer::invTCCCallback(Addr addr) +{ + for (auto& pkt : m_pending_invl2s[addr]) { + RubyPort::SenderState *ss = + safe_cast<RubyPort::SenderState *>(pkt->senderState); + MemResponsePort *port = ss->port; + assert(port != nullptr); + + // Now convert to MemSyncResp + pkt->makeResponse(); + + pkt->senderState = ss->predecessor; + delete ss; + port->hitCallback(pkt); + } + m_pending_invl2s.erase(addr); +} + +/* + * Send an invalidate to a specific address in the TCC. 
+ */ +void +VIPERCoalescer::invTCC(PacketPtr pkt) +{ + assert(pkt); + assert(pkt->req); + + Addr addr = pkt->req->getPaddr(); + RubyRequestType request_type = RubyRequestType_InvL2; + + std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>( + clockEdge(), addr, 0, 0, + request_type, RubyAccessMode_Supervisor, + nullptr); + + DPRINTF(GPUCoalescer, "Sending L2 invalidate to 0x%x\n", addr); + + assert(m_mandatory_q_ptr); + Tick latency = cyclesToTicks( + m_controller->mandatoryQueueLatency(request_type)); + m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency); + + m_pending_invl2s[addr].push_back(pkt); +} + } // namespace ruby } // namespace gem5 diff --git a/src/mem/ruby/system/VIPERCoalescer.hh b/src/mem/ruby/system/VIPERCoalescer.hh index c7e21e946b..3054fc399c 100644 --- a/src/mem/ruby/system/VIPERCoalescer.hh +++ b/src/mem/ruby/system/VIPERCoalescer.hh @@ -63,11 +63,13 @@ class VIPERCoalescer : public GPUCoalescer ~VIPERCoalescer(); void writeCompleteCallback(Addr address, uint64_t instSeqNum); void invTCPCallback(Addr address); + void invTCCCallback(Addr address); RequestStatus makeRequest(PacketPtr pkt) override; void issueRequest(CoalescedRequest* crequest) override; private: void invTCP(); + void invTCC(PacketPtr pkt); // make write-complete response packets from original write request packets void makeWriteCompletePkts(CoalescedRequest* crequest); @@ -79,6 +81,9 @@ class VIPERCoalescer : public GPUCoalescer // number of remaining cache lines to be invalidated in TCP int m_num_pending_invs; + // outstanding L2 invalidate packets + std::unordered_map<Addr, std::vector<PacketPtr>> m_pending_invl2s; + // a map of instruction sequence number and corresponding pending // write-complete response packets. Each write-complete response // corresponds to a pending store request that is waiting for