arch-vega,gpu-compute,mem-ruby: SQC Invalidation Support (#852)

This PR adds support for SQC (GPU I-cache) invalidation to the GPU
model. It does this by updating the GPU-VIPER-SQC protocol to support
flushes, the sequencer model to send out invalidates, and the GPU compute
model to send invalidates and handle responses. It also adds support for
S_ICACHE_INV, a VEGA ISA instruction that invalidates the entire GPU
I-cache. Additionally, the PR modifies the kernel start behavior to
invalidate the I-cache too. It previously invalidated only the L1
D-cache.
This commit is contained in:
Matt Sinclair
2024-02-09 17:29:56 -06:00
committed by GitHub
16 changed files with 308 additions and 10 deletions

View File

@@ -434,6 +434,7 @@ print(
# shader is the GPU
shader = Shader(
n_wf=args.wfs_per_simd,
cu_per_sqc=args.cu_per_sqc,
clk_domain=SrcClockDomain(
clock=args.gpu_clock,
voltage_domain=VoltageDomain(voltage=args.gpu_voltage),

View File

@@ -33,7 +33,10 @@ from m5.objects import *
def createGPU(system, args):
shader = Shader(
n_wf=args.wfs_per_simd, timing=True, clk_domain=system.clk_domain
n_wf=args.wfs_per_simd,
cu_per_sqc=args.cu_per_sqc,
timing=True,
clk_domain=system.clk_domain,
)
# VIPER GPU protocol implements release consistency at GPU side. So,

View File

@@ -669,6 +669,9 @@ namespace VegaISA
// Constructor: decode s_icache_inv and flag it as a memory-synchronizing
// instruction so the pipeline routes it as a fence rather than an
// ordinary scalar access.
Inst_SOPP__S_ICACHE_INV::Inst_SOPP__S_ICACHE_INV(InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_icache_inv")
{
setFlag(MemBarrier);
// NOTE(review): these three setFlag lines look like a collapsed diff
// (a removed line rendered next to its replacement) — confirm against
// the actual file which flag set is intended.
setFlag(GPUStaticInst::MemSync);
setFlag(MemSync);
} // Inst_SOPP__S_ICACHE_INV
Inst_SOPP__S_ICACHE_INV::~Inst_SOPP__S_ICACHE_INV()
@@ -683,7 +686,26 @@ namespace VegaISA
// Execute s_icache_inv: inject a scalar memory fence that invalidates the
// entire SQC (GPU instruction cache). The instruction carries no address
// operand; the request's address handling is deferred to the sequencer.
void
Inst_SOPP__S_ICACHE_INV::execute(GPUDynInstPtr gpuDynInst)
{
// NOTE(review): the next line appears to be the pre-change body retained
// by the diff renderer (the instruction used to be unimplemented) —
// confirm it is removed in the actual file.
panicUnimplemented();
Wavefront *wf = gpuDynInst->wavefront();
// No active lanes: undo the LGKM issued-instruction count bump done on
// issue and complete without sending anything.
if (gpuDynInst->exec_mask.none()) {
wf->decLGKMInstsIssued();
return;
}
gpuDynInst->execUnitId = wf->execUnitId;
gpuDynInst->latency.init(gpuDynInst->computeUnit());
gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
// Track exactly one outstanding sub-request for this fence.
gpuDynInst->resetEntireStatusVector();
gpuDynInst->setStatusVector(0, 1);
// Build an empty request (addr/size 0); the sequencer interprets a
// MemSyncReq with paddr 0 as a full I-cache invalidate.
RequestPtr req = std::make_shared<Request>(0, 0, 0,
gpuDynInst->computeUnit()->
requestorId(), 0,
gpuDynInst->wfDynId);
gpuDynInst->setRequestFlags(req);
// false => explicit S_ICACHE_INV, not a kernel-launch implicit sync.
gpuDynInst->computeUnit()->scalarMemoryPipe.
injectScalarMemFence(gpuDynInst, false, req);
} // execute
// --- Inst_SOPP__S_INCPERFLEVEL class methods ---

View File

@@ -294,6 +294,7 @@ class Shader(ClockedObject):
dispatcher = Param.GPUDispatcher("GPU workgroup dispatcher")
system_hub = Param.AMDGPUSystemHub(NULL, "GPU System Hub (FS Mode only)")
n_wf = Param.Int(10, "Number of wavefront slots per SIMD")
cu_per_sqc = Param.Int(4, "Number of CUs that share an SQC")
impl_kern_launch_acq = Param.Bool(
True,
"""Insert acq packet into

View File

@@ -397,9 +397,9 @@ ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
}
/**
* trigger invalidate operation in the cu
* trigger invalidate operation in the CU
*
* req: request initialized in shader, carrying the invlidate flags
* req: request initialized in shader, carrying the invalidate flags
*/
void
ComputeUnit::doInvalidate(RequestPtr req, int kernId){
@@ -425,6 +425,26 @@ ComputeUnit::doFlush(GPUDynInstPtr gpuDynInst) {
injectGlobalMemFence(gpuDynInst, true);
}
/**
 * Trigger an SQC (GPU I-cache) invalidate operation in the CU.
 *
 * req: request initialized in shader, carrying the invalidate flags
 * kernId: dispatch id stored on the dynamic instruction so invalidate
 *         responses can be matched back to the launching kernel.
 */
void
ComputeUnit::doSQCInvalidate(RequestPtr req, int kernId){
GPUDynInstPtr gpuDynInst
= std::make_shared<GPUDynInst>(this, nullptr,
new KernelLaunchStaticInst(), getAndIncSeqNum());
// kern_id will be used in inv responses
gpuDynInst->kern_id = kernId;
// update contextId field
req->setContext(gpuDynInst->wfDynId);
// Mark the synthetic instruction scalar so injectScalarMemFence's
// isScalar() assertion holds.
gpuDynInst->staticInstruction()->setFlag(GPUStaticInst::Scalar);
// true => kernel-launch initiated (implicit) invalidate.
scalarMemoryPipe.injectScalarMemFence(gpuDynInst, true, req);
}
// reseting SIMD register pools
// I couldn't think of any other place and
// I think it is needed in my implementation
@@ -1012,7 +1032,14 @@ ComputeUnit::DataPort::recvReqRetry()
// Handle a timing response on the SQC (I-cache) port. Fetch responses are
// forwarded to the CU; responses to kernel-start SQC invalidates carry no
// wavefront and are consumed here.
bool
ComputeUnit::SQCPort::recvTimingResp(PacketPtr pkt)
{
// NOTE(review): the next line appears to be the pre-change body retained
// by the diff renderer (the same call is repeated, guarded, below) —
// confirm it is removed in the actual file.
computeUnit->handleSQCReturn(pkt);
SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
/** Process the response only if there is a wavefront associated with it.
 * Otherwise, it is from an SQC invalidate that was issued at kernel start
 * and doesn't have a wavefront or instruction associated with it.
 */
if (sender_state->wavefront != nullptr) {
computeUnit->handleSQCReturn(pkt);
}
return true;
}
@@ -1046,6 +1073,26 @@ ComputeUnit::SQCPort::recvReqRetry()
}
}
// Human-readable name for this event type (used by event tracing/debug).
const char*
ComputeUnit::SQCPort::MemReqEvent::description() const
{
return "ComputeUnit SQC memory request event";
}
// Fire the deferred SQC request: try to send the packet on the SQC port;
// if the port is busy, queue it (with its originating wavefront, which is
// nullptr for kernel-launch invalidates) so recvReqRetry() can resend it.
void
ComputeUnit::SQCPort::MemReqEvent::process()
{
SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
[[maybe_unused]] ComputeUnit *compute_unit = sqcPort.computeUnit;
// System (host-directed) requests are not expected on the SQC path.
assert(!pkt->req->systemReq());
if (!(sqcPort.sendTimingReq(pkt))) {
sqcPort.retries.push_back(std::pair<PacketPtr, Wavefront*>
(pkt, sender_state->wavefront));
}
}
void
ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt)
{

View File

@@ -412,6 +412,7 @@ class ComputeUnit : public ClockedObject
void doInvalidate(RequestPtr req, int kernId);
void doFlush(GPUDynInstPtr gpuDynInst);
void doSQCInvalidate(RequestPtr req, int kernId);
void dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg);
bool hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg);
@@ -680,6 +681,23 @@ class ComputeUnit : public ClockedObject
kernId(_kernId){ }
};
// Event used to inject an SQC request (e.g. an I-cache invalidate fence)
// onto the SQC port after a modeled latency. AutoDelete: the event frees
// itself once processed.
class MemReqEvent : public Event
{
private:
SQCPort &sqcPort; // port the packet is sent on
PacketPtr pkt;    // packet to send when the event fires
public:
MemReqEvent(SQCPort &_sqc_port, PacketPtr _pkt)
: Event(), sqcPort(_sqc_port), pkt(_pkt)
{
setFlags(Event::AutoDelete);
}
void process();
const char *description() const;
};
std::deque<std::pair<PacketPtr, Wavefront*>> retries;
protected:

View File

@@ -320,7 +320,7 @@ FetchUnit::processFetchReturn(PacketPtr pkt)
assert(!fetchBuf.at(wavefront->wfSlotId).hasFetchDataToProcess());
wavefront->dropFetch = false;
} else {
fetchBuf.at(wavefront->wfSlotId).fetchDone(pkt->req->getVaddr());
fetchBuf.at(wavefront->wfSlotId).fetchDone(pkt);
}
wavefront->pendingFetch = false;
@@ -469,8 +469,23 @@ FetchUnit::FetchBufDesc::reserveBuf(Addr vaddr)
}
void
FetchUnit::FetchBufDesc::fetchDone(Addr vaddr)
FetchUnit::FetchBufDesc::fetchDone(PacketPtr pkt)
{
// If the return command is MemSyncResp, then it belongs to
// an SQC invalidation request. This request calls
// incLGKMInstsIssued() function in its execution path.
// Since there is no valid memory return response associated with
// this instruction, decLGKMInstsIssued() is not executed. Do this
// here to decrement the counter and invalidate all buffers
if (pkt->cmd == MemCmd::MemSyncResp) {
wavefront->decLGKMInstsIssued();
flushBuf();
restartFromBranch = false;
return;
}
Addr vaddr = pkt->req->getVaddr();
assert(bufferedPCs.find(vaddr) == bufferedPCs.end());
DPRINTF(GPUFetch, "WF[%d][%d]: Id%d done fetching for addr %#x\n",
wavefront->simdId, wavefront->wfSlotId,

View File

@@ -138,7 +138,7 @@ class FetchUnit
return is_reserved;
}
void fetchDone(Addr vaddr);
void fetchDone(PacketPtr ptr);
/**
* checks if the buffer contains valid data. this essentially

View File

@@ -160,4 +160,55 @@ ScalarMemPipeline::issueRequest(GPUDynInstPtr gpuDynInst)
issuedRequests.push(gpuDynInst);
}
/**
 * Inject a scalar memory fence (SQC invalidate) into the SQC port.
 *
 * gpuDynInst: the (possibly synthetic) scalar instruction driving the
 *             fence; must have the Scalar flag set.
 * kernelMemSync: true for the implicit invalidate at kernel launch,
 *                false for an explicit S_ICACHE_INV instruction.
 * req: optional pre-built request (from the shader); built here if null.
 */
void
ScalarMemPipeline::injectScalarMemFence(GPUDynInstPtr gpuDynInst,
                                        bool kernelMemSync,
                                        RequestPtr req)
{
    assert(gpuDynInst->isScalar());

    if (!req) {
        req = std::make_shared<Request>(
            0, 0, 0, computeUnit.requestorId(), 0, gpuDynInst->wfDynId);
    } else {
        req->requestorId(computeUnit.requestorId());
    }

    // When the SQC invalidate instruction is executed, it calls
    // injectScalarMemFence. The instruction does not contain an address
    // as one of its operands. Therefore, set the physical address of the
    // invalidation request to 0 and handle it in the sequencer.
    req->setPaddr(0);

    // If kernelMemSync is true, the invalidation request is from kernel
    // launch and is an implicit invalidation. If false, it is due to an
    // S_ICACHE_INV instruction. Only the request flags differ between
    // the two cases; the packet construction is common.
    if (kernelMemSync) {
        req->setCacheCoherenceFlags(Request::INV_L1);
        req->setFlags(Request::KERNEL);
    } else {
        gpuDynInst->setRequestFlags(req);
    }
    req->setReqInstSeqNum(gpuDynInst->seqNum());

    PacketPtr pkt = new Packet(req, MemCmd::MemSyncReq);
    // No wavefront-less sender (second arg nullptr): responses to these
    // fences are filtered in SQCPort::recvTimingResp.
    pkt->pushSenderState(
        new ComputeUnit::SQCPort::SenderState(
            gpuDynInst->wavefront(), nullptr));

    // Schedule the send after the modeled scalar request latency.
    ComputeUnit::SQCPort::MemReqEvent *sqc_event =
        new ComputeUnit::SQCPort::MemReqEvent(computeUnit.sqcPort, pkt);
    computeUnit.schedule(
        sqc_event, curTick() + computeUnit.scalar_req_tick_latency);
}
} // namespace gem5

View File

@@ -36,6 +36,7 @@
#include <string>
#include "gpu-compute/misc.hh"
#include "mem/request.hh"
#include "params/ComputeUnit.hh"
#include "sim/stats.hh"
@@ -67,6 +68,9 @@ class ScalarMemPipeline
void issueRequest(GPUDynInstPtr gpuDynInst);
void injectScalarMemFence(
GPUDynInstPtr gpuDynInst, bool kernelMemSync, RequestPtr req);
bool
isGMLdRespFIFOWrRdy() const
{

View File

@@ -64,6 +64,7 @@ Shader::Shader(const Params &p) : ClockedObject(p),
impl_kern_end_rel(p.impl_kern_end_rel),
coissue_return(1),
trace_vgpr_all(1), n_cu((p.CUs).size()), n_wf(p.n_wf),
n_cu_per_sqc(p.cu_per_sqc),
globalMemSize(p.globalmem),
nextSchedCu(0), sa_n(0), gpuCmdProc(*p.gpu_cmd_proc),
_dispatcher(*p.dispatcher), systemHub(p.system_hub),
@@ -221,6 +222,13 @@ Shader::prepareInvalidate(HSAQueueEntry *task) {
// all necessary INV flags are all set now, call cu to execute
cuList[i_cu]->doInvalidate(req, task->dispatchId());
// A set of CUs share a single SQC cache. Send a single invalidate
// request to each SQC
if ((i_cu % n_cu_per_sqc) == 0) {
cuList[i_cu]->doSQCInvalidate(req, task->dispatchId());
}
// I don't like this. This is intrusive coding.
cuList[i_cu]->resetRegisterPool();
}

View File

@@ -237,6 +237,8 @@ class Shader : public ClockedObject
int n_cu;
// Number of wavefront slots per SIMD per CU
int n_wf;
// Number of CUs sharing a single SQC (GPU I-cache) in the shader
int n_cu_per_sqc;
// The size of global memory
int globalMemSize;

View File

@@ -60,6 +60,7 @@ machine(MachineType:SQC, "GPU SQC (L1 I Cache)")
// Mem sys initiated
Repl, desc="Replacing block from cache";
Data, desc="Received Data";
Evict, desc="Evict cache line";
}
enumeration(RequestType, desc="To communicate stats from transitions to recordStats") {
@@ -67,6 +68,7 @@ machine(MachineType:SQC, "GPU SQC (L1 I Cache)")
DataArrayWrite, desc="Write the data array";
TagArrayRead, desc="Read the data array";
TagArrayWrite, desc="Write the data array";
TagArrayFlash, desc="Flash clear the data array";
}
@@ -242,7 +244,12 @@ machine(MachineType:SQC, "GPU SQC (L1 I Cache)")
peek(mandatoryQueue_in, RubyRequest, block_on="LineAddress") {
Entry cache_entry := getCacheEntry(in_msg.LineAddress);
TBE tbe := TBEs.lookup(in_msg.LineAddress);
trigger(Event:Fetch, in_msg.LineAddress, cache_entry, tbe);
DPRINTF(RubySlicc, "%s\n", in_msg);
if (in_msg.Type == RubyRequestType:REPLACEMENT) {
trigger(Event:Evict, in_msg.LineAddress, cache_entry, tbe);
} else {
trigger(Event:Fetch, in_msg.LineAddress, cache_entry, tbe);
}
}
}
}
@@ -313,6 +320,10 @@ machine(MachineType:SQC, "GPU SQC (L1 I Cache)")
APPEND_TRANSITION_COMMENT(cache_entry.DataBlk);
}
// Report one completed line invalidation back to the sequencer, which
// counts these acknowledgements down in invL1Callback().
action(inv_invDone, "inv", desc="local inv done") {
sequencer.invL1Callback();
}
action(w_writeCache, "w", desc="write data to cache") {
peek(responseToSQC_in, ResponseMsg) {
assert(is_valid(cache_entry));
@@ -350,6 +361,13 @@ machine(MachineType:SQC, "GPU SQC (L1 I Cache)")
ic_invCache;
}
// Evict request (from the sequencer's invalidation cache walk): drop the
// line regardless of its current state and acknowledge the sequencer.
transition({I, IV, V}, Evict, I) {TagArrayRead, TagArrayWrite} {
// since we're evicting something, don't bother classifying as hit/miss
ic_invCache;
inv_invDone;
p_popMandatoryQueue;
}
// if we got a response for a load where the line is in I, then
// another request must have come in that replaced the line in question in
// the cache. Thus, complete this request without allocating the line, but

View File

@@ -157,6 +157,9 @@ structure (Sequencer, external = "yes") {
void llscClearLocalMonitor();
void evictionCallback(Addr);
void invL1Callback();
void recordRequestType(SequencerRequestType);
bool checkResourceAvailable(CacheResourceType, Addr);
}

View File

@@ -85,6 +85,8 @@ Sequencer::Sequencer(const Params &p)
m_runningGarnetStandalone = p.garnet_standalone;
m_num_pending_invs = 0;
m_cache_inv_pkt = nullptr;
// These statistical variables are not for display.
// The profiler will collate these across different
@@ -348,6 +350,15 @@ Sequencer::insertRequest(PacketPtr pkt, RubyRequestType primary_type,
return RequestStatus_Ready;
}
// If command is MemSyncReq, it is used to invalidate the cache.
// As the cache invalidation requests are already issued in invL1(),
// there is no need to create a new request for the same here.
// Instead, return RequestStatus_Aliased, and make the sequencer skip
// an extra issueRequest
if (pkt->cmd == MemCmd::MemSyncReq) {
return RequestStatus_Aliased;
}
Addr line_addr = makeLineAddress(pkt->getAddr());
// Check if there is any outstanding request for the same cache line.
auto &seq_req_list = m_RequestTable[line_addr];
@@ -576,7 +587,8 @@ Sequencer::readCallback(Addr address, DataBlock& data,
}
if ((seq_req.m_type != RubyRequestType_LD) &&
(seq_req.m_type != RubyRequestType_Load_Linked) &&
(seq_req.m_type != RubyRequestType_IFETCH)) {
(seq_req.m_type != RubyRequestType_IFETCH) &&
(seq_req.m_type != RubyRequestType_REPLACEMENT)) {
// Write request: reissue request to the cache hierarchy
issueRequest(seq_req.pkt, seq_req.m_second_type);
break;
@@ -811,6 +823,86 @@ Sequencer::unaddressedCallback(Addr unaddressedReqId,
}
}
// Complete a batch of packets without a memory-system response: unwind
// the RubyPort sender state, hand each packet back to its response port,
// and let waiting ports retry. During Ruby warmup/cooldown the packets
// come from the cache recorder, which is advanced to its next request
// instead of sending responses.
void
Sequencer::completeHitCallback(std::vector<PacketPtr> & mylist)
{
for (auto& pkt : mylist) {
// When Ruby is in warmup or cooldown phase, the requests come
// from the cache recorder. They do not track which port to use
// and do not need to send the response back
if (!RubySystem::getWarmupEnabled()
&& !RubySystem::getCooldownEnabled()) {
RubyPort::SenderState *ss =
safe_cast<RubyPort::SenderState *>(pkt->senderState);
MemResponsePort *port = ss->port;
assert(port != NULL);
pkt->senderState = ss->predecessor;
if (pkt->cmd != MemCmd::WriteReq) {
// for WriteReq, we keep the original senderState until
// writeCompleteCallback
delete ss;
}
port->hitCallback(pkt);
trySendRetries();
}
}
RubySystem *rs = m_ruby_system;
if (RubySystem::getWarmupEnabled()) {
rs->m_cache_recorder->enqueueNextFetchRequest();
} else if (RubySystem::getCooldownEnabled()) {
rs->m_cache_recorder->enqueueNextFlushRequest();
} else {
testDrainComplete();
}
}
void
Sequencer::invL1Callback()
{
// Since L1 invalidate is currently done with paddr = 0
assert(m_cache_inv_pkt && m_num_pending_invs > 0);
m_num_pending_invs--;
if (m_num_pending_invs == 0) {
std::vector<PacketPtr> pkt_list { m_cache_inv_pkt };
m_cache_inv_pkt = nullptr;
completeHitCallback(pkt_list);
}
}
/**
 * Walk every block in m_dataCache_ptr and enqueue a REPLACEMENT request
 * for it on the mandatory queue, counting one pending invalidation per
 * block. The controller acknowledges each line via invL1Callback().
 * NOTE(review): for an SQC sequencer this presumably walks the I-cache
 * bound to m_dataCache_ptr — confirm against the configuration.
 */
void
Sequencer::invL1()
{
int size = m_dataCache_ptr->getNumBlocks();
DPRINTF(RubySequencer,
"There are %d Invalidations outstanding before Cache Walk\n",
m_num_pending_invs);
// Walk the cache
for (int i = 0; i < size; i++) {
Addr addr = m_dataCache_ptr->getAddressAtIdx(i);
// Evict Read-only data
RubyRequestType request_type = RubyRequestType_REPLACEMENT;
std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>(
clockEdge(), addr, 0, 0,
request_type, RubyAccessMode_Supervisor,
nullptr);
DPRINTF(RubySequencer, "Evicting addr 0x%x\n", addr);
assert(m_mandatory_q_ptr != NULL);
Tick latency = cyclesToTicks(
m_controller->mandatoryQueueLatency(request_type));
m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);
m_num_pending_invs++;
}
DPRINTF(RubySequencer,
"There are %d Invalidations outstanding after Cache Walk\n",
m_num_pending_invs);
}
bool
Sequencer::empty() const
{
@@ -915,6 +1007,11 @@ Sequencer::makeRequest(PacketPtr pkt)
}
} else if (pkt->isFlush()) {
primary_type = secondary_type = RubyRequestType_FLUSH;
} else if (pkt->cmd == MemCmd::MemSyncReq) {
primary_type = secondary_type = RubyRequestType_REPLACEMENT;
assert(!m_cache_inv_pkt);
m_cache_inv_pkt = pkt;
invL1();
} else {
panic("Unsupported ruby packet type\n");
}

View File

@@ -141,6 +141,10 @@ class Sequencer : public RubyPort
const Cycles forwardRequestTime = Cycles(0),
const Cycles firstResponseTime = Cycles(0));
void completeHitCallback(std::vector<PacketPtr>& list);
void invL1Callback();
void invL1();
RequestStatus makeRequest(PacketPtr pkt) override;
virtual bool empty() const;
int outstandingCount() const override { return m_outstanding_count; }
@@ -243,6 +247,10 @@ class Sequencer : public RubyPort
private:
int m_max_outstanding_requests;
int m_num_pending_invs;
PacketPtr m_cache_inv_pkt;
CacheMemory* m_dataCache_ptr;
// The cache access latency for top-level caches (L0/L1). These are