diff --git a/src/mem/ruby/protocol/GPU_VIPER-SQC.sm b/src/mem/ruby/protocol/GPU_VIPER-SQC.sm index 000cf5b1e6..bdc5d73f20 100644 --- a/src/mem/ruby/protocol/GPU_VIPER-SQC.sm +++ b/src/mem/ruby/protocol/GPU_VIPER-SQC.sm @@ -48,6 +48,9 @@ machine(MachineType:SQC, "GPU SQC (L1 I Cache)") { state_declaration(State, desc="SQC Cache States", default="SQC_State_I") { I, AccessPermission:Invalid, desc="Invalid"; + // Note: currently IV in the TCP is only for pending loads to a given cache + // line. Since the SQC is read only, there are no stores. + IV, AccessPermission:Invalid, desc="Going from I to V, waiting on TCC data"; V, AccessPermission:Read_Only, desc="Valid"; } @@ -98,6 +101,7 @@ machine(MachineType:SQC, "GPU SQC (L1 I Cache)") void unset_tbe(); void wakeUpAllBuffers(); void wakeUpBuffers(Addr a); + void wakeUpAllBuffers(Addr a); Cycles curCycle(); // Internal functions @@ -270,6 +274,21 @@ machine(MachineType:SQC, "GPU SQC (L1 I Cache)") } } + action(t_allocateTBE, "t", desc="allocate TBE Entry") { + check_allocate(TBEs); + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + } + + action(d_deallocateTBE, "d", desc="Deallocate TBE") { + TBEs.deallocate(address); + unset_tbe(); + } + + action(st_stallAndWaitRequest, "st", desc="Stall and wait on the address") { + stall_and_wait(mandatoryQueue_in, address); + } + action(p_popMandatoryQueue, "pm", desc="Pop Mandatory Queue") { mandatoryQueue_in.dequeue(clockEdge()); } @@ -278,6 +297,10 @@ machine(MachineType:SQC, "GPU SQC (L1 I Cache)") responseToSQC_in.dequeue(clockEdge()); } + action(wada_wakeUpAllDependentsAddr, "wada", desc="Wake up any requests waiting for this address") { + wakeUpAllBuffers(address); + } + action(l_loadDoneHit, "ldh", desc="local load done (hits in SQC)") { assert(is_valid(cache_entry)); sequencer.readCallback(address, cache_entry.DataBlk, true, MachineType:L1Cache); @@ -313,22 +336,52 @@ machine(MachineType:SQC, "GPU SQC (L1 I Cache)") // Transitions + // if another request arrives for 
the same cache line that has a pending + load, put it on the wakeup buffer. This reduces resource contention since + they won't try again every cycle and will instead only try again once woken + up + transition(IV, {Fetch}) { + st_stallAndWaitRequest; + } + // transitions from base - transition({I, V}, Repl, I) {TagArrayRead, TagArrayWrite} { + transition({I, IV, V}, Repl, I) {TagArrayRead, TagArrayWrite} { // since we're evicting something, don't bother classifying as hit/miss ic_invCache; } - transition(I, Data, V) {TagArrayRead, TagArrayWrite, DataArrayRead} { + // if we got a response for a load where the line is in I, then + // another request must have come in that replaced the line in question in + // the cache. Thus, complete this request without allocating the line, but + // still deallocate TBE and wakeup any dependent addresses. + transition(I, Data) {TagArrayRead, TagArrayWrite, DataArrayRead} { + // don't profile this as a hit/miss since it's a response from L2, + // so we already counted it + l_loadDoneMiss; + wada_wakeUpAllDependentsAddr; + d_deallocateTBE; + pr_popResponseQueue; + } + + // if line is currently in IV, then Data is returning the data for a + // pending load, so transition to V, deallocate TBE, and wakeup any dependent + // requests so they will be replayed now that this request has returned. 
+ transition(IV, Data, V) {TagArrayRead, TagArrayWrite, DataArrayRead} { a_allocate; // don't profile this as a hit/miss since it's a response from L2, // so we already counted it w_writeCache; l_loadDoneMiss; + wada_wakeUpAllDependentsAddr; + d_deallocateTBE; pr_popResponseQueue; } - transition(I, Fetch) {TagArrayRead, TagArrayWrite} { + // if we have a load that misses, allocate TBE entry and transition to IV + // to prevent subsequent requests to same cache line from also going to TCC + // while this request is pending + transition(I, Fetch, IV) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; nS_issueRdBlkS; uu_profileDataMiss; // since line wasn't in SQC, we missed p_popMandatoryQueue; diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm index 7cb3a00e26..14c9c8c1cc 100644 --- a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm +++ b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm @@ -61,6 +61,7 @@ machine(MachineType:TCC, "TCC Cache") WrVicBlk, desc="L1 Write Through"; WrVicBlkBack, desc="L1 Write Through(dirty cache)"; WrVicBlkEvict, desc="L1 Write Through(dirty cache) and evict"; + AtomicWait, desc="Atomic Op that must wait for pending loads"; Atomic, desc="Atomic Op"; AtomicPassOn, desc="Atomic Op Passed on to Directory"; AtomicDone, desc="AtomicOps Complete"; @@ -113,6 +114,7 @@ machine(MachineType:TCC, "TCC Cache") bool Shared, desc="Victim hit by shared probe"; MachineID From, desc="Waiting for writeback from..."; NetDest Destination, desc="Data destination"; + int numPending, desc="num pending requests"; int numPendingDirectoryAtomics, desc="number of pending atomics to be performed in directory"; int atomicDoneCnt, desc="number AtomicDones triggered"; bool isGLCSet, desc="Bypass L1 Cache"; @@ -293,11 +295,14 @@ machine(MachineType:TCC, "TCC Cache") peek(responseFromNB_in, ResponseMsg, block_on="addr") { TBE tbe := TBEs.lookup(in_msg.addr); Entry cache_entry := getCacheEntry(in_msg.addr); - bool is_slc_set := false; - - if 
(!is_invalid(tbe)) { - is_slc_set := tbe.isSLCSet; - } + /* + MOESI_AMD_Base-dir acts as the directory, and it always passes + SLC information back to L2 because of races at L2 with requests + from different CUs sending requests to same cache line in parallel. + If these requests have different GLC/SLC settings, the L2 TBE may + not have the correct GLC/SLC information for a given request. + */ + bool is_slc_set := in_msg.isSLCSet; // Whether the SLC bit is set or not, WB acks should invoke the // WBAck event. For cases where a read response will follow a @@ -372,16 +377,29 @@ machine(MachineType:TCC, "TCC Cache") } else if (in_msg.Type == CoherenceRequestType:Atomic || in_msg.Type == CoherenceRequestType:AtomicReturn || in_msg.Type == CoherenceRequestType:AtomicNoReturn) { - // If the request is system-level, if the address isn't in the cache, - // or if this cache is write-through, then send the request to the - // directory. Since non-SLC atomics won't be performed by the directory, - // TCC will perform the atomic on the return path on Event:Data. - // The action will invalidate the cache line if SLC is set and the address is - // in the cache. - if(in_msg.isSLCSet || !WB) { - trigger(Event:AtomicPassOn, in_msg.addr, cache_entry, tbe); + /* + If there are pending requests for this line already and those + requests are not atomics, because we can't easily differentiate + between different request types on return and because decrementing + the atomic count assumes all returned requests in the A state are + atomics, we will need to put this atomic to sleep and wake it up + when the loads return. 
+ */ + if (is_valid(tbe) && (tbe.numPending > 0) && + (tbe.numPendingDirectoryAtomics == 0)) { + trigger(Event:AtomicWait, in_msg.addr, cache_entry, tbe); } else { - trigger(Event:Atomic, in_msg.addr, cache_entry, tbe); + // If the request is system-level, if the address isn't in the cache, + // or if this cache is write-through, then send the request to the + // directory. Since non-SLC atomics won't be performed by the directory, + // TCC will perform the atomic on the return path on Event:Data. + // The action will invalidate the cache line if SLC is set and the address is + // in the cache. + if(in_msg.isSLCSet || !WB) { + trigger(Event:AtomicPassOn, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:Atomic, in_msg.addr, cache_entry, tbe); + } } } else if (in_msg.Type == CoherenceRequestType:RdBlk) { if (in_msg.isSLCSet) { @@ -433,24 +451,35 @@ machine(MachineType:TCC, "TCC Cache") out_msg.addr := address; out_msg.Type := CoherenceResponseType:TDSysResp; out_msg.Sender := machineID; - out_msg.Destination := tbe.Destination; - out_msg.DataBlk := cache_entry.DataBlk; out_msg.MessageSize := MessageSizeType:Response_Data; out_msg.Dirty := false; out_msg.State := CoherenceState:Shared; - DPRINTF(RubySlicc, "%s\n", out_msg); peek(responseFromNB_in, ResponseMsg) { - out_msg.isGLCSet := tbe.isGLCSet; - out_msg.isSLCSet := tbe.isSLCSet; + // if line state is Invalid, then we must be doing the transition(I, Data) + // so use the DataBlk from the incoming message + if ((getAccessPermission(address) == AccessPermission:NotPresent) || + (getAccessPermission(address) == AccessPermission:Invalid)) { + out_msg.DataBlk := in_msg.DataBlk; + } else { + out_msg.DataBlk := cache_entry.DataBlk; + } + out_msg.isGLCSet := in_msg.isGLCSet; + out_msg.isSLCSet := in_msg.isSLCSet; + // reuse CURequestor field to allow multiple concurrent loads and + // track where they should go back to (since TBE can't distinguish + // destinations) + out_msg.Destination.clear(); + 
out_msg.Destination.add(in_msg.CURequestor); } + DPRINTF(RubySlicc, "%s\n", out_msg); } enqueue(unblockToNB_out, UnblockMsg, 1) { out_msg.addr := address; out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory)); out_msg.MessageSize := MessageSizeType:Unblock_Control; peek(responseFromNB_in, ResponseMsg) { - out_msg.isGLCSet := tbe.isGLCSet; - out_msg.isSLCSet := tbe.isSLCSet; + out_msg.isGLCSet := in_msg.isGLCSet; + out_msg.isSLCSet := in_msg.isSLCSet; } DPRINTF(RubySlicc, "%s\n", out_msg); } @@ -462,13 +491,17 @@ machine(MachineType:TCC, "TCC Cache") out_msg.addr := address; out_msg.Type := CoherenceResponseType:TDSysResp; out_msg.Sender := machineID; - out_msg.Destination := tbe.Destination; + // reuse CURequestor field to allow multiple concurrent loads and + // track where they should go back to (since TBE can't distinguish + // destinations) + out_msg.Destination.clear(); + out_msg.Destination.add(in_msg.CURequestor); out_msg.DataBlk := in_msg.DataBlk; out_msg.MessageSize := MessageSizeType:Response_Data; out_msg.Dirty := false; out_msg.State := CoherenceState:Shared; - out_msg.isGLCSet := tbe.isGLCSet; - out_msg.isSLCSet := tbe.isSLCSet; + out_msg.isGLCSet := in_msg.isGLCSet; + out_msg.isSLCSet := in_msg.isSLCSet; DPRINTF(RubySlicc, "%s\n", out_msg); } enqueue(unblockToNB_out, UnblockMsg, 1) { @@ -481,19 +514,25 @@ machine(MachineType:TCC, "TCC Cache") } action(rd_requestData, "r", desc="Miss in L2, pass on") { - if(tbe.Destination.count()==1){ - peek(coreRequestNetwork_in, CPURequestMsg) { - enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) { - out_msg.addr := address; - out_msg.Type := in_msg.Type; - out_msg.Requestor := machineID; - out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory)); - out_msg.Shared := false; // unneeded for this request - out_msg.MessageSize := in_msg.MessageSize; - out_msg.isGLCSet := tbe.isGLCSet; - out_msg.isSLCSet := tbe.isSLCSet; - DPRINTF(RubySlicc, "%s\n", out_msg); - } + 
peek(coreRequestNetwork_in, CPURequestMsg) { + DPRINTF(RubySlicc, "in_msg: %s\n", in_msg); + enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Type := in_msg.Type; + out_msg.Requestor := machineID; + /* + To allow multiple concurrent requests from different CUs, we pass + the origin information along to the directory, which stores it in its + TBE as appropriate before passing it back to the TCC on the return + path. + */ + out_msg.CURequestor := in_msg.Requestor; + out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory)); + out_msg.Shared := false; // unneeded for this request + out_msg.MessageSize := in_msg.MessageSize; + out_msg.isGLCSet := in_msg.isGLCSet; + out_msg.isSLCSet := in_msg.isSLCSet; + DPRINTF(RubySlicc, "out_msg: %s\n", out_msg); } } } @@ -504,7 +543,7 @@ machine(MachineType:TCC, "TCC Cache") out_msg.addr := address; out_msg.Type := CoherenceResponseType:TDSysWBAck; out_msg.Destination.clear(); - out_msg.Destination.add(in_msg.WTRequestor); + out_msg.Destination.add(in_msg.CURequestor); out_msg.Sender := machineID; out_msg.MessageSize := MessageSizeType:Writeback_Control; out_msg.instSeqNum := in_msg.instSeqNum; @@ -562,7 +601,7 @@ machine(MachineType:TCC, "TCC Cache") enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { out_msg.addr := address; out_msg.Type := CoherenceResponseType:TDSysResp; - out_msg.Destination.add(in_msg.WTRequestor); + out_msg.Destination.add(in_msg.CURequestor); out_msg.Sender := machineID; out_msg.MessageSize := in_msg.MessageSize; out_msg.DataBlk := cache_entry.DataBlk; @@ -578,12 +617,12 @@ machine(MachineType:TCC, "TCC Cache") enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { out_msg.addr := address; out_msg.Type := CoherenceResponseType:TDSysResp; - out_msg.Destination.add(in_msg.WTRequestor); + out_msg.Destination.add(in_msg.CURequestor); out_msg.Sender := machineID; out_msg.MessageSize := in_msg.MessageSize; out_msg.DataBlk := 
in_msg.DataBlk; - out_msg.isGLCSet := tbe.isGLCSet; - out_msg.isSLCSet := tbe.isSLCSet; + out_msg.isGLCSet := in_msg.isGLCSet; + out_msg.isSLCSet := in_msg.isSLCSet; } } } @@ -611,7 +650,10 @@ machine(MachineType:TCC, "TCC Cache") tbe.Destination.clear(); tbe.numPendingDirectoryAtomics := 0; tbe.atomicDoneCnt := 0; + tbe.numPending := 0; } + // each pending requests increments this count by 1 + tbe.numPending := tbe.numPending + 1; if (coreRequestNetwork_in.isReady(clockEdge())) { peek(coreRequestNetwork_in, CPURequestMsg) { if(in_msg.Type == CoherenceRequestType:RdBlk || @@ -620,6 +662,16 @@ machine(MachineType:TCC, "TCC Cache") in_msg.Type == CoherenceRequestType:AtomicNoReturn){ tbe.Destination.add(in_msg.Requestor); } + /* + If there are multiple concurrent requests to the same cache line, each + one will overwrite the previous ones GLC/SLC information here. + If these requests have different GLC/SLC information, this causes + a segfault. Hence, currently the support relies on the directory to + pass back the GLC/SLC information instead of relying on L2 TBE to be + correct. + + This message is left here as an FYI for future developers. 
+ */ tbe.isGLCSet := in_msg.isGLCSet; tbe.isSLCSet := in_msg.isSLCSet; if(in_msg.Type == CoherenceRequestType:Atomic || @@ -633,9 +685,14 @@ machine(MachineType:TCC, "TCC Cache") } action(dt_deallocateTBE, "dt", desc="Deallocate TBE entry") { - tbe.Destination.clear(); - TBEs.deallocate(address); - unset_tbe(); + // since we may have multiple destinations, can't deallocate if we aren't + // last one + tbe.numPending := tbe.numPending - 1; + if (tbe.numPending == 0) { + tbe.Destination.clear(); + TBEs.deallocate(address); + unset_tbe(); + } } action(wcb_writeCacheBlock, "wcb", desc="write data to TCC") { @@ -672,7 +729,7 @@ machine(MachineType:TCC, "TCC Cache") enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) { out_msg.addr := address; out_msg.Requestor := machineID; - out_msg.WTRequestor := in_msg.Requestor; + out_msg.CURequestor := in_msg.Requestor; out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory)); out_msg.MessageSize := MessageSizeType:Data; out_msg.Type := CoherenceRequestType:WriteThrough; @@ -680,6 +737,8 @@ machine(MachineType:TCC, "TCC Cache") out_msg.DataBlk := in_msg.DataBlk; out_msg.writeMask.orMask(in_msg.writeMask); out_msg.instSeqNum := in_msg.instSeqNum; + out_msg.isGLCSet := in_msg.isGLCSet; + out_msg.isSLCSet := in_msg.isSLCSet; } } } @@ -688,7 +747,7 @@ machine(MachineType:TCC, "TCC Cache") enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) { out_msg.addr := address; out_msg.Requestor := machineID; - out_msg.WTRequestor := machineID; + out_msg.CURequestor := machineID; out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory)); out_msg.MessageSize := MessageSizeType:Data; out_msg.Type := CoherenceRequestType:WriteThrough; @@ -703,13 +762,15 @@ machine(MachineType:TCC, "TCC Cache") enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) { out_msg.addr := address; out_msg.Requestor := machineID; - out_msg.WTRequestor := in_msg.Requestor; + out_msg.CURequestor := in_msg.Requestor; 
out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory)); out_msg.MessageSize := MessageSizeType:Data; out_msg.Type := CoherenceRequestType:WriteFlush; out_msg.Dirty := true; out_msg.DataBlk := cache_entry.DataBlk; out_msg.writeMask.orMask(cache_entry.writeMask); + out_msg.isGLCSet := in_msg.isGLCSet; + out_msg.isSLCSet := in_msg.isSLCSet; } } } @@ -719,7 +780,7 @@ machine(MachineType:TCC, "TCC Cache") enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) { out_msg.addr := address; out_msg.Requestor := machineID; - out_msg.WTRequestor := in_msg.Requestor; + out_msg.CURequestor := in_msg.Requestor; out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory)); out_msg.MessageSize := MessageSizeType:Data; out_msg.Type := in_msg.Type; @@ -768,9 +829,17 @@ machine(MachineType:TCC, "TCC Cache") wakeUpAllBuffers(address); } + /* + Currently z_stall is unused because it can lead to Protocol Stalls that + eventually lead to deadlock. Instead, it is recommended to use + st_stallAndWaitRequest in combination with a wakeupBuffer call (e.g., + wada_wakeUpAllDependentsAddr) to put the pending requests to sleep instead of + them causing head of line blocking -- wada_wakeUpAllDependentsAddr should wake + the request up once the request preventing it from completing is done. action(z_stall, "z", desc="stall") { // built-in } + */ action(inpa_incrementNumPendingDirectoryAtomics, "inpa", desc="inc num atomics") { @@ -792,8 +861,8 @@ machine(MachineType:TCC, "TCC Cache") out_msg.addr := address; out_msg.Type := TriggerType:AtomicDone; peek(responseFromNB_in, ResponseMsg) { - out_msg.isGLCSet := tbe.isGLCSet; - out_msg.isSLCSet := tbe.isSLCSet; + out_msg.isGLCSet := in_msg.isGLCSet; + out_msg.isSLCSet := in_msg.isSLCSet; } } } @@ -832,31 +901,53 @@ machine(MachineType:TCC, "TCC Cache") // they can cause a resource stall deadlock! 
transition(WI, {RdBlk, WrVicBlk, Atomic, AtomicPassOn, WrVicBlkBack}) { //TagArrayRead} { - // by putting the stalled requests in a buffer, we reduce resource contention - // since they won't try again every cycle and will instead only try again once - // woken up + // don't profile as hit or miss since it will be tried again + /* + By putting the stalled requests in a buffer, we reduce resource contention + since they won't try again every cycle and will instead only try again once + woken up. + */ st_stallAndWaitRequest; } transition(WIB, {RdBlk, WrVicBlk, Atomic, WrVicBlkBack}) { //TagArrayRead} { - // by putting the stalled requests in a buffer, we reduce resource contention - // since they won't try again every cycle and will instead only try again once - // woken up + // don't profile as hit or miss since it will be tried again + /* + By putting the stalled requests in a buffer, we reduce resource contention + since they won't try again every cycle and will instead only try again once + woken up. + */ st_stallAndWaitRequest; } transition(A, {RdBlk, WrVicBlk, WrVicBlkBack}) { //TagArrayRead} { - // by putting the stalled requests in a buffer, we reduce resource contention - // since they won't try again every cycle and will instead only try again once - // woken up + // don't profile as hit or miss since it will be tried again + /* + By putting the stalled requests in a buffer, we reduce resource contention + since they won't try again every cycle and will instead only try again once + woken up. 
+ */ st_stallAndWaitRequest; } transition(IV, {WrVicBlk, Atomic, AtomicPassOn, WrVicBlkBack}) { //TagArrayRead} { - // by putting the stalled requests in a buffer, we reduce resource contention - // since they won't try again every cycle and will instead only try again once - // woken up + // don't profile as hit or miss since it will be tried again + /* + By putting the stalled requests in a buffer, we reduce resource contention + since they won't try again every cycle and will instead only try again once + woken up. + */ st_stallAndWaitRequest; } + transition({I, IV, V}, AtomicWait) { + // don't profile as hit or miss since it will be tried again + /* + By putting the stalled requests in a buffer, we reduce resource contention + since they won't try again every cycle and will instead only try again once + woken up. + */ + st_stallAndWaitRequest; + } + transition({M, V}, RdBlk) {TagArrayRead, DataArrayRead} { p_profileHit; sd_sendData; @@ -865,12 +956,15 @@ machine(MachineType:TCC, "TCC Cache") } transition(W, RdBlk, WI) {TagArrayRead, DataArrayRead} { + // don't profile as hit or miss since it will be tried again t_allocateTBE; wb_writeBack; - // need to try this request again after writing back the current entry -- to - // do so, put it with other stalled requests in a buffer to reduce resource - // contention since they won't try again every cycle and will instead only - // try again once woken up + /* + Need to try this request again after writing back the current entry -- to + do so, put it with other stalled requests in a buffer to reduce resource + contention since they won't try again every cycle and will instead only + try again once woken up. + */ st_stallAndWaitRequest; } @@ -933,6 +1027,7 @@ machine(MachineType:TCC, "TCC Cache") // Transition to be called when a read request with SLC flag arrives at entry // in transient state. The request stalls until the pending transition is complete. 
transition({WI, WIB, IV}, RdBypassEvict) { + // don't profile as hit or miss since it will be tried again st_stallAndWaitRequest; } @@ -945,8 +1040,8 @@ machine(MachineType:TCC, "TCC Cache") p_popRequestQueue; } - transition(A, Atomic) { - p_profileMiss; + transition(A, {Atomic, AtomicWait}) { + // don't profile as hit or miss since it will be tried again // by putting the stalled requests in a buffer, we reduce resource contention // since they won't try again every cycle and will instead only try again once // woken up @@ -993,7 +1088,7 @@ machine(MachineType:TCC, "TCC Cache") } transition(A, AtomicPassOn) { - p_profileMiss; + // don't profile as hit or miss since it will be tried again // by putting the stalled requests in a buffer, we reduce resource contention // since they won't try again every cycle and will instead only try again once // woken up @@ -1136,9 +1231,41 @@ machine(MachineType:TCC, "TCC Cache") ut_updateTag; wcb_writeCacheBlock; sdr_sendDataResponse; - pr_popResponseQueue; wada_wakeUpAllDependentsAddr; dt_deallocateTBE; + pr_popResponseQueue; + } + + /* + Since the L2 now allows multiple loads from different CUs to proceed in + parallel to the directory, we may get Event:Data back when the line is + already in V. In this case, send the response to the appropriate TCP + and update MRU/data in TCC, but don't need to allocate line. + */ + transition(V, Data) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + ut_updateTag; + wcb_writeCacheBlock; + sdr_sendDataResponse; + wada_wakeUpAllDependentsAddr; + // tracks # pending requests, so need to decrement here too + dt_deallocateTBE; + pr_popResponseQueue; + } + + /* + Since the L2 now allows multiple loads from different CUs to proceed in + parallel to the directory, we may get Event:Data back when the line is + now in I because it has been evicted by an intervening request to the same + set index. 
In this case, send the response to the appropriate TCP without + affecting the TCC (essentially, treat it similar to a bypass request except + we also send the unblock back to the directory). + */ + transition(I, Data) { + sdr_sendDataResponse; + wada_wakeUpAllDependentsAddr; + // tracks # pending requests, so need to decrement here too + dt_deallocateTBE; + pr_popResponseQueue; } transition(A, Data, M) {TagArrayRead, TagArrayWrite, DataArrayWrite, AtomicALUOperation} { diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm index ae35d4c5f7..97997a12b5 100644 --- a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm +++ b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm @@ -1,5 +1,6 @@ /* * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * Copyright (c) 2023 Matthew D. Sinclair * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -53,10 +54,14 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") { state_declaration(State, desc="TCP Cache States", default="TCP_State_I") { I, AccessPermission:Invalid, desc="Invalid"; + // Note: currently IV in the TCP is only for pending loads to a given cache + // line. Since the TCP is write through, stores should be allowed to pass + // through without requiring them to wait. 
+ IV, AccessPermission:Invalid, desc="Going from I to V, waiting on TCC data"; V, AccessPermission:Read_Only, desc="Valid"; A, AccessPermission:Invalid, desc="Waiting on Atomic"; - F, AccessPermission:Invalid, desc="Flushing; Waiting for Ack"; + F, AccessPermission:Invalid, desc="Flushing; Waiting for Ack"; } enumeration(Event, desc="TCP Events") { @@ -102,6 +107,8 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") bool Dirty, desc="Is the data dirty (different than memory)?"; int NumPendingMsgs,desc="Number of acks/data messages that this processor is waiting for"; bool Shared, desc="Victim hit by shared probe"; + bool isGLCSet, desc="Bypass L1 Cache"; + bool isSLCSet, desc="Bypass L1 and L2 Cache"; } structure(TBETable, external="yes") { @@ -123,6 +130,7 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") void unset_tbe(); void wakeUpAllBuffers(); void wakeUpBuffers(Addr a); + void wakeUpAllBuffers(Addr a); Cycles curCycle(); // Internal functions @@ -472,6 +480,15 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") check_allocate(TBEs); TBEs.allocate(address); set_tbe(TBEs.lookup(address)); + + // pass GLC/SLC information along + if (mandatoryQueue_in.isReady(clockEdge())) { + peek(mandatoryQueue_in, RubyRequest) { + DPRINTF(RubySlicc, "Address: %p\n", address); + tbe.isGLCSet := in_msg.isGLCSet; + tbe.isSLCSet := in_msg.isSLCSet; + } + } } action(d_deallocateTBE, "d", desc="Deallocate TBE") { @@ -510,6 +527,10 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") responseToTCP_in.dequeue(clockEdge()); } + action(st_stallAndWaitRequest, "st", desc="Stall and wait on the address") { + stall_and_wait(mandatoryQueue_in, address); + } + action(l_loadDoneHit, "ldh", desc="local load done (hits in TCP)") { assert(is_valid(cache_entry)); if (use_seq_not_coal) { @@ -528,6 +549,20 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") } } + action(ldmi_loadDoneMissInv, "ldmi", + desc="local load done (misses in TCP and line was evicted)") { + // since line was 
evicted, can't rely on data from cache entry, so use from + // the response message + peek(responseToTCP_in, ResponseMsg) { + DataBlock tmp:= in_msg.DataBlk; + if (use_seq_not_coal) { + sequencer.readCallback(address, tmp, false, MachineType:L1Cache); + } else { + coalescer.readCallback(address, MachineType:L1Cache, tmp); + } + } + } + action(ad_atomicDone, "ad", desc="atomic done") { assert(is_valid(cache_entry)); coalescer.atomicCallback(address, MachineType:L1Cache, cache_entry.DataBlk); @@ -604,6 +639,10 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") L1cache.setMRU(address); } + action(wada_wakeUpAllDependentsAddr, "wada", desc="Wake up any requests waiting for this address") { + wakeUpAllBuffers(address); + } + // action(zz_recycleMandatoryQueue, "\z", desc="recycle mandatory queue") { // mandatoryQueue_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); // } @@ -632,11 +671,19 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") // Stalling transitions do NOT check the tag array...and if they do, // they can cause a resource stall deadlock! - transition({A}, {Load, Atomic, StoreThrough}) { //TagArrayRead} { - z_stall; + // if another request arrives for the same cache line that has a pending + // atomic or load, put it on the wakeup buffer instead of z_stall'ing it. 
By + // doing so we reduce resource contention since they won't try again every cycle + // and will instead only try again once woken up + transition({A, IV}, {Load, LoadBypassEvict, Atomic, Store, StoreThrough, Flush}) { + st_stallAndWaitRequest; } - transition(I, Load) {TagArrayRead} { + // if we have a load that misses, allocate TBE entry and transition to IV + // to prevent subsequent requests to same cache line from also going to TCC + // while this request is pending + transition(I, Load, IV) {TagArrayRead} { + t_allocateTBE; n_issueRdBlk; uu_profileDataMiss; p_popMandatoryQueue; @@ -694,14 +741,38 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") p_popMandatoryQueue; } - transition(I, TCC_Ack, V) {TagArrayRead, TagArrayWrite, DataArrayRead, DataArrayWrite} { - a_allocate; - w_writeCache; - l_loadDoneMiss; + // if we got a response for a load where the line is in I, then + // another request must have come in that replaced the line in question in + // the cache. Thus, complete this request without allocating the line, but + // still deallocate TBE and wakeup any dependent addresses. + // (Note: this assumes TCC_AckWB is what stores use) + transition(I, TCC_Ack) {TagArrayRead, TagArrayWrite} { + wada_wakeUpAllDependentsAddr; + // NOTE: Because we invalidated the cache line, the assert in l_loadDoneMiss + // will fail -- unlike atomics that automatically go to I when the line returns + // loads do not automatically go to I. Resolve this by passing data from + // message. + ldmi_loadDoneMissInv; + d_deallocateTBE; pr_popResponseQueue; } - transition(I, Bypass, I) { + // if line is currently in IV, then TCC_Ack is returning the data for a + // pending load, so transition to V, deallocate TBE, and wakeup any dependent + // requests so they will be replayed now that this request has returned. 
+ transition(IV, TCC_Ack, V) {TagArrayRead, TagArrayWrite, DataArrayRead, DataArrayWrite} { + a_allocate; + w_writeCache; + wada_wakeUpAllDependentsAddr; + l_loadDoneMiss; + d_deallocateTBE; + pr_popResponseQueue; + } + + // if a bypass request arrives back at the TCP, regardless of whether the line + // is in I (from the bypass request) or IV (from a subsequent non-bypassing + // load), retain the current state and complete the bypassing request. + transition({I, IV}, Bypass) { rb_bypassDone; pr_popResponseQueue; } @@ -713,12 +784,13 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") } transition(A, TCC_Ack, I) {TagArrayRead, DataArrayRead, DataArrayWrite} { - d_deallocateTBE; a_allocate; w_writeCache; ad_atomicDone; - pr_popResponseQueue; ic_invCache; + wada_wakeUpAllDependentsAddr; + d_deallocateTBE; + pr_popResponseQueue; } transition(V, TCC_Ack, V) {TagArrayRead, DataArrayRead, DataArrayWrite} { @@ -735,20 +807,22 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") ic_invCache; } + // if a line with a pending load gets evicted, transition the line to I and + // invalidate it. + transition(IV, Repl, I) {TagArrayRead, TagArrayWrite} { + ic_invCache; + } + transition({V,I}, Flush, F) {TagArrayFlash} { a_allocate; sf_setFlush; p_popMandatoryQueue; } - transition(A, Flush) { - z_stall; - } - transition({I, V}, Evict, I) {TagArrayFlash} { inv_invDone; - p_popMandatoryQueue; ic_invCache; + p_popMandatoryQueue; } transition(A, Evict) {TagArrayFlash} { @@ -756,8 +830,11 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") p_popMandatoryQueue; } + // if a line is in IV and a TCC_AckWB comes back, we must have had a WT + // store followed by a load. Thus, complete the store without affecting + // TBE or line state. 
// TCC_AckWB only snoops TBE - transition({V, I, A}, TCC_AckWB) { + transition({V, I, IV, A}, TCC_AckWB) { wd_wtDone; pr_popResponseQueue; } diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm index b9401d680a..c36fc9ec93 100644 --- a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm +++ b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm @@ -154,7 +154,7 @@ machine(MachineType:Directory, "AMD Baseline protocol") bool Dirty, desc="Is the data dirty?"; int NumPendingAcks, desc="num acks expected"; MachineID OriginalRequestor, desc="Original Requestor"; - MachineID WTRequestor, desc="WT Requestor"; + MachineID CURequestor, desc="CU that initiated the request"; bool Cached, desc="data hit in Cache"; bool MemData, desc="Got MemData?",default="false"; bool wtData, desc="Got write through data?",default="false"; @@ -170,7 +170,9 @@ machine(MachineType:Directory, "AMD Baseline protocol") uint64_t probe_id, desc="probe id for lifetime profiling"; WriteMask writeMask, desc="outstanding write through mask"; int Len, desc="Length of memory request for DMA"; - bool isSLCSet, desc="Bypass L1 and L2 Cache"; + // GLC is passed along because it is needed in the return path + bool isGLCSet, desc="Bypass GPU L1 Cache"; + bool isSLCSet, desc="Bypass GPU L1 and L2 Cache"; } structure(TBETable, external="yes") { @@ -470,6 +472,7 @@ machine(MachineType:Directory, "AMD Baseline protocol") out_msg.ForwardRequestTime := tbe.ForwardRequestTime; out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime; out_msg.OriginalResponder := tbe.LastSender; + out_msg.CURequestor := tbe.CURequestor; out_msg.L3Hit := tbe.L3Hit; DPRINTF(RubySlicc, "%s\n", out_msg); } @@ -498,6 +501,9 @@ machine(MachineType:Directory, "AMD Baseline protocol") out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime; out_msg.OriginalResponder := tbe.LastSender; out_msg.L3Hit := tbe.L3Hit; + out_msg.isGLCSet := tbe.isGLCSet; + out_msg.isSLCSet := tbe.isSLCSet; + out_msg.CURequestor 
:= tbe.CURequestor; DPRINTF(RubySlicc, "%s\n", out_msg); } } @@ -527,9 +533,11 @@ machine(MachineType:Directory, "AMD Baseline protocol") out_msg.ForwardRequestTime := tbe.ForwardRequestTime; out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime; out_msg.OriginalResponder := tbe.LastSender; - if(tbe.atomicData){ - out_msg.WTRequestor := tbe.WTRequestor; - } + out_msg.isGLCSet := tbe.isGLCSet; + out_msg.isSLCSet := tbe.isSLCSet; + if(tbe.atomicData){ + out_msg.CURequestor := tbe.CURequestor; + } out_msg.L3Hit := tbe.L3Hit; DPRINTF(RubySlicc, "%s\n", out_msg); } @@ -555,6 +563,8 @@ machine(MachineType:Directory, "AMD Baseline protocol") out_msg.InitialRequestTime := tbe.InitialRequestTime; out_msg.ForwardRequestTime := curCycle(); out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime; + out_msg.isGLCSet := tbe.isGLCSet; + out_msg.isSLCSet := tbe.isSLCSet; DPRINTF(RubySlicc, "%s\n", out_msg); } } @@ -565,13 +575,15 @@ machine(MachineType:Directory, "AMD Baseline protocol") out_msg.addr := address; out_msg.Type := CoherenceResponseType:NBSysWBAck; out_msg.Destination.add(in_msg.Requestor); - out_msg.WTRequestor := in_msg.WTRequestor; + out_msg.CURequestor := in_msg.CURequestor; out_msg.Sender := machineID; out_msg.MessageSize := MessageSizeType:Writeback_Control; out_msg.InitialRequestTime := in_msg.InitialRequestTime; out_msg.ForwardRequestTime := curCycle(); out_msg.ProbeRequestStartTime := curCycle(); out_msg.instSeqNum := in_msg.instSeqNum; + out_msg.isGLCSet := in_msg.isGLCSet; + out_msg.isSLCSet := in_msg.isSLCSet; } } } @@ -582,7 +594,7 @@ machine(MachineType:Directory, "AMD Baseline protocol") out_msg.addr := address; out_msg.Type := CoherenceResponseType:NBSysWBAck; out_msg.Destination.add(tbe.OriginalRequestor); - out_msg.WTRequestor := tbe.WTRequestor; + out_msg.CURequestor := tbe.CURequestor; out_msg.Sender := machineID; out_msg.MessageSize := MessageSizeType:Writeback_Control; out_msg.InitialRequestTime := tbe.InitialRequestTime; @@ -773,6 +785,8 
@@ machine(MachineType:Directory, "AMD Baseline protocol") out_msg.MessageSize := MessageSizeType:Control; out_msg.Destination := probe_dests; tbe.NumPendingAcks := out_msg.Destination.count(); + out_msg.isGLCSet := in_msg.isGLCSet; + out_msg.isSLCSet := in_msg.isSLCSet; DPRINTF(RubySlicc, "%s\n", out_msg); APPEND_TRANSITION_COMMENT(" dc: Acks remaining: "); APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); @@ -877,6 +891,8 @@ machine(MachineType:Directory, "AMD Baseline protocol") out_msg.MessageSize := MessageSizeType:Control; out_msg.Destination := probe_dests; tbe.NumPendingAcks := out_msg.Destination.count(); + out_msg.isGLCSet := in_msg.isGLCSet; + out_msg.isSLCSet := in_msg.isSLCSet; DPRINTF(RubySlicc, "%s\n", (out_msg)); APPEND_TRANSITION_COMMENT(" sc: Acks remaining: "); APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); @@ -931,6 +947,8 @@ machine(MachineType:Directory, "AMD Baseline protocol") out_msg.ReturnData := false; out_msg.MessageSize := MessageSizeType:Control; out_msg.Destination := probe_dests; + out_msg.isGLCSet := in_msg.isGLCSet; + out_msg.isSLCSet := in_msg.isSLCSet; tbe.NumPendingAcks := out_msg.Destination.count(); APPEND_TRANSITION_COMMENT(" ic: Acks remaining: "); APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); @@ -1017,7 +1035,7 @@ machine(MachineType:Directory, "AMD Baseline protocol") tbe.writeMask.clear(); tbe.writeMask.orMask(in_msg.writeMask); tbe.wtData := true; - tbe.WTRequestor := in_msg.WTRequestor; + tbe.CURequestor := in_msg.CURequestor; tbe.LastSender := in_msg.Requestor; } if (in_msg.Type == CoherenceRequestType:Atomic || @@ -1032,10 +1050,14 @@ machine(MachineType:Directory, "AMD Baseline protocol") assert(in_msg.Type == CoherenceRequestType:AtomicNoReturn); tbe.atomicDataNoReturn := true; } - tbe.WTRequestor := in_msg.WTRequestor; + tbe.CURequestor := in_msg.CURequestor; tbe.LastSender := in_msg.Requestor; tbe.isSLCSet := in_msg.isSLCSet; } + // GPU read requests also need to track where the requestor came from + if (in_msg.Type 
== CoherenceRequestType:RdBlk) { + tbe.CURequestor := in_msg.CURequestor; + } tbe.Dirty := false; if (in_msg.Type == CoherenceRequestType:WriteThrough) { tbe.DataBlk.copyPartial(in_msg.DataBlk,in_msg.writeMask); @@ -1045,6 +1067,9 @@ machine(MachineType:Directory, "AMD Baseline protocol") tbe.NumPendingAcks := 0; tbe.Cached := in_msg.ForceShared; tbe.InitialRequestTime := in_msg.InitialRequestTime; + tbe.isGLCSet := in_msg.isGLCSet; + tbe.isSLCSet := in_msg.isSLCSet; + DPRINTF(RubySlicc, "t_allocateTBE in_msg: %s, tbe: %s\n", in_msg, tbe.CURequestor); } } @@ -1277,11 +1302,20 @@ machine(MachineType:Directory, "AMD Baseline protocol") } action(wada_wakeUpAllDependentsAddr, "wada", desc="Wake up any requests waiting for this address") { + DPRINTF(RubySlicc, "wada wakeup: 0x%x\n", address); wakeUpAllBuffers(address); } + /* + Currently z_stall is unused because it can lead to Protocol Stalls that + eventually lead to deadlock. Instead, it is recommended to use + st_stallAndWaitRequest in combination with a wakeupBuffer call (e.g., + wada_wakeUpAllDependentsAddr) to put the pending requests to sleep instead of + them causing head of line blocking -- wada_wakeUpAllDependentsAddr should wake + the request up once the request preventing it from completing is done. 
action(z_stall, "z", desc="...") { } + */ // TRANSITIONS transition({BL, BDR_M, BDW_M, BS_M, BM_M, B_M, BP, BDR_PM, BDW_PM, BS_PM, BM_PM, B_PM, BDR_Pm, BDW_Pm, BS_Pm, BM_Pm, B_Pm, B}, {RdBlkS, RdBlkM, RdBlk, CtoD}) { @@ -1383,19 +1417,19 @@ machine(MachineType:Directory, "AMD Baseline protocol") d_writeDataToMemory; al_allocateL3Block; pr_profileL3HitMiss; //Must come after al_allocateL3Block and before dt_deallocateTBE - wad_wakeUpDependents; + wada_wakeUpAllDependentsAddr; dt_deallocateTBE; pr_popResponseQueue; } transition(BL, StaleWB, U) {L3TagArrayWrite} { dt_deallocateTBE; - wa_wakeUpAllDependents; + wada_wakeUpAllDependentsAddr; pr_popResponseQueue; } transition({B, BDR_M, BDW_M, BS_M, BM_M, B_M, BP, BDR_PM, BDW_PM, BS_PM, BM_PM, B_PM, BDR_Pm, BDW_Pm, BS_Pm, BM_Pm, B_Pm}, {VicDirty, VicClean}) { - z_stall; + st_stallAndWaitRequest; } transition({U, BL, BDR_M, BDW_M, BS_M, BM_M, B_M, BP, BDR_PM, BDW_PM, BS_PM, BM_PM, B_PM, BDR_Pm, BDW_Pm, BS_Pm, BM_Pm, B_Pm, B}, WBAck) { diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm b/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm index 984362da39..b860ff1681 100644 --- a/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm +++ b/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm @@ -134,14 +134,14 @@ structure(CPURequestMsg, desc="...", interface="Message") { int Acks, default="0", desc="Acks that the dir (mem ctrl) should expect to receive"; CoherenceRequestType OriginalType, default="CoherenceRequestType_NA", desc="Type of request from core fwded through region buffer"; WriteMask writeMask, desc="Write Through Data"; - MachineID WTRequestor, desc="Node who initiated the write through"; + MachineID CURequestor, desc="Node who initiated the request"; int wfid, default="0", desc="wavefront id"; uint64_t instSeqNum, desc="instruction sequence number"; bool NoWriteConflict, default="true", desc="write collided with CAB entry"; int ProgramCounter, desc="PC that accesses to this block"; - bool isGLCSet, default="false", desc="GLC flag 
value in the request"; - bool isSLCSet, default="false", desc="SLC flag value in the request"; + bool isGLCSet, default="false", desc="GLC flag value in the request"; + bool isSLCSet, default="false", desc="SLC flag value in the request"; bool functionalRead(Packet *pkt) { // Only PUTX messages contains the data block @@ -170,6 +170,8 @@ structure(NBProbeRequestMsg, desc="...", interface="Message") { MachineID Requestor, desc="Requestor id for 3-hop requests"; bool NoAckNeeded, default="false", desc="For short circuting acks"; int ProgramCounter, desc="PC that accesses to this block"; + bool isGLCSet, default="false", desc="GLC flag value in the request"; + bool isSLCSet, default="false", desc="SLC flag value in the request"; bool functionalRead(Packet *pkt) { return false; @@ -240,7 +242,7 @@ structure(ResponseMsg, desc="...", interface="Message") { bool L3Hit, default="false", desc="Did memory or L3 supply the data?"; MachineID OriginalResponder, desc="Mach which wrote the data to the L3"; - MachineID WTRequestor, desc="Node who started the writethrough"; + MachineID CURequestor, desc="Node who started the access"; bool NotCached, default="false", desc="True when the Region buffer has already evicted the line"; diff --git a/src/mem/ruby/system/GPUCoalescer.cc b/src/mem/ruby/system/GPUCoalescer.cc index f9d071ba62..90d6031c6e 100644 --- a/src/mem/ruby/system/GPUCoalescer.cc +++ b/src/mem/ruby/system/GPUCoalescer.cc @@ -526,26 +526,16 @@ GPUCoalescer::readCallback(Addr address, fatal_if(crequest->getRubyType() != RubyRequestType_LD, "readCallback received non-read type response\n"); - // Iterate over the coalesced requests to respond to as many loads as - // possible until another request type is seen. Models MSHR for TCP. 
- while (crequest->getRubyType() == RubyRequestType_LD) { - hitCallback(crequest, mach, data, true, crequest->getIssueTime(), - forwardRequestTime, firstResponseTime, isRegion); - - delete crequest; - coalescedTable.at(address).pop_front(); - if (coalescedTable.at(address).empty()) { - break; - } - - crequest = coalescedTable.at(address).front(); - } + hitCallback(crequest, mach, data, true, crequest->getIssueTime(), + forwardRequestTime, firstResponseTime, isRegion); + delete crequest; + coalescedTable.at(address).pop_front(); if (coalescedTable.at(address).empty()) { - coalescedTable.erase(address); + coalescedTable.erase(address); } else { - auto nextRequest = coalescedTable.at(address).front(); - issueRequest(nextRequest); + auto nextRequest = coalescedTable.at(address).front(); + issueRequest(nextRequest); } }