gem5/src/mem/ruby/protocol/GPU_VIPER-TCP.sm
Matthew Poremba 1d64669473 mem,gpu-compute: Implement GPU TCC directed invalidate
The GPU device currently supports large BAR, which means the driver
can write directly to GPU memory over the PCI bus without using SDMA
or PM4 packets. The gem5 PCI interface only provides an atomic
interface for BAR reads/writes, so those values cannot go through the
timing-mode Ruby caches. This causes bugs because the TCC is allowed
to keep clean data between kernels for performance reasons: if a BAR
write goes directly to memory, bypassing the cache, the value in the
cache becomes stale and must be invalidated.

In this commit a TCC invalidate is generated for every write over PCI
that goes directly to GPU memory. This will also invalidate the TCP
along the way if necessary. It currently relies on driver
synchronization, which only allows BAR writes in between kernels;
therefore the cache should only be in the I or V state.

To handle a race between the invalidates and the next kernel launch,
each invalidate returns a response, and the GPU command processor
waits for all TCC invalidates to complete before launching the next
kernel.

This fixes issues with stale data in nanoGPT and possibly PENNANT.

Change-Id: I8e1290f842122682c271e5508a48037055bfbcdf
2024-04-10 11:35:25 -07:00
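
The kernel-launch gating described in the commit message can be pictured
with a minimal, hypothetical C++ sketch (the class and names below are
illustrative only and are not gem5's actual GPUCommandProcessor interface):
each BAR write that bypasses the caches issues a directed TCC invalidate,
and the next kernel may not launch until every invalidate has been
acknowledged.

    #include <cassert>
    #include <cstdint>

    // Hypothetical model of the completion counting described in the
    // commit message; not gem5 code.
    class InvalidateGate {
      public:
        // A BAR write bypassed the caches, so a TCC invalidate was issued.
        void issueInvalidate() { ++pending_; }
        // An InvL2Resp-style completion arrived from the TCC.
        void invalidateDone() { assert(pending_ > 0); --pending_; }
        // The next kernel may launch only when no invalidates are pending.
        bool readyToLaunch() const { return pending_ == 0; }
      private:
        uint64_t pending_ = 0;
    };

    int main() {
        InvalidateGate gate;
        gate.issueInvalidate();        // driver writes GPU memory over BAR
        gate.issueInvalidate();
        assert(!gate.readyToLaunch()); // launch must wait for both responses
        gate.invalidateDone();
        gate.invalidateDone();
        assert(gate.readyToLaunch());  // all invalidates acked; safe to launch
        return 0;
    }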

/*
* Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
* Copyright (c) 2023 Matthew D. Sinclair
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Blake Hechtman
*/
machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
: VIPERCoalescer* coalescer;
Sequencer* sequencer;
bool use_seq_not_coal;
CacheMemory * L1cache;
bool WB; /*is this cache Writeback?*/
bool disableL1; /* bypass L1 cache? */
int TCC_select_num_bits;
Cycles issue_latency := 40; // time to send data down to TCC
Cycles l2_hit_latency := 18;
MessageBuffer * requestFromTCP, network="To", virtual_network="1", vnet_type="request";
MessageBuffer * responseFromTCP, network="To", virtual_network="3", vnet_type="response";
MessageBuffer * unblockFromCore, network="To", virtual_network="5", vnet_type="unblock";
MessageBuffer * probeToTCP, network="From", virtual_network="1", vnet_type="request";
MessageBuffer * responseToTCP, network="From", virtual_network="3", vnet_type="response";
MessageBuffer * mandatoryQueue;
{
state_declaration(State, desc="TCP Cache States", default="TCP_State_I") {
I, AccessPermission:Invalid, desc="Invalid";
// Note: currently IV in the TCP is only for pending loads to a given cache
// line. Since the TCP is write through, stores should be allowed to pass
// through without requiring them to wait.
IV, AccessPermission:Invalid, desc="Going from I to V, waiting on TCC data";
V, AccessPermission:Read_Only, desc="Valid";
A, AccessPermission:Invalid, desc="Waiting on Atomic";
F, AccessPermission:Invalid, desc="Flushing; Waiting for Ack";
}
enumeration(Event, desc="TCP Events") {
// Core initiated
Load, desc="Load";
LoadBypassEvict, desc="Bypass L1 on a load. Evict if cache block already allocated";
Store, desc="Store to L1 (L1 is dirty)";
StoreThrough, desc="Store directly to L2 (L1 is clean)";
Atomic, desc="Atomic";
Flush, desc="Flush if dirty (wbL1 for Store Release)";
Evict, desc="Evict if clean (invL1 for Load Acquire)";
// Mem sys initiated
Repl, desc="Replacing block from cache";
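// Directed TCC invalidate (used after BAR writes to GPU memory):
// InvL2 arrives on the mandatory queue; InvL2Resp is the completion
// returned by the TCC.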
InvL2, desc="Invalidate to L2";
InvL2Resp, desc="Invalidate L2 completed";
// TCC initiated
TCC_Ack, desc="TCC Ack to Core Request";
TCC_AckWB, desc="TCC Ack for WB";
// Disable L1 cache
Bypass, desc="Bypass the entire L1 cache";
}
enumeration(RequestType,
desc="To communicate stats from transitions to recordStats") {
DataArrayRead, desc="Read the data array";
DataArrayWrite, desc="Write the data array";
TagArrayRead, desc="Read the tag array";
TagArrayWrite, desc="Write the tag array";
TagArrayFlash, desc="Flash clear the tag array";
}
structure(Entry, desc="...", interface="AbstractCacheEntry") {
State CacheState, desc="cache state";
bool Dirty, desc="Is the data dirty (diff than memory)?";
DataBlock DataBlk, desc="data for the block";
bool FromL2, default="false", desc="block just moved from L2";
WriteMask writeMask, desc="written bytes masks";
}
structure(TBE, desc="...") {
State TBEState, desc="Transient state";
DataBlock DataBlk, desc="data for the block, required for concurrent writebacks";
bool Dirty, desc="Is the data dirty (different than memory)?";
int NumPendingMsgs,desc="Number of acks/data messages that this processor is waiting for";
bool Shared, desc="Victim hit by shared probe";
bool isGLCSet, desc="Bypass L1 Cache";
bool isSLCSet, desc="Bypass L1 and L2 Cache";
}
structure(TBETable, external="yes") {
TBE lookup(Addr);
void allocate(Addr);
void deallocate(Addr);
bool isPresent(Addr);
}
TBETable TBEs, template="<TCP_TBE>", constructor="m_number_of_TBEs";
int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()";
int WTcnt, default="0";
int Fcnt, default="0";
bool inFlush, default="false";
void set_cache_entry(AbstractCacheEntry b);
void unset_cache_entry();
void set_tbe(TBE b);
void unset_tbe();
void wakeUpAllBuffers();
void wakeUpBuffers(Addr a);
void wakeUpAllBuffers(Addr a);
Cycles curCycle();
// Internal functions
Tick clockEdge();
Tick cyclesToTicks(Cycles c);
Entry getCacheEntry(Addr address), return_by_pointer="yes" {
Entry cache_entry := static_cast(Entry, "pointer", L1cache.lookup(address));
return cache_entry;
}
DataBlock getDataBlock(Addr addr), return_by_ref="yes" {
TBE tbe := TBEs.lookup(addr);
if(is_valid(tbe)) {
return tbe.DataBlk;
} else {
return getCacheEntry(addr).DataBlk;
}
}
State getState(TBE tbe, Entry cache_entry, Addr addr) {
if (is_valid(tbe)) {
return tbe.TBEState;
} else if (is_valid(cache_entry)) {
return cache_entry.CacheState;
}
return State:I;
}
void setState(TBE tbe, Entry cache_entry, Addr addr, State state) {
if (is_valid(tbe)) {
tbe.TBEState := state;
}
if (is_valid(cache_entry)) {
cache_entry.CacheState := state;
}
}
void functionalRead(Addr addr, Packet *pkt) {
TBE tbe := TBEs.lookup(addr);
if(is_valid(tbe)) {
testAndRead(addr, tbe.DataBlk, pkt);
} else {
functionalMemoryRead(pkt);
}
}
int functionalWrite(Addr addr, Packet *pkt) {
int num_functional_writes := 0;
TBE tbe := TBEs.lookup(addr);
if(is_valid(tbe)) {
num_functional_writes := num_functional_writes +
testAndWrite(addr, tbe.DataBlk, pkt);
}
num_functional_writes := num_functional_writes +
functionalMemoryWrite(pkt);
return num_functional_writes;
}
AccessPermission getAccessPermission(Addr addr) {
TBE tbe := TBEs.lookup(addr);
if(is_valid(tbe)) {
return TCP_State_to_permission(tbe.TBEState);
}
Entry cache_entry := getCacheEntry(addr);
if(is_valid(cache_entry)) {
return TCP_State_to_permission(cache_entry.CacheState);
}
return AccessPermission:NotPresent;
}
bool isValid(Addr addr) {
AccessPermission perm := getAccessPermission(addr);
if (perm == AccessPermission:NotPresent ||
perm == AccessPermission:Invalid ||
perm == AccessPermission:Busy) {
return false;
} else {
return true;
}
}
void setAccessPermission(Entry cache_entry, Addr addr, State state) {
if (is_valid(cache_entry)) {
cache_entry.changePermission(TCP_State_to_permission(state));
}
}
void recordRequestType(RequestType request_type, Addr addr) {
if (request_type == RequestType:DataArrayRead) {
L1cache.recordRequestType(CacheRequestType:DataArrayRead, addr);
} else if (request_type == RequestType:DataArrayWrite) {
L1cache.recordRequestType(CacheRequestType:DataArrayWrite, addr);
} else if (request_type == RequestType:TagArrayRead) {
L1cache.recordRequestType(CacheRequestType:TagArrayRead, addr);
} else if (request_type == RequestType:TagArrayFlash) {
L1cache.recordRequestType(CacheRequestType:TagArrayRead, addr);
} else if (request_type == RequestType:TagArrayWrite) {
L1cache.recordRequestType(CacheRequestType:TagArrayWrite, addr);
}
}
bool checkResourceAvailable(RequestType request_type, Addr addr) {
if (request_type == RequestType:DataArrayRead) {
return L1cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
} else if (request_type == RequestType:DataArrayWrite) {
return L1cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
} else if (request_type == RequestType:TagArrayRead) {
return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
} else if (request_type == RequestType:TagArrayWrite) {
return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
} else if (request_type == RequestType:TagArrayFlash) {
// FIXME should check once per cache, rather than once per cacheline
return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
} else {
error("Invalid RequestType type in checkResourceAvailable");
return true;
}
}
// Out Ports
out_port(requestNetwork_out, CPURequestMsg, requestFromTCP);
// In Ports
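// Responses from the TCC: data returns (TDSysResp), write-through/flush
// acks (TDSysWBAck, NBSysWBAck), and directed-invalidate completions
// (InvL2Resp).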
in_port(responseToTCP_in, ResponseMsg, responseToTCP) {
if (responseToTCP_in.isReady(clockEdge())) {
peek(responseToTCP_in, ResponseMsg, block_on="addr") {
Entry cache_entry := getCacheEntry(in_msg.addr);
TBE tbe := TBEs.lookup(in_msg.addr);
DPRINTF(RubySlicc, "In responseToTCP_in with %s\n", in_msg);
if (in_msg.Type == CoherenceResponseType:TDSysResp) {
if (disableL1 || in_msg.isGLCSet || in_msg.isSLCSet) {
// If L1 is disabled or requests have GLC or SLC flag set,
// then, the requests should not cache in the L1. The response
// from L2/global memory should bypass the cache
trigger(Event:Bypass, in_msg.addr, cache_entry, tbe);
} else {
if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.addr)) {
trigger(Event:TCC_Ack, in_msg.addr, cache_entry, tbe);
} else {
Addr victim := L1cache.cacheProbe(in_msg.addr);
trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
}
}
} else if (in_msg.Type == CoherenceResponseType:TDSysWBAck ||
in_msg.Type == CoherenceResponseType:NBSysWBAck) {
trigger(Event:TCC_AckWB, in_msg.addr, cache_entry, tbe);
DPRINTF(RubySlicc, "Issuing TCC_AckWB\n");
} else if (in_msg.Type == CoherenceResponseType:InvL2Resp) {
DPRINTF(RubySlicc, "Issuing InvL2Resp\n");
trigger(Event:InvL2Resp, in_msg.addr, cache_entry, tbe);
} else {
error("Unexpected Response Message to Core");
}
}
}
}
in_port(mandatoryQueue_in, RubyRequest, mandatoryQueue, desc="...") {
if (mandatoryQueue_in.isReady(clockEdge())) {
peek(mandatoryQueue_in, RubyRequest, block_on="LineAddress") {
Entry cache_entry := getCacheEntry(in_msg.LineAddress);
TBE tbe := TBEs.lookup(in_msg.LineAddress);
DPRINTF(RubySlicc, "%s\n", in_msg);
if (in_msg.Type == RubyRequestType:LD) {
// Read requests with GLC or SLC bit set should not cache in the L1.
// They need to bypass the L1 and go to the L2. If an entry exists in
// the L1, it needs to be evicted, and if no entry or invalid entry in
// the L1, still need to bypass. The LoadBypassEvict Event handles
// both cases in its transitions below, so call LoadBypassEvict for
// both.
if ((in_msg.isGLCSet || in_msg.isSLCSet)) {
trigger(Event:LoadBypassEvict, in_msg.LineAddress, cache_entry, tbe);
}
else {
trigger(Event:Load, in_msg.LineAddress, cache_entry, tbe);
}
} else if (in_msg.Type == RubyRequestType:ATOMIC ||
in_msg.Type == RubyRequestType:ATOMIC_RETURN ||
in_msg.Type == RubyRequestType:ATOMIC_NO_RETURN) {
trigger(Event:Atomic, in_msg.LineAddress, cache_entry, tbe);
} else if (in_msg.Type == RubyRequestType:ST) {
if(disableL1 || in_msg.isGLCSet || in_msg.isSLCSet) {
// Write requests with GLC or SLC bit set, or when L1 is disabled,
// should not cache in the L1. They need to perform a store through
trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe);
} else {
if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) {
trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe);
} else {
Addr victim := L1cache.cacheProbe(in_msg.LineAddress);
trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
}
} // end if (disableL1)
} else if (in_msg.Type == RubyRequestType:FLUSH) {
trigger(Event:Flush, in_msg.LineAddress, cache_entry, tbe);
} else if (in_msg.Type == RubyRequestType:REPLACEMENT){
trigger(Event:Evict, in_msg.LineAddress, cache_entry, tbe);
} else if (in_msg.Type == RubyRequestType:InvL2){
trigger(Event:InvL2, in_msg.LineAddress, cache_entry, tbe);
} else {
error("Unexpected Request Message from VIC");
}
}
}
}
// Actions
action(ic_invCache, "ic", desc="invalidate cache") {
if(is_valid(cache_entry)) {
cache_entry.writeMask.clear();
L1cache.deallocate(address);
}
unset_cache_entry();
}
action(n_issueRdBlk, "n", desc="Issue RdBlk") {
enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
out_msg.addr := address;
out_msg.Type := CoherenceRequestType:RdBlk;
out_msg.Requestor := machineID;
out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
TCC_select_low_bit, TCC_select_num_bits));
out_msg.MessageSize := MessageSizeType:Request_Control;
out_msg.InitialRequestTime := curCycle();
peek(mandatoryQueue_in, RubyRequest) {
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
action(rb_bypassDone, "rb", desc="bypass L1 of read access") {
peek(responseToTCP_in, ResponseMsg) {
DataBlock tmp:= in_msg.DataBlk;
if (use_seq_not_coal) {
sequencer.readCallback(address, tmp, false, MachineType:L1Cache);
} else {
coalescer.readCallback(address, MachineType:L1Cache, tmp);
}
if(is_valid(cache_entry)) {
unset_cache_entry();
}
}
}
action(wab_bypassDone, "wab", desc="bypass L1 of write access") {
peek(responseToTCP_in, ResponseMsg) {
DataBlock tmp := in_msg.DataBlk;
if (use_seq_not_coal) {
sequencer.writeCallback(address, tmp, false, MachineType:L1Cache);
} else {
coalescer.writeCallback(address, MachineType:L1Cache, tmp);
}
}
}
action(norl_issueRdBlkOrloadDone, "norl", desc="local load done") {
peek(mandatoryQueue_in, RubyRequest){
if (cache_entry.writeMask.containsMask(in_msg.writeMask)) {
if (use_seq_not_coal) {
sequencer.readCallback(address, cache_entry.DataBlk, false, MachineType:L1Cache);
} else {
coalescer.readCallback(address, MachineType:L1Cache, cache_entry.DataBlk);
}
} else {
enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
out_msg.addr := address;
out_msg.Type := CoherenceRequestType:RdBlk;
out_msg.Requestor := machineID;
out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
TCC_select_low_bit, TCC_select_num_bits));
out_msg.MessageSize := MessageSizeType:Request_Control;
out_msg.InitialRequestTime := curCycle();
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
}
action(wt_writeThrough, "wt", desc="Flush dirty data") {
WTcnt := WTcnt + 1;
APPEND_TRANSITION_COMMENT("write++ = ");
APPEND_TRANSITION_COMMENT(WTcnt);
enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
out_msg.addr := address;
out_msg.Requestor := machineID;
assert(is_valid(cache_entry));
out_msg.DataBlk := cache_entry.DataBlk;
out_msg.writeMask.clear();
out_msg.writeMask.orMask(cache_entry.writeMask);
out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
TCC_select_low_bit, TCC_select_num_bits));
out_msg.MessageSize := MessageSizeType:Data;
out_msg.Type := CoherenceRequestType:WriteThrough;
out_msg.InitialRequestTime := curCycle();
out_msg.Shared := false;
// forward inst sequence number to lower TCC
peek(mandatoryQueue_in, RubyRequest) {
out_msg.instSeqNum := in_msg.instSeqNum;
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
action(at_atomicThrough, "at", desc="send Atomic") {
peek(mandatoryQueue_in, RubyRequest) {
enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
out_msg.addr := address;
out_msg.Requestor := machineID;
out_msg.writeMask.clear();
out_msg.writeMask.orMask(in_msg.writeMask);
out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
TCC_select_low_bit, TCC_select_num_bits));
out_msg.MessageSize := MessageSizeType:Data;
out_msg.InitialRequestTime := curCycle();
out_msg.Shared := false;
peek(mandatoryQueue_in, RubyRequest) {
if (in_msg.Type == RubyRequestType:ATOMIC_RETURN) {
out_msg.Type := CoherenceRequestType:AtomicReturn;
} else {
assert(in_msg.Type == RubyRequestType:ATOMIC_NO_RETURN);
out_msg.Type := CoherenceRequestType:AtomicNoReturn;
}
out_msg.instSeqNum := in_msg.instSeqNum;
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
}
action(a_allocate, "a", desc="allocate block") {
if (is_invalid(cache_entry)) {
set_cache_entry(L1cache.allocate(address, new Entry));
}
cache_entry.writeMask.clear();
}
action(t_allocateTBE, "t", desc="allocate TBE Entry") {
check_allocate(TBEs);
TBEs.allocate(address);
set_tbe(TBEs.lookup(address));
// pass GLC/SLC information along
if (mandatoryQueue_in.isReady(clockEdge())) {
peek(mandatoryQueue_in, RubyRequest) {
DPRINTF(RubySlicc, "Address: %p\n", address);
tbe.isGLCSet := in_msg.isGLCSet;
tbe.isSLCSet := in_msg.isSLCSet;
}
}
}
action(d_deallocateTBE, "d", desc="Deallocate TBE") {
TBEs.deallocate(address);
unset_tbe();
}
action(sf_setFlush, "sf", desc="set flush") {
inFlush := true;
APPEND_TRANSITION_COMMENT(" inFlush is true");
enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
out_msg.addr := address;
out_msg.Requestor := machineID;
assert(is_valid(cache_entry));
out_msg.DataBlk := cache_entry.DataBlk;
out_msg.writeMask.clear();
out_msg.writeMask.orMask(cache_entry.writeMask);
out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
TCC_select_low_bit, TCC_select_num_bits));
out_msg.MessageSize := MessageSizeType:Data;
out_msg.Type := CoherenceRequestType:WriteFlush;
out_msg.InitialRequestTime := curCycle();
out_msg.Shared := false;
out_msg.isSLCSet := false;
peek(mandatoryQueue_in, RubyRequest) {
out_msg.instSeqNum := in_msg.instSeqNum;
}
}
}
action(p_popMandatoryQueue, "pm", desc="Pop Mandatory Queue") {
mandatoryQueue_in.dequeue(clockEdge());
}
action(pr_popResponseQueue, "pr", desc="Pop Response Queue") {
responseToTCP_in.dequeue(clockEdge());
}
action(st_stallAndWaitRequest, "st", desc="Stall and wait on the address") {
stall_and_wait(mandatoryQueue_in, address);
}
action(l_loadDoneHit, "ldh", desc="local load done (hits in TCP)") {
assert(is_valid(cache_entry));
if (use_seq_not_coal) {
sequencer.readCallback(address, cache_entry.DataBlk, true, MachineType:L1Cache);
} else {
coalescer.readCallback(address, MachineType:L1Cache, cache_entry.DataBlk);
}
}
action(l_loadDoneMiss, "ldm", desc="local load done (misses in TCP)") {
assert(is_valid(cache_entry));
if (use_seq_not_coal) {
sequencer.readCallback(address, cache_entry.DataBlk, false, MachineType:L1Cache);
} else {
coalescer.readCallback(address, MachineType:L1Cache, cache_entry.DataBlk);
}
}
action(ldmi_loadDoneMissInv, "ldmi",
desc="local load done (misses in TCP and line was evicted)") {
// since line was evicted, can't rely on data from cache entry, so use from
// the response message
peek(responseToTCP_in, ResponseMsg) {
DataBlock tmp:= in_msg.DataBlk;
if (use_seq_not_coal) {
sequencer.readCallback(address, tmp, false, MachineType:L1Cache);
} else {
coalescer.readCallback(address, MachineType:L1Cache, tmp);
}
}
}
action(ad_atomicDone, "ad", desc="atomic done") {
assert(is_valid(cache_entry));
coalescer.atomicCallback(address, MachineType:L1Cache, cache_entry.DataBlk);
}
action(s_storeDoneHit, "sdh", desc="local store done (hits in TCP)") {
assert(is_valid(cache_entry));
if (use_seq_not_coal) {
sequencer.writeCallback(address, cache_entry.DataBlk, true, MachineType:L1Cache);
} else {
coalescer.writeCallback(address, MachineType:L1Cache, cache_entry.DataBlk);
}
cache_entry.Dirty := true;
}
action(s_storeDoneMiss, "sdm", desc="local store done (misses in TCP)") {
assert(is_valid(cache_entry));
if (use_seq_not_coal) {
sequencer.writeCallback(address, cache_entry.DataBlk, false, MachineType:L1Cache);
} else {
coalescer.writeCallback(address, MachineType:L1Cache, cache_entry.DataBlk);
}
cache_entry.Dirty := true;
}
action(f_flushDone, "f", desc="flush done") {
assert(is_valid(cache_entry));
if (use_seq_not_coal) {
sequencer.writeCallback(address, cache_entry.DataBlk, false, MachineType:L1Cache);
} else {
coalescer.writeCallback(address, MachineType:L1Cache, cache_entry.DataBlk);
}
}
action(inv_invDone, "inv", desc="local inv done") {
if (use_seq_not_coal) {
DPRINTF(RubySlicc, "Sequencer does not define invCallback!\n");
assert(false);
} else {
coalescer.invTCPCallback(address);
}
}
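// Send a directed invalidate for this address to the TCC bank that owns
// it; used to drop stale data after a BAR write directly to GPU memory.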
action(il2_invL2, "il2", desc="Invalidate address in L2") {
enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
out_msg.addr := address;
out_msg.Type := CoherenceRequestType:InvCache;
out_msg.Requestor := machineID;
out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
TCC_select_low_bit, TCC_select_num_bits));
out_msg.MessageSize := MessageSizeType:Request_Control;
out_msg.InitialRequestTime := curCycle();
peek(mandatoryQueue_in, RubyRequest) {
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
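// The TCC invalidate completed; notify the coalescer so the outstanding
// invalidate can be retired before the next kernel launch.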
action(i2r_invL2Resp, "i2r", desc="Invalidate L2 completed") {
if (use_seq_not_coal) {
DPRINTF(RubySlicc, "Sequencer does not define invTCCCallback!\n");
assert(false);
} else {
coalescer.invTCCCallback(address);
}
}
action(wd_wtDone, "wd", desc="writethrough done") {
if (use_seq_not_coal) {
DPRINTF(RubySlicc, "Sequencer does not define writeCompleteCallback!\n");
assert(false);
} else {
peek(responseToTCP_in, ResponseMsg) {
coalescer.writeCompleteCallback(address, in_msg.instSeqNum);
}
}
}
action(dw_dirtyWrite, "dw", desc="update write mask"){
peek(mandatoryQueue_in, RubyRequest) {
cache_entry.DataBlk.copyPartial(in_msg.WTData,in_msg.writeMask);
cache_entry.writeMask.orMask(in_msg.writeMask);
}
}
action(w_writeCache, "w", desc="write data to cache") {
peek(responseToTCP_in, ResponseMsg) {
assert(is_valid(cache_entry));
DataBlock tmp := in_msg.DataBlk;
tmp.copyPartial(cache_entry.DataBlk,cache_entry.writeMask);
cache_entry.DataBlk := tmp;
}
}
action(mru_updateMRU, "mru", desc="Touch block for replacement policy") {
L1cache.setMRU(address);
}
action(wada_wakeUpAllDependentsAddr, "wada", desc="Wake up any requests waiting for this address") {
wakeUpAllBuffers(address);
}
// action(zz_recycleMandatoryQueue, "\z", desc="recycle mandatory queue") {
// mandatoryQueue_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
// }
action(z_stall, "z", desc="stall; built-in") {
// built-in action
}
// added for profiling
action(uu_profileDataMiss, "\udm", desc="Profile the demand miss"){
L1cache.profileDemandMiss();
}
action(uu_profileDataHit, "\udh", desc="Profile the demand hit"){
L1cache.profileDemandHit();
}
// Transitions
// ArrayRead/Write assumptions:
// All requests read Tag Array
// TBE allocation write the TagArray to I
// TBE only checked on misses
// Stores will also write dirty bits in the tag
// WriteThroughs still need to use cache entry as staging buffer for wavefront
// Stalling transitions do NOT check the tag array...and if they do,
// they can cause a resource stall deadlock!
// if another request arrives for the same cache line that has a pending
// atomic or load, put it on the wakeup buffer instead of z_stall'ing it. By
// doing so we reduce resource contention since they won't try again every cycle
// and will instead only try again once woken up
transition({A, IV}, {Load, LoadBypassEvict, Atomic, Store, StoreThrough, Flush}) {
st_stallAndWaitRequest;
}
// if we have a load that misses, allocate TBE entry and transition to IV
// to prevent subsequent requests to same cache line from also going to TCC
// while this request is pending
transition(I, Load, IV) {TagArrayRead} {
t_allocateTBE;
n_issueRdBlk;
uu_profileDataMiss;
p_popMandatoryQueue;
}
transition(V, Load) {TagArrayRead, DataArrayRead} {
l_loadDoneHit;
mru_updateMRU;
uu_profileDataHit;
p_popMandatoryQueue;
}
// Transition to be called when a load request with GLC or SLC flag set arrives
// at L1. This transition invalidates any existing entry and forwards the
// request to L2.
transition(V, LoadBypassEvict, I) {TagArrayRead, TagArrayWrite} {
uu_profileDataMiss;
ic_invCache;
n_issueRdBlk;
p_popMandatoryQueue;
}
// Transition to be called when a load request with GLC or SLC flag set arrives
// at L1. Since the entry is invalid, there isn't anything to forward to L2,
// so just issue read.
transition(I, LoadBypassEvict) {TagArrayRead, TagArrayWrite} {
uu_profileDataMiss;
n_issueRdBlk;
p_popMandatoryQueue;
}
transition({V, I}, Atomic, A) {TagArrayRead, TagArrayWrite} {
t_allocateTBE;
mru_updateMRU;
at_atomicThrough;
p_popMandatoryQueue;
}
transition(I, StoreThrough) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
a_allocate;
dw_dirtyWrite;
s_storeDoneMiss;
uu_profileDataMiss;
wt_writeThrough;
ic_invCache;
p_popMandatoryQueue;
}
transition(V, StoreThrough, I) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
dw_dirtyWrite;
s_storeDoneHit;
uu_profileDataHit;
wt_writeThrough;
ic_invCache;
p_popMandatoryQueue;
}
// if we got a response for a load where the line is in I, then
// another request must have come in that replaced the line in question in
// the cache. Thus, complete this request without allocating the line, but
// still deallocate TBE and wakeup any dependent addresses.
// (Note: this assumes TCC_AckWB is what stores use)
transition(I, TCC_Ack) {TagArrayRead, TagArrayWrite} {
wada_wakeUpAllDependentsAddr;
// NOTE: Because we invalidated the cache line, the assert in l_loadDoneMiss
// will fail -- unlike atomics that automatically go to I when the line returns
// loads do not automatically go to I. Resolve this by passing data from
// message.
ldmi_loadDoneMissInv;
d_deallocateTBE;
pr_popResponseQueue;
}
// if line is currently in IV, then TCC_Ack is returning the data for a
// pending load, so transition to V, deallocate TBE, and wakeup any dependent
// requests so they will be replayed now that this request has returned.
transition(IV, TCC_Ack, V) {TagArrayRead, TagArrayWrite, DataArrayRead, DataArrayWrite} {
a_allocate;
w_writeCache;
wada_wakeUpAllDependentsAddr;
l_loadDoneMiss;
d_deallocateTBE;
pr_popResponseQueue;
}
// if a bypass request arrives back at the TCP, regardless of whether the line
// is in I (from the bypass request) or IV (from a subsequent non-bypassing
// load), retain the current state and complete the bypassing request.
transition({I, IV}, Bypass) {
rb_bypassDone;
pr_popResponseQueue;
}
transition(A, Bypass, I){
d_deallocateTBE;
wab_bypassDone;
pr_popResponseQueue;
}
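// Atomic response: install the returned data just long enough to complete
// the atomic, then invalidate the line (atomic results are not cached).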
transition(A, TCC_Ack, I) {TagArrayRead, DataArrayRead, DataArrayWrite} {
a_allocate;
w_writeCache;
ad_atomicDone;
ic_invCache;
wada_wakeUpAllDependentsAddr;
d_deallocateTBE;
pr_popResponseQueue;
}
transition(V, TCC_Ack, V) {TagArrayRead, DataArrayRead, DataArrayWrite} {
w_writeCache;
l_loadDoneHit;
pr_popResponseQueue;
}
transition({I, V}, Repl, I) {TagArrayRead, TagArrayWrite} {
ic_invCache;
}
transition({A}, Repl) {TagArrayRead, TagArrayWrite} {
ic_invCache;
}
// if a line with a pending load gets evicted, transition the line to I and
// invalidate it.
transition(IV, Repl, I) {TagArrayRead, TagArrayWrite} {
ic_invCache;
}
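// Flush allocates a staging entry and sends a WriteFlush to the TCC; the
// line waits in F until TCC_AckWB returns, then completes and invalidates.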
transition({V,I}, Flush, F) {TagArrayFlash} {
a_allocate;
sf_setFlush;
p_popMandatoryQueue;
}
transition({I, V}, Evict, I) {TagArrayFlash} {
inv_invDone;
ic_invCache;
p_popMandatoryQueue;
}
transition(A, Evict) {TagArrayFlash} {
inv_invDone;
p_popMandatoryQueue;
}
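// Directed L2 invalidate: in I, simply forward the invalidate to the TCC;
// in V, drop the local copy first. The InvL2Resp completion is passed back
// to the coalescer.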
transition(I, InvL2) {
il2_invL2;
p_popMandatoryQueue;
}
transition(V, InvL2, I) {
ic_invCache;
il2_invL2;
p_popMandatoryQueue;
}
transition(I, InvL2Resp) {
i2r_invL2Resp;
pr_popResponseQueue;
}
// if a line is in IV and a TCC_AckWB comes back, we must have had a WT
// store followed by a load. Thus, complete the store without affecting
// TBE or line state.
// TCC_AckWB only snoops TBE
transition({V, I, IV, A}, TCC_AckWB) {
wd_wtDone;
pr_popResponseQueue;
}
transition(F, TCC_AckWB, I) {
f_flushDone;
pr_popResponseQueue;
ic_invCache;
}
}