gem5/src/mem/ruby/protocol/GPU_VIPER-TCP.sm
Matthew Poremba 1d64669473 mem,gpu-compute: Implement GPU TCC directed invalidate
The GPU device currently supports large BAR, which means the driver
can write directly to GPU memory over the PCI bus without using SDMA
or PM4 packets. The gem5 PCI interface only provides an atomic
interface for BAR reads/writes, so those values cannot go through the
timing-mode Ruby caches. This causes bugs because the TCC is allowed
to keep clean data between kernels for performance reasons: if a BAR
write goes directly to memory, bypassing the cache, the value in the
cache becomes stale and must be invalidated.

In this commit a TCC invalidate is generated for every write over PCI
that goes directly to GPU memory. This will also invalidate the TCP
along the way if necessary. It currently relies on driver
synchronization, which only allows BAR writes in between kernels;
therefore the cache should only be in the I or V state.

To handle a race between the invalidates and the next kernel launch,
each invalidate returns a response, and the GPU command processor
waits for all TCC invalidates to complete before launching the next
kernel.

This fixes issues with stale data in nanoGPT and possibly PENNANT.

Change-Id: I8e1290f842122682c271e5508a48037055bfbcdf
2024-04-10 11:35:25 -07:00
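
The kernel-launch gating described in the commit message can be pictured
with a minimal, hypothetical C++ sketch (the class and names below are
illustrative only and are not gem5's actual GPUCommandProcessor interface):
each BAR write that bypasses the caches issues a directed TCC invalidate,
and the next kernel may not launch until every invalidate has been
acknowledged.

    #include <cassert>
    #include <cstdint>

    // Hypothetical model of the completion counting described in the
    // commit message; not gem5 code.
    class InvalidateGate {
      public:
        // A BAR write bypassed the caches, so a TCC invalidate was issued.
        void issueInvalidate() { ++pending_; }
        // An InvL2Resp-style completion arrived from the TCC.
        void invalidateDone() { assert(pending_ > 0); --pending_; }
        // The next kernel may launch only when no invalidates are pending.
        bool readyToLaunch() const { return pending_ == 0; }
      private:
        uint64_t pending_ = 0;
    };

    int main() {
        InvalidateGate gate;
        gate.issueInvalidate();        // driver writes GPU memory over BAR
        gate.issueInvalidate();
        assert(!gate.readyToLaunch()); // launch must wait for both responses
        gate.invalidateDone();
        gate.invalidateDone();
        assert(gate.readyToLaunch());  // all invalidates acked; safe to launch
        return 0;
    }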

/*
* Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
* Copyright (c) 2023 Matthew D. Sinclair
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Blake Hechtman
*/
machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
: VIPERCoalescer* coalescer;
Sequencer* sequencer;
bool use_seq_not_coal;
CacheMemory * L1cache;
bool WB; /*is this cache Writeback?*/
bool disableL1; /* bypass L1 cache? */
int TCC_select_num_bits;
Cycles issue_latency := 40; // time to send data down to TCC
Cycles l2_hit_latency := 18;
MessageBuffer * requestFromTCP, network="To", virtual_network="1", vnet_type="request";
MessageBuffer * responseFromTCP, network="To", virtual_network="3", vnet_type="response";
MessageBuffer * unblockFromCore, network="To", virtual_network="5", vnet_type="unblock";
MessageBuffer * probeToTCP, network="From", virtual_network="1", vnet_type="request";
MessageBuffer * responseToTCP, network="From", virtual_network="3", vnet_type="response";
MessageBuffer * mandatoryQueue;
{
state_declaration(State, desc="TCP Cache States", default="TCP_State_I") {
I, AccessPermission:Invalid, desc="Invalid";
// Note: currently IV in the TCP is only for pending loads to a given cache
// line. Since the TCP is write through, stores should be allowed to pass
// through without requiring them to wait.
IV, AccessPermission:Invalid, desc="Going from I to V, waiting on TCC data";
V, AccessPermission:Read_Only, desc="Valid";
A, AccessPermission:Invalid, desc="Waiting on Atomic";
F, AccessPermission:Invalid, desc="Flushing; Waiting for Ack";
}
enumeration(Event, desc="TCP Events") {
// Core initiated
Load, desc="Load";
LoadBypassEvict, desc="Bypass L1 on a load. Evict if cache block already allocated";
Store, desc="Store to L1 (L1 is dirty)";
StoreThrough, desc="Store directly to L2 (L1 is clean)";
Atomic, desc="Atomic";
Flush, desc="Flush if dirty (wbL1 for Store Release)";
Evict, desc="Evict if clean (invL1 for Load Acquire)";
// Mem sys initiated
Repl, desc="Replacing block from cache";
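// Directed TCC invalidate (used after BAR writes to GPU memory):
// InvL2 arrives on the mandatory queue; InvL2Resp is the completion
// returned by the TCC.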
InvL2, desc="Invalidate to L2";
InvL2Resp, desc="Invalidate L2 completed";
// TCC initiated
TCC_Ack, desc="TCC Ack to Core Request";
TCC_AckWB, desc="TCC Ack for WB";
// Disable L1 cache
Bypass, desc="Bypass the entire L1 cache";
}
enumeration(RequestType,
desc="To communicate stats from transitions to recordStats") {
DataArrayRead, desc="Read the data array";
DataArrayWrite, desc="Write the data array";
TagArrayRead, desc="Read the tag array";
TagArrayWrite, desc="Write the tag array";
TagArrayFlash, desc="Flash clear the tag array";
}
structure(Entry, desc="...", interface="AbstractCacheEntry") {
State CacheState, desc="cache state";
bool Dirty, desc="Is the data dirty (diff than memory)?";
DataBlock DataBlk, desc="data for the block";
bool FromL2, default="false", desc="block just moved from L2";
WriteMask writeMask, desc="written bytes masks";
}
structure(TBE, desc="...") {
State TBEState, desc="Transient state";
DataBlock DataBlk, desc="data for the block, required for concurrent writebacks";
bool Dirty, desc="Is the data dirty (different than memory)?";
int NumPendingMsgs,desc="Number of acks/data messages that this processor is waiting for";
bool Shared, desc="Victim hit by shared probe";
bool isGLCSet, desc="Bypass L1 Cache";
bool isSLCSet, desc="Bypass L1 and L2 Cache";
}
structure(TBETable, external="yes") {
TBE lookup(Addr);
void allocate(Addr);
void deallocate(Addr);
bool isPresent(Addr);
}
TBETable TBEs, template="<TCP_TBE>", constructor="m_number_of_TBEs";
int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()";
int WTcnt, default="0";
int Fcnt, default="0";
bool inFlush, default="false";
void set_cache_entry(AbstractCacheEntry b);
void unset_cache_entry();
void set_tbe(TBE b);
void unset_tbe();
void wakeUpAllBuffers();
void wakeUpBuffers(Addr a);
void wakeUpAllBuffers(Addr a);
Cycles curCycle();
// Internal functions
Tick clockEdge();
Tick cyclesToTicks(Cycles c);
Entry getCacheEntry(Addr address), return_by_pointer="yes" {
Entry cache_entry := static_cast(Entry, "pointer", L1cache.lookup(address));
return cache_entry;
}
DataBlock getDataBlock(Addr addr), return_by_ref="yes" {
TBE tbe := TBEs.lookup(addr);
if(is_valid(tbe)) {
return tbe.DataBlk;
} else {
return getCacheEntry(addr).DataBlk;
}
}
State getState(TBE tbe, Entry cache_entry, Addr addr) {
if (is_valid(tbe)) {
return tbe.TBEState;
} else if (is_valid(cache_entry)) {
return cache_entry.CacheState;
}
return State:I;
}
void setState(TBE tbe, Entry cache_entry, Addr addr, State state) {
if (is_valid(tbe)) {
tbe.TBEState := state;
}
if (is_valid(cache_entry)) {
cache_entry.CacheState := state;
}
}
void functionalRead(Addr addr, Packet *pkt) {
TBE tbe := TBEs.lookup(addr);
if(is_valid(tbe)) {
testAndRead(addr, tbe.DataBlk, pkt);
} else {
functionalMemoryRead(pkt);
}
}
int functionalWrite(Addr addr, Packet *pkt) {
int num_functional_writes := 0;
TBE tbe := TBEs.lookup(addr);
if(is_valid(tbe)) {
num_functional_writes := num_functional_writes +
testAndWrite(addr, tbe.DataBlk, pkt);
}
num_functional_writes := num_functional_writes +
functionalMemoryWrite(pkt);
return num_functional_writes;
}
AccessPermission getAccessPermission(Addr addr) {
TBE tbe := TBEs.lookup(addr);
if(is_valid(tbe)) {
return TCP_State_to_permission(tbe.TBEState);
}
Entry cache_entry := getCacheEntry(addr);
if(is_valid(cache_entry)) {
return TCP_State_to_permission(cache_entry.CacheState);
}
return AccessPermission:NotPresent;
}
bool isValid(Addr addr) {
AccessPermission perm := getAccessPermission(addr);
if (perm == AccessPermission:NotPresent ||
perm == AccessPermission:Invalid ||
perm == AccessPermission:Busy) {
return false;
} else {
return true;
}
}
void setAccessPermission(Entry cache_entry, Addr addr, State state) {
if (is_valid(cache_entry)) {
cache_entry.changePermission(TCP_State_to_permission(state));
}
}
void recordRequestType(RequestType request_type, Addr addr) {
if (request_type == RequestType:DataArrayRead) {
L1cache.recordRequestType(CacheRequestType:DataArrayRead, addr);
} else if (request_type == RequestType:DataArrayWrite) {
L1cache.recordRequestType(CacheRequestType:DataArrayWrite, addr);
} else if (request_type == RequestType:TagArrayRead) {
L1cache.recordRequestType(CacheRequestType:TagArrayRead, addr);
} else if (request_type == RequestType:TagArrayFlash) {
L1cache.recordRequestType(CacheRequestType:TagArrayRead, addr);
} else if (request_type == RequestType:TagArrayWrite) {
L1cache.recordRequestType(CacheRequestType:TagArrayWrite, addr);
}
}
bool checkResourceAvailable(RequestType request_type, Addr addr) {
if (request_type == RequestType:DataArrayRead) {
return L1cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
} else if (request_type == RequestType:DataArrayWrite) {
return L1cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
} else if (request_type == RequestType:TagArrayRead) {
return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
} else if (request_type == RequestType:TagArrayWrite) {
return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
} else if (request_type == RequestType:TagArrayFlash) {
// FIXME should check once per cache, rather than once per cacheline
return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
} else {
error("Invalid RequestType type in checkResourceAvailable");
return true;
}
}
// Out Ports
out_port(requestNetwork_out, CPURequestMsg, requestFromTCP);
// In Ports
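// Responses from the TCC: data returns (TDSysResp), write-through/flush
// acks (TDSysWBAck, NBSysWBAck), and directed-invalidate completions
// (InvL2Resp).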
in_port(responseToTCP_in, ResponseMsg, responseToTCP) {
if (responseToTCP_in.isReady(clockEdge())) {
peek(responseToTCP_in, ResponseMsg, block_on="addr") {
Entry cache_entry := getCacheEntry(in_msg.addr);
TBE tbe := TBEs.lookup(in_msg.addr);
DPRINTF(RubySlicc, "In responseToTCP_in with %s\n", in_msg);
if (in_msg.Type == CoherenceResponseType:TDSysResp) {
if (disableL1 || in_msg.isGLCSet || in_msg.isSLCSet) {
// If L1 is disabled or requests have GLC or SLC flag set,
// then, the requests should not cache in the L1. The response
// from L2/global memory should bypass the cache
trigger(Event:Bypass, in_msg.addr, cache_entry, tbe);
} else {
if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.addr)) {
trigger(Event:TCC_Ack, in_msg.addr, cache_entry, tbe);
} else {
Addr victim := L1cache.cacheProbe(in_msg.addr);
trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
}
}
} else if (in_msg.Type == CoherenceResponseType:TDSysWBAck ||
in_msg.Type == CoherenceResponseType:NBSysWBAck) {
trigger(Event:TCC_AckWB, in_msg.addr, cache_entry, tbe);
DPRINTF(RubySlicc, "Issuing TCC_AckWB\n");
} else if (in_msg.Type == CoherenceResponseType:InvL2Resp) {
DPRINTF(RubySlicc, "Issuing InvL2Resp\n");
trigger(Event:InvL2Resp, in_msg.addr, cache_entry, tbe);
} else {
error("Unexpected Response Message to Core");
}
}
}
}
in_port(mandatoryQueue_in, RubyRequest, mandatoryQueue, desc="...") {
if (mandatoryQueue_in.isReady(clockEdge())) {
peek(mandatoryQueue_in, RubyRequest, block_on="LineAddress") {
Entry cache_entry := getCacheEntry(in_msg.LineAddress);
TBE tbe := TBEs.lookup(in_msg.LineAddress);
DPRINTF(RubySlicc, "%s\n", in_msg);
if (in_msg.Type == RubyRequestType:LD) {
// Read requests with GLC or SLC bit set should not cache in the L1.
// They need to bypass the L1 and go to the L2. If an entry exists in
// the L1, it needs to be evicted, and if no entry or invalid entry in
// the L1, still need to bypass. The LoadBypassEvict Event handles
// both cases in its transitions below, so call LoadBypassEvict for
// both.
if ((in_msg.isGLCSet || in_msg.isSLCSet)) {
trigger(Event:LoadBypassEvict, in_msg.LineAddress, cache_entry, tbe);
}
else {
trigger(Event:Load, in_msg.LineAddress, cache_entry, tbe);
}
} else if (in_msg.Type == RubyRequestType:ATOMIC ||
in_msg.Type == RubyRequestType:ATOMIC_RETURN ||
in_msg.Type == RubyRequestType:ATOMIC_NO_RETURN) {
trigger(Event:Atomic, in_msg.LineAddress, cache_entry, tbe);
} else if (in_msg.Type == RubyRequestType:ST) {
if(disableL1 || in_msg.isGLCSet || in_msg.isSLCSet) {
// Write requests with GLC or SLC bit set, or when L1 is disabled,
// should not cache in the L1. They need to perform a store through
trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe);
} else {
if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) {
trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe);
} else {
Addr victim := L1cache.cacheProbe(in_msg.LineAddress);
trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
}
} // end if (disableL1)
} else if (in_msg.Type == RubyRequestType:FLUSH) {
trigger(Event:Flush, in_msg.LineAddress, cache_entry, tbe);
} else if (in_msg.Type == RubyRequestType:REPLACEMENT){
trigger(Event:Evict, in_msg.LineAddress, cache_entry, tbe);
} else if (in_msg.Type == RubyRequestType:InvL2){
trigger(Event:InvL2, in_msg.LineAddress, cache_entry, tbe);
} else {
error("Unexpected Request Message from VIC");
}
}
}
}
// Actions
action(ic_invCache, "ic", desc="invalidate cache") {
if(is_valid(cache_entry)) {
cache_entry.writeMask.clear();
L1cache.deallocate(address);
}
unset_cache_entry();
}
action(n_issueRdBlk, "n", desc="Issue RdBlk") {
enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
out_msg.addr := address;
out_msg.Type := CoherenceRequestType:RdBlk;
out_msg.Requestor := machineID;
out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
TCC_select_low_bit, TCC_select_num_bits));
out_msg.MessageSize := MessageSizeType:Request_Control;
out_msg.InitialRequestTime := curCycle();
peek(mandatoryQueue_in, RubyRequest) {
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
action(rb_bypassDone, "rb", desc="bypass L1 of read access") {
peek(responseToTCP_in, ResponseMsg) {
DataBlock tmp:= in_msg.DataBlk;
if (use_seq_not_coal) {
sequencer.readCallback(address, tmp, false, MachineType:L1Cache);
} else {
coalescer.readCallback(address, MachineType:L1Cache, tmp);
}
if(is_valid(cache_entry)) {
unset_cache_entry();
}
}
}
action(wab_bypassDone, "wab", desc="bypass L1 of write access") {
peek(responseToTCP_in, ResponseMsg) {
DataBlock tmp := in_msg.DataBlk;
if (use_seq_not_coal) {
sequencer.writeCallback(address, tmp, false, MachineType:L1Cache);
} else {
coalescer.writeCallback(address, MachineType:L1Cache, tmp);
}
}
}
action(norl_issueRdBlkOrloadDone, "norl", desc="local load done") {
peek(mandatoryQueue_in, RubyRequest){
if (cache_entry.writeMask.containsMask(in_msg.writeMask)) {
if (use_seq_not_coal) {
sequencer.readCallback(address, cache_entry.DataBlk, false, MachineType:L1Cache);
} else {
coalescer.readCallback(address, MachineType:L1Cache, cache_entry.DataBlk);
}
} else {
enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
out_msg.addr := address;
out_msg.Type := CoherenceRequestType:RdBlk;
out_msg.Requestor := machineID;
out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
TCC_select_low_bit, TCC_select_num_bits));
out_msg.MessageSize := MessageSizeType:Request_Control;
out_msg.InitialRequestTime := curCycle();
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
}
action(wt_writeThrough, "wt", desc="Flush dirty data") {
WTcnt := WTcnt + 1;
APPEND_TRANSITION_COMMENT("write++ = ");
APPEND_TRANSITION_COMMENT(WTcnt);
enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
out_msg.addr := address;
out_msg.Requestor := machineID;
assert(is_valid(cache_entry));
out_msg.DataBlk := cache_entry.DataBlk;
out_msg.writeMask.clear();
out_msg.writeMask.orMask(cache_entry.writeMask);
out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
TCC_select_low_bit, TCC_select_num_bits));
out_msg.MessageSize := MessageSizeType:Data;
out_msg.Type := CoherenceRequestType:WriteThrough;
out_msg.InitialRequestTime := curCycle();
out_msg.Shared := false;
// forward inst sequence number to lower TCC
peek(mandatoryQueue_in, RubyRequest) {
out_msg.instSeqNum := in_msg.instSeqNum;
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
action(at_atomicThrough, "at", desc="send Atomic") {
peek(mandatoryQueue_in, RubyRequest) {
enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
out_msg.addr := address;
out_msg.Requestor := machineID;
out_msg.writeMask.clear();
out_msg.writeMask.orMask(in_msg.writeMask);
out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
TCC_select_low_bit, TCC_select_num_bits));
out_msg.MessageSize := MessageSizeType:Data;
out_msg.InitialRequestTime := curCycle();
out_msg.Shared := false;
peek(mandatoryQueue_in, RubyRequest) {
if (in_msg.Type == RubyRequestType:ATOMIC_RETURN) {
out_msg.Type := CoherenceRequestType:AtomicReturn;
} else {
assert(in_msg.Type == RubyRequestType:ATOMIC_NO_RETURN);
out_msg.Type := CoherenceRequestType:AtomicNoReturn;
}
out_msg.instSeqNum := in_msg.instSeqNum;
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
}
action(a_allocate, "a", desc="allocate block") {
if (is_invalid(cache_entry)) {
set_cache_entry(L1cache.allocate(address, new Entry));
}
cache_entry.writeMask.clear();
}
action(t_allocateTBE, "t", desc="allocate TBE Entry") {
check_allocate(TBEs);
TBEs.allocate(address);
set_tbe(TBEs.lookup(address));
// pass GLC/SLC information along
if (mandatoryQueue_in.isReady(clockEdge())) {
peek(mandatoryQueue_in, RubyRequest) {
DPRINTF(RubySlicc, "Address: %p\n", address);
tbe.isGLCSet := in_msg.isGLCSet;
tbe.isSLCSet := in_msg.isSLCSet;
}
}
}
action(d_deallocateTBE, "d", desc="Deallocate TBE") {
TBEs.deallocate(address);
unset_tbe();
}
action(sf_setFlush, "sf", desc="set flush") {
inFlush := true;
APPEND_TRANSITION_COMMENT(" inFlush is true");
enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
out_msg.addr := address;
out_msg.Requestor := machineID;
assert(is_valid(cache_entry));
out_msg.DataBlk := cache_entry.DataBlk;
out_msg.writeMask.clear();
out_msg.writeMask.orMask(cache_entry.writeMask);
out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
TCC_select_low_bit, TCC_select_num_bits));
out_msg.MessageSize := MessageSizeType:Data;
out_msg.Type := CoherenceRequestType:WriteFlush;
out_msg.InitialRequestTime := curCycle();
out_msg.Shared := false;
out_msg.isSLCSet := false;
peek(mandatoryQueue_in, RubyRequest) {
out_msg.instSeqNum := in_msg.instSeqNum;
}
}
}
action(p_popMandatoryQueue, "pm", desc="Pop Mandatory Queue") {
mandatoryQueue_in.dequeue(clockEdge());
}
action(pr_popResponseQueue, "pr", desc="Pop Response Queue") {
responseToTCP_in.dequeue(clockEdge());
}
action(st_stallAndWaitRequest, "st", desc="Stall and wait on the address") {
stall_and_wait(mandatoryQueue_in, address);
}
action(l_loadDoneHit, "ldh", desc="local load done (hits in TCP)") {
assert(is_valid(cache_entry));
if (use_seq_not_coal) {
sequencer.readCallback(address, cache_entry.DataBlk, true, MachineType:L1Cache);
} else {
coalescer.readCallback(address, MachineType:L1Cache, cache_entry.DataBlk);
}
}
action(l_loadDoneMiss, "ldm", desc="local load done (misses in TCP)") {
assert(is_valid(cache_entry));
if (use_seq_not_coal) {
sequencer.readCallback(address, cache_entry.DataBlk, false, MachineType:L1Cache);
} else {
coalescer.readCallback(address, MachineType:L1Cache, cache_entry.DataBlk);
}
}
action(ldmi_loadDoneMissInv, "ldmi",
desc="local load done (misses in TCP and line was evicted)") {
// since line was evicted, can't rely on data from cache entry, so use from
// the response message
peek(responseToTCP_in, ResponseMsg) {
DataBlock tmp:= in_msg.DataBlk;
if (use_seq_not_coal) {
sequencer.readCallback(address, tmp, false, MachineType:L1Cache);
} else {
coalescer.readCallback(address, MachineType:L1Cache, tmp);
}
}
}
action(ad_atomicDone, "ad", desc="atomic done") {
assert(is_valid(cache_entry));
coalescer.atomicCallback(address, MachineType:L1Cache, cache_entry.DataBlk);
}
action(s_storeDoneHit, "sdh", desc="local store done (hits in TCP)") {
assert(is_valid(cache_entry));
if (use_seq_not_coal) {
sequencer.writeCallback(address, cache_entry.DataBlk, true, MachineType:L1Cache);
} else {
coalescer.writeCallback(address, MachineType:L1Cache, cache_entry.DataBlk);
}
cache_entry.Dirty := true;
}
action(s_storeDoneMiss, "sdm", desc="local store done (misses in TCP)") {
assert(is_valid(cache_entry));
if (use_seq_not_coal) {
sequencer.writeCallback(address, cache_entry.DataBlk, false, MachineType:L1Cache);
} else {
coalescer.writeCallback(address, MachineType:L1Cache, cache_entry.DataBlk);
}
cache_entry.Dirty := true;
}
action(f_flushDone, "f", desc="flush done") {
assert(is_valid(cache_entry));
if (use_seq_not_coal) {
sequencer.writeCallback(address, cache_entry.DataBlk, false, MachineType:L1Cache);
} else {
coalescer.writeCallback(address, MachineType:L1Cache, cache_entry.DataBlk);
}
}
action(inv_invDone, "inv", desc="local inv done") {
if (use_seq_not_coal) {
DPRINTF(RubySlicc, "Sequencer does not define invCallback!\n");
assert(false);
} else {
coalescer.invTCPCallback(address);
}
}
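// Send a directed invalidate for this address to the TCC bank that owns
// it; used to drop stale data after a BAR write directly to GPU memory.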
action(il2_invL2, "il2", desc="Invalidate address in L2") {
enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
out_msg.addr := address;
out_msg.Type := CoherenceRequestType:InvCache;
out_msg.Requestor := machineID;
out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
TCC_select_low_bit, TCC_select_num_bits));
out_msg.MessageSize := MessageSizeType:Request_Control;
out_msg.InitialRequestTime := curCycle();
peek(mandatoryQueue_in, RubyRequest) {
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
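// The TCC invalidate completed; notify the coalescer so the outstanding
// invalidate can be retired before the next kernel launch.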
action(i2r_invL2Resp, "i2r", desc="Invalidate L2 completed") {
if (use_seq_not_coal) {
DPRINTF(RubySlicc, "Sequencer does not define invTCCCallback!\n");
assert(false);
} else {
coalescer.invTCCCallback(address);
}
}
action(wd_wtDone, "wd", desc="writethrough done") {
if (use_seq_not_coal) {
DPRINTF(RubySlicc, "Sequencer does not define writeCompleteCallback!\n");
assert(false);
} else {
peek(responseToTCP_in, ResponseMsg) {
coalescer.writeCompleteCallback(address, in_msg.instSeqNum);
}
}
}
action(dw_dirtyWrite, "dw", desc="update write mask"){
peek(mandatoryQueue_in, RubyRequest) {
cache_entry.DataBlk.copyPartial(in_msg.WTData,in_msg.writeMask);
cache_entry.writeMask.orMask(in_msg.writeMask);
}
}
action(w_writeCache, "w", desc="write data to cache") {
peek(responseToTCP_in, ResponseMsg) {
assert(is_valid(cache_entry));
DataBlock tmp := in_msg.DataBlk;
tmp.copyPartial(cache_entry.DataBlk,cache_entry.writeMask);
cache_entry.DataBlk := tmp;
}
}
action(mru_updateMRU, "mru", desc="Touch block for replacement policy") {
L1cache.setMRU(address);
}
action(wada_wakeUpAllDependentsAddr, "wada", desc="Wake up any requests waiting for this address") {
wakeUpAllBuffers(address);
}
// action(zz_recycleMandatoryQueue, "\z", desc="recycle mandatory queue") {
// mandatoryQueue_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
// }
action(z_stall, "z", desc="stall; built-in") {
// built-in action
}
// added for profiling
action(uu_profileDataMiss, "\udm", desc="Profile the demand miss"){
L1cache.profileDemandMiss();
}
action(uu_profileDataHit, "\udh", desc="Profile the demand hit"){
L1cache.profileDemandHit();
}
// Transitions
// ArrayRead/Write assumptions:
// All requests read Tag Array
// TBE allocation write the TagArray to I
// TBE only checked on misses
// Stores will also write dirty bits in the tag
// WriteThroughs still need to use cache entry as staging buffer for wavefront
// Stalling transitions do NOT check the tag array...and if they do,
// they can cause a resource stall deadlock!
// if another request arrives for the same cache line that has a pending
// atomic or load, put it on the wakeup buffer instead of z_stall'ing it. By
// doing so we reduce resource contention since they won't try again every cycle
// and will instead only try again once woken up
transition({A, IV}, {Load, LoadBypassEvict, Atomic, Store, StoreThrough, Flush}) {
st_stallAndWaitRequest;
}
// if we have a load that misses, allocate TBE entry and transition to IV
// to prevent subsequent requests to same cache line from also going to TCC
// while this request is pending
transition(I, Load, IV) {TagArrayRead} {
t_allocateTBE;
n_issueRdBlk;
uu_profileDataMiss;
p_popMandatoryQueue;
}
transition(V, Load) {TagArrayRead, DataArrayRead} {
l_loadDoneHit;
mru_updateMRU;
uu_profileDataHit;
p_popMandatoryQueue;
}
// Transition to be called when a load request with GLC or SLC flag set arrives
// at L1. This transition invalidates any existing entry and forwards the
// request to L2.
transition(V, LoadBypassEvict, I) {TagArrayRead, TagArrayWrite} {
uu_profileDataMiss;
ic_invCache;
n_issueRdBlk;
p_popMandatoryQueue;
}
// Transition to be called when a load request with GLC or SLC flag set arrives
// at L1. Since the entry is invalid, there isn't anything to forward to L2,
// so just issue read.
transition(I, LoadBypassEvict) {TagArrayRead, TagArrayWrite} {
uu_profileDataMiss;
n_issueRdBlk;
p_popMandatoryQueue;
}
transition({V, I}, Atomic, A) {TagArrayRead, TagArrayWrite} {
t_allocateTBE;
mru_updateMRU;
at_atomicThrough;
p_popMandatoryQueue;
}
transition(I, StoreThrough) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
a_allocate;
dw_dirtyWrite;
s_storeDoneMiss;
uu_profileDataMiss;
wt_writeThrough;
ic_invCache;
p_popMandatoryQueue;
}
transition(V, StoreThrough, I) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
dw_dirtyWrite;
s_storeDoneHit;
uu_profileDataHit;
wt_writeThrough;
ic_invCache;
p_popMandatoryQueue;
}
// if we got a response for a load where the line is in I, then
// another request must have come in that replaced the line in question in
// the cache. Thus, complete this request without allocating the line, but
// still deallocate TBE and wakeup any dependent addresses.
// (Note: this assumes TCC_AckWB is what stores use)
transition(I, TCC_Ack) {TagArrayRead, TagArrayWrite} {
wada_wakeUpAllDependentsAddr;
// NOTE: Because we invalidated the cache line, the assert in l_loadDoneMiss
// will fail -- unlike atomics that automatically go to I when the line returns
// loads do not automatically go to I. Resolve this by passing data from
// message.
ldmi_loadDoneMissInv;
d_deallocateTBE;
pr_popResponseQueue;
}
// if line is currently in IV, then TCC_Ack is returning the data for a
// pending load, so transition to V, deallocate TBE, and wakeup any dependent
// requests so they will be replayed now that this request has returned.
transition(IV, TCC_Ack, V) {TagArrayRead, TagArrayWrite, DataArrayRead, DataArrayWrite} {
a_allocate;
w_writeCache;
wada_wakeUpAllDependentsAddr;
l_loadDoneMiss;
d_deallocateTBE;
pr_popResponseQueue;
}
// if a bypass request arrives back at the TCP, regardless of whether the line
// is in I (from the bypass request) or IV (from a subsequent non-bypassing
// load), retain the current state and complete the bypassing request.
transition({I, IV}, Bypass) {
rb_bypassDone;
pr_popResponseQueue;
}
transition(A, Bypass, I){
d_deallocateTBE;
wab_bypassDone;
pr_popResponseQueue;
}
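// Atomic response: install the returned data just long enough to complete
// the atomic, then invalidate the line (atomic results are not cached).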
transition(A, TCC_Ack, I) {TagArrayRead, DataArrayRead, DataArrayWrite} {
a_allocate;
w_writeCache;
ad_atomicDone;
ic_invCache;
wada_wakeUpAllDependentsAddr;
d_deallocateTBE;
pr_popResponseQueue;
}
transition(V, TCC_Ack, V) {TagArrayRead, DataArrayRead, DataArrayWrite} {
w_writeCache;
l_loadDoneHit;
pr_popResponseQueue;
}
transition({I, V}, Repl, I) {TagArrayRead, TagArrayWrite} {
ic_invCache;
}
transition({A}, Repl) {TagArrayRead, TagArrayWrite} {
ic_invCache;
}
// if a line with a pending load gets evicted, transition the line to I and
// invalidate it.
transition(IV, Repl, I) {TagArrayRead, TagArrayWrite} {
ic_invCache;
}
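// Flush allocates a staging entry and sends a WriteFlush to the TCC; the
// line waits in F until TCC_AckWB returns, then completes and invalidates.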
transition({V,I}, Flush, F) {TagArrayFlash} {
a_allocate;
sf_setFlush;
p_popMandatoryQueue;
}
transition({I, V}, Evict, I) {TagArrayFlash} {
inv_invDone;
ic_invCache;
p_popMandatoryQueue;
}
transition(A, Evict) {TagArrayFlash} {
inv_invDone;
p_popMandatoryQueue;
}
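// Directed L2 invalidate: in I, simply forward the invalidate to the TCC;
// in V, drop the local copy first. The InvL2Resp completion is passed back
// to the coalescer.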
transition(I, InvL2) {
il2_invL2;
p_popMandatoryQueue;
}
transition(V, InvL2, I) {
ic_invCache;
il2_invL2;
p_popMandatoryQueue;
}
transition(I, InvL2Resp) {
i2r_invL2Resp;
pr_popResponseQueue;
}
// if a line is in IV and a TCC_AckWB comes back, we must have had a WT
// store followed by a load. Thus, complete the store without affecting
// TBE or line state.
// TCC_AckWB only snoops TBE
transition({V, I, IV, A}, TCC_AckWB) {
wd_wtDone;
pr_popResponseQueue;
}
transition(F, TCC_AckWB, I) {
f_flushDone;
pr_popResponseQueue;
ic_invCache;
}
}