gem5/src/mem/ruby/protocol/GPU_VIPER-TCC.sm
Commit f6a453362f (Ranganath (Bujji) Selagamsetty): mem: Atomic ops to same address
This change augments the DataBlock class with a change-log structure
that records the effect of each atomic operation on a data block and
replays those effects when the atomic operations require return
values.

Although the operations are atomic, the coalescer need not send a
separate memory request for each one. Atomic operations within a
wavefront to the same address are now coalesced into a single memory
request. The response to this request carries all the information
needed to give each requesting lane a unique value as the result of
its individual atomic operation. This reduces contention for request
and response queues in simulation.

Previously, only the final value of the data block after all atomic
ops to the same address was visible to the requesting waves. This
change corrects that behavior by allowing each wave to see the effect
of its individual atomic op if a return value is necessary.

Change-Id: I639bea943afd317e45f8fa3bff7689f6b8df9395
2023-08-23 14:45:25 -05:00
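
The commit message above describes the change log in prose. As a minimal
illustration only (not the gem5 implementation), the C++ sketch below shows
one way such a log can yield a unique return value per coalesced atomic.
DataBlockSketch, applyAtomic, and numAtomicLogEntries are hypothetical names;
clearAtomicLogEntries mirrors the DataBlock method invoked in the protocol
file below.

#include <cstdint>
#include <cstring>
#include <vector>

class DataBlockSketch {
  public:
    static const int BlockSize = 64; // hypothetical cache-line size

    // Apply one atomic op to the block and log the pre-op bytes. Atomics
    // conventionally return the old value, so logging the bytes each op
    // observed lets one coalesced response give every lane its own result.
    template <typename Op>
    void applyAtomic(int offset, int size, Op op)
    {
        std::vector<uint8_t> entry(size);
        std::memcpy(entry.data(), &m_data[offset], size); // pre-op value
        m_atomicLog.push_back(entry);                     // one entry per op
        op(&m_data[offset]);                              // mutate in place
    }

    size_t numAtomicLogEntries() const { return m_atomicLog.size(); }
    void clearAtomicLogEntries() { m_atomicLog.clear(); }

  private:
    uint8_t m_data[BlockSize] = {};
    std::vector<std::vector<uint8_t>> m_atomicLog;
};

// Example: two coalesced 32-bit atomic adds to the same address. Each call
// logs the value it saw, so lane 0 would read back 0 and lane 1 would read 1.
// auto add1 = [](uint8_t *p) {
//     uint32_t v;
//     std::memcpy(&v, p, sizeof(v));
//     v += 1;
//     std::memcpy(p, &v, sizeof(v));
// };
// DataBlockSketch blk;
// blk.applyAtomic(0, 4, add1);
// blk.applyAtomic(0, 4, add1);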

/*
* Copyright (c) 2010-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Blake Hechtman
*/
machine(MachineType:TCC, "TCC Cache")
: CacheMemory * L2cache;
bool WB; /*is this cache Writeback?*/
Cycles l2_request_latency := 50;
Cycles l2_response_latency := 20;
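// Extra latency added to l2_response_latency when this TCC completes an
// atomic locally and responds (see ar_sendAtomicResponse below).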
Cycles glc_atomic_latency := 0;
// From the TCPs or SQCs
MessageBuffer * requestFromTCP, network="From", virtual_network="1", vnet_type="request";
// To the Cores. TCC deals only with TCPs/SQCs.
MessageBuffer * responseToCore, network="To", virtual_network="3", vnet_type="response";
// From the NB
MessageBuffer * probeFromNB, network="From", virtual_network="0", vnet_type="request";
MessageBuffer * responseFromNB, network="From", virtual_network="2", vnet_type="response";
// To the NB
MessageBuffer * requestToNB, network="To", virtual_network="0", vnet_type="request";
MessageBuffer * responseToNB, network="To", virtual_network="2", vnet_type="response";
MessageBuffer * unblockToNB, network="To", virtual_network="4", vnet_type="unblock";
MessageBuffer * triggerQueue;
{
// EVENTS
enumeration(Event, desc="TCC Events") {
// Requests coming from the Cores
RdBlk, desc="RdBlk event";
RdBypassEvict, desc="Bypass L2 on reads. Evict if cache block already allocated";
WrVicBlk, desc="L1 Write Through";
WrVicBlkBack, desc="L1 Write Through (dirty cache)";
WrVicBlkEvict, desc="L1 Write Through (dirty cache) and evict";
Atomic, desc="Atomic Op";
AtomicPassOn, desc="Atomic Op Passed on to Directory";
AtomicDone, desc="AtomicOps Complete";
AtomicNotDone, desc="AtomicOps not Complete";
Data, desc="data message";
// Coming from this TCC
L2_Repl, desc="L2 Replacement";
// Probes
PrbInv, desc="Invalidating probe";
// Coming from Memory Controller
WBAck, desc="writethrough ack from memory";
Bypass, desc="Bypass the entire L2 cache";
}
// STATES
state_declaration(State, desc="TCC State", default="TCC_State_I") {
M, AccessPermission:Read_Write, desc="Modified(dirty cache only)";
W, AccessPermission:Read_Write, desc="Written(dirty cache only)";
V, AccessPermission:Read_Only, desc="Valid";
I, AccessPermission:Invalid, desc="Invalid";
IV, AccessPermission:Busy, desc="Waiting for Data";
WI, AccessPermission:Busy, desc="Waiting on Writethrough Ack";
WIB, AccessPermission:Busy, desc="Waiting on Writethrough Ack; Will be Bypassed";
A, AccessPermission:Busy, desc="Invalid, waiting on atomic Data";
}
enumeration(RequestType, desc="To communicate stats from transitions to recordStats") {
DataArrayRead, desc="Read the data array";
DataArrayWrite, desc="Write the data array";
TagArrayRead, desc="Read the tag array";
TagArrayWrite, desc="Write the tag array";
}
// STRUCTURES
structure(Entry, desc="...", interface="AbstractCacheEntry") {
State CacheState, desc="cache state";
bool Dirty, desc="Is the data dirty (diff from memory?)";
DataBlock DataBlk, desc="Data for the block";
WriteMask writeMask, desc="Dirty byte mask";
}
structure(TBE, desc="...") {
State TBEState, desc="Transient state";
DataBlock DataBlk, desc="data for the block";
bool Dirty, desc="Is the data dirty?";
bool Shared, desc="Victim hit by shared probe";
MachineID From, desc="Waiting for writeback from...";
NetDest Destination, desc="Data destination";
int numAtomics, desc="number of remaining atomics";
int atomicDoneCnt, desc="number of AtomicDones triggered";
bool isGLCSet, desc="Bypass L1 Cache";
bool isSLCSet, desc="Bypass L1 and L2 Cache";
}
structure(TBETable, external="yes") {
TBE lookup(Addr);
void allocate(Addr);
void deallocate(Addr);
bool isPresent(Addr);
}
TBETable TBEs, template="<TCC_TBE>", constructor="m_number_of_TBEs";
void set_cache_entry(AbstractCacheEntry b);
void unset_cache_entry();
void set_tbe(TBE b);
void unset_tbe();
void wakeUpAllBuffers();
void wakeUpBuffers(Addr a);
void wakeUpAllBuffers(Addr a);
MachineID mapAddressToMachine(Addr addr, MachineType mtype);
// FUNCTION DEFINITIONS
Tick clockEdge();
Entry getCacheEntry(Addr addr), return_by_pointer="yes" {
return static_cast(Entry, "pointer", L2cache.lookup(addr));
}
DataBlock getDataBlock(Addr addr), return_by_ref="yes" {
return getCacheEntry(addr).DataBlk;
}
bool presentOrAvail(Addr addr) {
return L2cache.isTagPresent(addr) || L2cache.cacheAvail(addr);
}
State getState(TBE tbe, Entry cache_entry, Addr addr) {
if (is_valid(tbe)) {
return tbe.TBEState;
} else if (is_valid(cache_entry)) {
return cache_entry.CacheState;
}
return State:I;
}
void setState(TBE tbe, Entry cache_entry, Addr addr, State state) {
if (is_valid(tbe)) {
tbe.TBEState := state;
}
if (is_valid(cache_entry)) {
cache_entry.CacheState := state;
}
}
void functionalRead(Addr addr, Packet *pkt) {
TBE tbe := TBEs.lookup(addr);
if(is_valid(tbe)) {
testAndRead(addr, tbe.DataBlk, pkt);
} else {
functionalMemoryRead(pkt);
}
}
int functionalWrite(Addr addr, Packet *pkt) {
int num_functional_writes := 0;
TBE tbe := TBEs.lookup(addr);
if(is_valid(tbe)) {
num_functional_writes := num_functional_writes +
testAndWrite(addr, tbe.DataBlk, pkt);
}
num_functional_writes := num_functional_writes +
functionalMemoryWrite(pkt);
return num_functional_writes;
}
AccessPermission getAccessPermission(Addr addr) {
TBE tbe := TBEs.lookup(addr);
if(is_valid(tbe)) {
return TCC_State_to_permission(tbe.TBEState);
}
Entry cache_entry := getCacheEntry(addr);
if(is_valid(cache_entry)) {
return TCC_State_to_permission(cache_entry.CacheState);
}
return AccessPermission:NotPresent;
}
void setAccessPermission(Entry cache_entry, Addr addr, State state) {
if (is_valid(cache_entry)) {
cache_entry.changePermission(TCC_State_to_permission(state));
}
}
void recordRequestType(RequestType request_type, Addr addr) {
if (request_type == RequestType:DataArrayRead) {
L2cache.recordRequestType(CacheRequestType:DataArrayRead, addr);
} else if (request_type == RequestType:DataArrayWrite) {
L2cache.recordRequestType(CacheRequestType:DataArrayWrite, addr);
} else if (request_type == RequestType:TagArrayRead) {
L2cache.recordRequestType(CacheRequestType:TagArrayRead, addr);
} else if (request_type == RequestType:TagArrayWrite) {
L2cache.recordRequestType(CacheRequestType:TagArrayWrite, addr);
}
}
bool checkResourceAvailable(RequestType request_type, Addr addr) {
if (request_type == RequestType:DataArrayRead) {
return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
} else if (request_type == RequestType:DataArrayWrite) {
return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
} else if (request_type == RequestType:TagArrayRead) {
return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
} else if (request_type == RequestType:TagArrayWrite) {
return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
} else {
error("Invalid RequestType type in checkResourceAvailable");
return true;
}
}
// ** OUT_PORTS **
// Three classes of ports
// Class 1: downward facing network links to NB
out_port(requestToNB_out, CPURequestMsg, requestToNB);
out_port(responseToNB_out, ResponseMsg, responseToNB);
out_port(unblockToNB_out, UnblockMsg, unblockToNB);
// Class 2: upward facing ports to GPU cores
out_port(responseToCore_out, ResponseMsg, responseToCore);
out_port(triggerQueue_out, TriggerMsg, triggerQueue);
//
// request queue going to NB
//
// ** IN_PORTS **
in_port(triggerQueue_in, TriggerMsg, triggerQueue) {
if (triggerQueue_in.isReady(clockEdge())) {
peek(triggerQueue_in, TriggerMsg) {
TBE tbe := TBEs.lookup(in_msg.addr);
Entry cache_entry := getCacheEntry(in_msg.addr);
// There is a possible race where multiple AtomicDone triggers can be
// sent if another Atomic to the same address is issued after the
// AtomicDone is triggered but before the message arrives here. For
// that case we count the number of AtomicDones in flight for this
// address and only call AtomicDone to deallocate the TBE when it is
// the last in flight message.
if (tbe.numAtomics == 0 && tbe.atomicDoneCnt == 1) {
trigger(Event:AtomicDone, in_msg.addr, cache_entry, tbe);
} else {
trigger(Event:AtomicNotDone, in_msg.addr, cache_entry, tbe);
}
}
}
}
in_port(responseFromNB_in, ResponseMsg, responseFromNB) {
if (responseFromNB_in.isReady(clockEdge())) {
peek(responseFromNB_in, ResponseMsg, block_on="addr") {
TBE tbe := TBEs.lookup(in_msg.addr);
Entry cache_entry := getCacheEntry(in_msg.addr);
bool is_slc_set := false;
if (!is_invalid(tbe)) {
is_slc_set := tbe.isSLCSet;
}
// Whether the SLC bit is set or not, WB acks should invoke the
// WBAck event. For cases where a read response will follow a
// WBAck (a read bypass evict on a dirty line), the line's TBE
// will not be deallocated on WBAck, and the SLC bit will be
// checked when the read response is received.
if (in_msg.Type == CoherenceResponseType:NBSysWBAck) {
trigger(Event:WBAck, in_msg.addr, cache_entry, tbe);
} else if (is_slc_set) {
// If the SLC bit is set, the response needs to bypass the cache
// and should not be allocated an entry.
trigger(Event:Bypass, in_msg.addr, cache_entry, tbe);
} else if (in_msg.Type == CoherenceResponseType:NBSysResp) {
if(presentOrAvail(in_msg.addr)) {
trigger(Event:Data, in_msg.addr, cache_entry, tbe);
} else {
Addr victim := L2cache.cacheProbe(in_msg.addr);
trigger(Event:L2_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
}
} else {
error("Unexpected Response Message to Core");
}
}
}
}
// Finally handling incoming requests (from TCP) and probes (from NB).
in_port(probeNetwork_in, NBProbeRequestMsg, probeFromNB) {
if (probeNetwork_in.isReady(clockEdge())) {
peek(probeNetwork_in, NBProbeRequestMsg) {
DPRINTF(RubySlicc, "%s\n", in_msg);
Entry cache_entry := getCacheEntry(in_msg.addr);
TBE tbe := TBEs.lookup(in_msg.addr);
trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe);
}
}
}
in_port(coreRequestNetwork_in, CPURequestMsg, requestFromTCP, rank=0) {
if (coreRequestNetwork_in.isReady(clockEdge())) {
peek(coreRequestNetwork_in, CPURequestMsg) {
TBE tbe := TBEs.lookup(in_msg.addr);
Entry cache_entry := getCacheEntry(in_msg.addr);
if (in_msg.Type == CoherenceRequestType:WriteThrough) {
if (in_msg.isSLCSet) {
// The request should bypass the cache if SLC bit is set.
// If the cache entry exists already, then evict it.
// Else, perform a normal cache access.
// The cache entry is allocated only on response and bypass is
// handled there
if(presentOrAvail(in_msg.addr)) {
trigger(Event:WrVicBlkEvict, in_msg.addr, cache_entry, tbe);
} else {
trigger(Event:WrVicBlk, in_msg.addr, cache_entry, tbe);
}
} else if(WB) {
if(presentOrAvail(in_msg.addr)) {
trigger(Event:WrVicBlkBack, in_msg.addr, cache_entry, tbe);
} else {
Addr victim := L2cache.cacheProbe(in_msg.addr);
trigger(Event:L2_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
}
} else {
trigger(Event:WrVicBlk, in_msg.addr, cache_entry, tbe);
}
} else if (in_msg.Type == CoherenceRequestType:Atomic) {
// If the request is system-level, or if the address isn't in the cache,
// then send the request to the directory. Since non-SLC atomics won't be
// performed by the directory, TCC will perform the atomic on the return path
// on Event:Data.
// The action will invalidate the cache line if SLC is set and the address is
// in the cache.
if(in_msg.isSLCSet || !presentOrAvail(in_msg.addr)) {
trigger(Event:AtomicPassOn, in_msg.addr, cache_entry, tbe);
} else {
trigger(Event:Atomic, in_msg.addr, cache_entry, tbe);
}
} else if (in_msg.Type == CoherenceRequestType:RdBlk) {
if (in_msg.isSLCSet) {
// If SLC bit is set, the request needs to go directly to memory.
// If a cache block already exists, then evict it.
trigger(Event:RdBypassEvict, in_msg.addr, cache_entry, tbe);
} else {
trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe);
}
} else {
DPRINTF(RubySlicc, "%s\n", in_msg);
error("Unexpected Response Message to Core");
}
}
}
}
// BEGIN ACTIONS
action(i_invL2, "i", desc="invalidate TCC cache block") {
if (is_valid(cache_entry)) {
L2cache.deallocate(address);
}
unset_cache_entry();
}
action(sd_sendData, "sd", desc="send Shared response") {
peek(coreRequestNetwork_in, CPURequestMsg) {
enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
out_msg.addr := address;
out_msg.Type := CoherenceResponseType:TDSysResp;
out_msg.Sender := machineID;
out_msg.Destination.add(in_msg.Requestor);
out_msg.DataBlk := cache_entry.DataBlk;
out_msg.MessageSize := MessageSizeType:Response_Data;
out_msg.Dirty := false;
out_msg.State := CoherenceState:Shared;
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
DPRINTF(RubySlicc, "%s\n", out_msg);
}
}
}
action(sdr_sendDataResponse, "sdr", desc="send Shared response") {
enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
out_msg.addr := address;
out_msg.Type := CoherenceResponseType:TDSysResp;
out_msg.Sender := machineID;
out_msg.Destination := tbe.Destination;
out_msg.DataBlk := cache_entry.DataBlk;
out_msg.MessageSize := MessageSizeType:Response_Data;
out_msg.Dirty := false;
out_msg.State := CoherenceState:Shared;
DPRINTF(RubySlicc, "%s\n", out_msg);
peek(responseFromNB_in, ResponseMsg) {
out_msg.isGLCSet := tbe.isGLCSet;
out_msg.isSLCSet := tbe.isSLCSet;
}
}
enqueue(unblockToNB_out, UnblockMsg, 1) {
out_msg.addr := address;
out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory));
out_msg.MessageSize := MessageSizeType:Unblock_Control;
peek(responseFromNB_in, ResponseMsg) {
out_msg.isGLCSet := tbe.isGLCSet;
out_msg.isSLCSet := tbe.isSLCSet;
}
DPRINTF(RubySlicc, "%s\n", out_msg);
}
}
action(rb_bypassDone, "rb", desc="bypass L2 for read response") {
peek(responseFromNB_in, ResponseMsg) {
enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
out_msg.addr := address;
out_msg.Type := CoherenceResponseType:TDSysResp;
out_msg.Sender := machineID;
out_msg.Destination := tbe.Destination;
out_msg.DataBlk := in_msg.DataBlk;
out_msg.MessageSize := MessageSizeType:Response_Data;
out_msg.Dirty := false;
out_msg.State := CoherenceState:Shared;
out_msg.isGLCSet := tbe.isGLCSet;
out_msg.isSLCSet := tbe.isSLCSet;
DPRINTF(RubySlicc, "%s\n", out_msg);
}
enqueue(unblockToNB_out, UnblockMsg, 1) {
out_msg.addr := address;
out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory));
out_msg.MessageSize := MessageSizeType:Unblock_Control;
DPRINTF(RubySlicc, "%s\n", out_msg);
}
}
}
action(rd_requestData, "r", desc="Miss in L2, pass on") {
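// Only the first coalesced requestor reaches the directory: t_allocateTBE
// adds each requestor to tbe.Destination, so Destination.count() == 1 only
// for the request that allocated the TBE. Later requests to the same
// address simply join the destination set and share the single response.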
if(tbe.Destination.count()==1){
peek(coreRequestNetwork_in, CPURequestMsg) {
enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) {
out_msg.addr := address;
out_msg.Type := in_msg.Type;
out_msg.Requestor := machineID;
out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory));
out_msg.Shared := false; // unneeded for this request
out_msg.MessageSize := in_msg.MessageSize;
out_msg.isGLCSet := tbe.isGLCSet;
out_msg.isSLCSet := tbe.isSLCSet;
DPRINTF(RubySlicc, "%s\n", out_msg);
}
}
}
}
action(w_sendResponseWBAck, "w", desc="send WB Ack") {
peek(responseFromNB_in, ResponseMsg) {
enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
out_msg.addr := address;
out_msg.Type := CoherenceResponseType:TDSysWBAck;
out_msg.Destination.clear();
out_msg.Destination.add(in_msg.WTRequestor);
out_msg.Sender := machineID;
out_msg.MessageSize := MessageSizeType:Writeback_Control;
out_msg.instSeqNum := in_msg.instSeqNum;
}
}
}
action(swb_sendWBAck, "swb", desc="send WB Ack") {
peek(coreRequestNetwork_in, CPURequestMsg) {
enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
out_msg.addr := address;
out_msg.Type := CoherenceResponseType:TDSysWBAck;
out_msg.Destination.clear();
out_msg.Destination.add(in_msg.Requestor);
out_msg.Sender := machineID;
out_msg.MessageSize := MessageSizeType:Writeback_Control;
out_msg.instSeqNum := in_msg.instSeqNum;
}
}
}
action(ar_sendAtomicResponse, "ar", desc="send Atomic Ack") {
peek(coreRequestNetwork_in, CPURequestMsg) {
enqueue(responseToCore_out, ResponseMsg, l2_response_latency + glc_atomic_latency, true) {
out_msg.addr := address;
out_msg.Type := CoherenceResponseType:TDSysResp;
out_msg.Destination.clear();
out_msg.Destination.add(in_msg.Requestor);
out_msg.Sender := machineID;
out_msg.MessageSize := MessageSizeType:Response_Data;
out_msg.DataBlk := cache_entry.DataBlk;
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
}
}
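// The DataBlk copied into the response above carries the atomic change
// log, which provides each requesting lane its own return value; the log
// can be cleared once the response is enqueued.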
cache_entry.DataBlk.clearAtomicLogEntries();
}
action(bar_sendBypassedAtomicResponse, "bar", desc="send bypassed Atomic Ack") {
peek(responseFromNB_in, ResponseMsg) {
enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
out_msg.addr := address;
out_msg.Type := CoherenceResponseType:TDSysResp;
out_msg.Destination.add(in_msg.WTRequestor);
out_msg.Sender := machineID;
out_msg.MessageSize := in_msg.MessageSize;
out_msg.DataBlk := in_msg.DataBlk;
out_msg.isGLCSet := tbe.isGLCSet;
out_msg.isSLCSet := tbe.isSLCSet;
}
}
}
action(a_allocateBlock, "a", desc="allocate TCC block") {
if (is_invalid(cache_entry)) {
set_cache_entry(L2cache.allocate(address, new Entry));
cache_entry.writeMask.clear();
}
}
action(p_profileMiss, "pm", desc="Profile cache miss") {
L2cache.profileDemandMiss();
}
action(p_profileHit, "ph", desc="Profile cache hit") {
L2cache.profileDemandHit();
}
action(t_allocateTBE, "t", desc="allocate TBE Entry") {
if (is_invalid(tbe)) {
check_allocate(TBEs);
TBEs.allocate(address);
set_tbe(TBEs.lookup(address));
tbe.Destination.clear();
tbe.numAtomics := 0;
tbe.atomicDoneCnt := 0;
}
if (coreRequestNetwork_in.isReady(clockEdge())) {
peek(coreRequestNetwork_in, CPURequestMsg) {
if(in_msg.Type == CoherenceRequestType:RdBlk || in_msg.Type == CoherenceRequestType:Atomic){
tbe.Destination.add(in_msg.Requestor);
}
tbe.isGLCSet := in_msg.isGLCSet;
tbe.isSLCSet := in_msg.isSLCSet;
}
}
}
action(dt_deallocateTBE, "dt", desc="Deallocate TBE entry") {
tbe.Destination.clear();
TBEs.deallocate(address);
unset_tbe();
}
action(wcb_writeCacheBlock, "wcb", desc="write data to TCC") {
peek(responseFromNB_in, ResponseMsg) {
cache_entry.DataBlk := in_msg.DataBlk;
DPRINTF(RubySlicc, "Writing to TCC: %s\n", in_msg);
}
}
action(wdb_writeDirtyBytes, "wdb", desc="write dirty bytes to TCC") {
peek(coreRequestNetwork_in, CPURequestMsg) {
cache_entry.DataBlk.copyPartial(in_msg.DataBlk,in_msg.writeMask);
cache_entry.writeMask.orMask(in_msg.writeMask);
DPRINTF(RubySlicc, "Writing to TCC: %s\n", in_msg);
}
}
action(wt_writeThrough, "wt", desc="write through data") {
peek(coreRequestNetwork_in, CPURequestMsg) {
enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) {
out_msg.addr := address;
out_msg.Requestor := machineID;
out_msg.WTRequestor := in_msg.Requestor;
out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory));
out_msg.MessageSize := MessageSizeType:Data;
out_msg.Type := CoherenceRequestType:WriteThrough;
out_msg.Dirty := true;
out_msg.DataBlk := in_msg.DataBlk;
out_msg.writeMask.orMask(in_msg.writeMask);
out_msg.instSeqNum := in_msg.instSeqNum;
}
}
}
action(wb_writeBack, "wb", desc="write back data") {
enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) {
out_msg.addr := address;
out_msg.Requestor := machineID;
out_msg.WTRequestor := machineID;
out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory));
out_msg.MessageSize := MessageSizeType:Data;
out_msg.Type := CoherenceRequestType:WriteThrough;
out_msg.Dirty := true;
out_msg.DataBlk := cache_entry.DataBlk;
out_msg.writeMask.orMask(cache_entry.writeMask);
}
}
action(at_atomicThrough, "at", desc="send atomic through to directory") {
peek(coreRequestNetwork_in, CPURequestMsg) {
enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) {
out_msg.addr := address;
out_msg.Requestor := machineID;
out_msg.WTRequestor := in_msg.Requestor;
out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory));
out_msg.MessageSize := MessageSizeType:Data;
out_msg.Type := CoherenceRequestType:Atomic;
out_msg.Dirty := true;
out_msg.writeMask.orMask(in_msg.writeMask);
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") {
enqueue(responseToNB_out, ResponseMsg, 1) {
out_msg.addr := address;
out_msg.Type := CoherenceResponseType:CPUPrbResp; // TCC, L3 respond in same way to probes
out_msg.Sender := machineID;
out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory));
out_msg.Dirty := false;
out_msg.Hit := false;
out_msg.Ntsl := true;
out_msg.State := CoherenceState:NA;
out_msg.MessageSize := MessageSizeType:Response_Control;
}
}
action(ut_updateTag, "ut", desc="update Tag (i.e. set MRU)") {
L2cache.setMRU(address);
}
action(p_popRequestQueue, "p", desc="pop request queue") {
coreRequestNetwork_in.dequeue(clockEdge());
}
action(pr_popResponseQueue, "pr", desc="pop response queue") {
responseFromNB_in.dequeue(clockEdge());
}
action(pp_popProbeQueue, "pp", desc="pop probe queue") {
probeNetwork_in.dequeue(clockEdge());
}
action(st_stallAndWaitRequest, "st", desc="Stall and wait on the address") {
stall_and_wait(coreRequestNetwork_in, address);
}
action(wada_wakeUpAllDependentsAddr, "wada", desc="Wake up any requests waiting for this address") {
wakeUpAllBuffers(address);
}
action(z_stall, "z", desc="stall") {
// built-in
}
action(ina_incrementNumAtomics, "ina", desc="inc num atomics") {
tbe.numAtomics := tbe.numAtomics + 1;
}
action(dna_decrementNumAtomics, "dna", desc="dec num atomics") {
tbe.numAtomics := tbe.numAtomics - 1;
if (tbe.numAtomics==0) {
enqueue(triggerQueue_out, TriggerMsg, 1) {
tbe.atomicDoneCnt := tbe.atomicDoneCnt + 1;
out_msg.addr := address;
out_msg.Type := TriggerType:AtomicDone;
peek(responseFromNB_in, ResponseMsg) {
out_msg.isGLCSet := tbe.isGLCSet;
out_msg.isSLCSet := tbe.isSLCSet;
}
}
}
}
action(dadc_decrementAtomicDoneCnt, "dadc", desc="decrement atomic done count") {
tbe.atomicDoneCnt := tbe.atomicDoneCnt - 1;
}
action(ptr_popTriggerQueue, "ptr", desc="pop Trigger") {
triggerQueue_in.dequeue(clockEdge());
}
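// atomicPartial applies the coalesced atomic ops selected by the write
// mask to the data block, recording their effects in the block's atomic
// change log so per-lane return values can be serviced (see the commit
// message above).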
action(pa_performAtomic, "pa", desc="Perform atomic") {
cache_entry.DataBlk.atomicPartial(cache_entry.DataBlk, cache_entry.writeMask);
}
// END ACTIONS
// BEGIN TRANSITIONS
// transitions from base
// Assumptions for ArrayRead/Write
// TBE checked before tags
// Data Read/Write requires Tag Read
// Stalling transitions do NOT check the tag array...and if they do,
// they can cause a resource stall deadlock!
transition(WI, {RdBlk, WrVicBlk, Atomic, AtomicPassOn, WrVicBlkBack}) { //TagArrayRead} {
// by putting the stalled requests in a buffer, we reduce resource contention
// since they won't try again every cycle and will instead only try again once
// woken up
st_stallAndWaitRequest;
}
transition(WIB, {RdBlk, WrVicBlk, Atomic, WrVicBlkBack}) { //TagArrayRead} {
// by putting the stalled requests in a buffer, we reduce resource contention
// since they won't try again every cycle and will instead only try again once
// woken up
st_stallAndWaitRequest;
}
transition(A, {RdBlk, WrVicBlk, WrVicBlkBack}) { //TagArrayRead} {
// by putting the stalled requests in a buffer, we reduce resource contention
// since they won't try again every cycle and will instead only try again once
// woken up
st_stallAndWaitRequest;
}
transition(IV, {WrVicBlk, Atomic, AtomicPassOn, WrVicBlkBack}) { //TagArrayRead} {
// by putting the stalled requests in a buffer, we reduce resource contention
// since they won't try again every cycle and will instead only try again once
// woken up
st_stallAndWaitRequest;
}
transition({M, V}, RdBlk) {TagArrayRead, DataArrayRead} {
p_profileHit;
sd_sendData;
ut_updateTag;
p_popRequestQueue;
}
transition(W, RdBlk, WI) {TagArrayRead, DataArrayRead} {
t_allocateTBE;
wb_writeBack;
// need to try this request again after writing back the current entry -- to
// do so, put it with other stalled requests in a buffer to reduce resource
// contention since they won't try again every cycle and will instead only
// try again once woken up
st_stallAndWaitRequest;
}
transition(I, RdBlk, IV) {TagArrayRead} {
p_profileMiss;
t_allocateTBE;
rd_requestData;
p_popRequestQueue;
}
transition(IV, RdBlk) {
p_profileMiss;
t_allocateTBE;
rd_requestData;
p_popRequestQueue;
}
transition(I, RdBypassEvict) {TagArrayRead} {
p_profileMiss;
t_allocateTBE;
rd_requestData;
p_popRequestQueue;
}
// Transition to be called when a read request with SLC flag set arrives at
// entry in state W. It evicts and invalidates the cache entry before
// forwarding the request to global memory
transition(W, RdBypassEvict, WIB) {TagArrayRead} {
p_profileMiss;
t_allocateTBE;
wb_writeBack;
i_invL2;
rd_requestData;
p_popRequestQueue;
}
// Transition to be called when a read request with SLC flag set arrives at
// entry in state M. It evicts and invalidates the cache entry before
// forwarding the request to global memory.
transition(M, RdBypassEvict, WIB) {TagArrayRead} {
p_profileMiss;
t_allocateTBE;
wb_writeBack;
i_invL2;
rd_requestData;
p_popRequestQueue;
}
// Transition to be called when a read request with SLC flag set arrives at
// entry in state V. It invalidates the cache entry before forwarding the
// request to global memory.
transition(V, RdBypassEvict, I) {TagArrayRead} {
p_profileMiss;
t_allocateTBE;
i_invL2;
rd_requestData;
p_popRequestQueue;
}
// Transition to be called when a read request with SLC flag arrives at entry
// in transient state. The request stalls until the pending transition is complete.
transition({WI, WIB, IV}, RdBypassEvict) {
st_stallAndWaitRequest;
}
transition(V, Atomic, M) {TagArrayRead, DataArrayWrite} {
p_profileHit;
wdb_writeDirtyBytes;
pa_performAtomic;
ar_sendAtomicResponse;
p_popRequestQueue;
}
transition(I, Atomic, M) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
p_profileMiss;
a_allocateBlock;
ut_updateTag;
wdb_writeDirtyBytes;
pa_performAtomic;
ar_sendAtomicResponse;
p_popRequestQueue;
}
transition(A, Atomic) {
p_profileMiss;
// by putting the stalled requests in a buffer, we reduce resource contention
// since they won't try again every cycle and will instead only try again once
// woken up
st_stallAndWaitRequest;
}
transition({M, W}, Atomic) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
p_profileHit;
wdb_writeDirtyBytes;
pa_performAtomic;
ar_sendAtomicResponse;
p_popRequestQueue;
}
// The following atomic pass-on actions send the request to the directory.
// They are triggered when an atomic request misses in the TCC or has the
// SLC bit set.
transition(V, AtomicPassOn, A) {TagArrayRead} {
p_profileHit;
i_invL2;
t_allocateTBE;
at_atomicThrough;
ina_incrementNumAtomics;
p_popRequestQueue;
}
transition(I, AtomicPassOn, A) {TagArrayRead} {
p_profileMiss;
i_invL2;
t_allocateTBE;
at_atomicThrough;
ina_incrementNumAtomics;
p_popRequestQueue;
}
transition(A, AtomicPassOn) {
p_profileMiss;
// by putting the stalled requests in a buffer, we reduce resource contention
// since they won't try again every cycle and will instead only try again once
// woken up
st_stallAndWaitRequest;
}
transition({M, W}, AtomicPassOn, WI) {TagArrayRead} {
t_allocateTBE;
wb_writeBack;
// after writing back the current line, we need to wait for it to be done
// before we try to perform the atomic
// by putting the stalled requests in a buffer, we reduce resource contention
// since they won't try again every cycle and will instead only try again once
// woken up
st_stallAndWaitRequest;
}
transition(I, WrVicBlk) {TagArrayRead} {
p_profileMiss;
wt_writeThrough;
p_popRequestQueue;
}
transition(V, WrVicBlk) {TagArrayRead, DataArrayWrite} {
p_profileHit;
ut_updateTag;
wdb_writeDirtyBytes;
wt_writeThrough;
p_popRequestQueue;
}
transition({V, M}, WrVicBlkBack, M) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
p_profileHit;
ut_updateTag;
swb_sendWBAck;
wdb_writeDirtyBytes;
p_popRequestQueue;
}
transition(W, WrVicBlkBack) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
p_profileHit;
ut_updateTag;
swb_sendWBAck;
wdb_writeDirtyBytes;
p_popRequestQueue;
}
transition(I, WrVicBlkBack, W) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
p_profileMiss;
a_allocateBlock;
ut_updateTag;
swb_sendWBAck;
wdb_writeDirtyBytes;
p_popRequestQueue;
}
// Transition to be called when a write request with SLC bit set arrives at an
// entry with state V. The entry has to be evicted and invalidated before the
// request is forwarded to global memory
transition(V, WrVicBlkEvict, I) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
p_profileMiss;
ut_updateTag;
t_allocateTBE;
wt_writeThrough;
i_invL2;
p_popRequestQueue;
}
// Transition to be called when a write request with SLC bit set arrives at an
// entry with state W. The entry has to be evicted and invalidated before the
// request is forwarded to global memory.
transition(W, WrVicBlkEvict, I) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
p_profileMiss;
ut_updateTag;
wdb_writeDirtyBytes;
t_allocateTBE;
wb_writeBack;
i_invL2;
p_popRequestQueue;
}
transition({W, M}, L2_Repl, WI) {TagArrayRead, DataArrayRead} {
t_allocateTBE;
wb_writeBack;
i_invL2;
}
transition({I, V}, L2_Repl, I) {TagArrayRead, TagArrayWrite} {
i_invL2;
}
transition({A, IV, WI, WIB}, L2_Repl) {
i_invL2;
}
transition({I, V}, PrbInv, I) {TagArrayRead, TagArrayWrite} {
pi_sendProbeResponseInv;
pp_popProbeQueue;
}
transition(M, PrbInv, W) {TagArrayRead, TagArrayWrite} {
pi_sendProbeResponseInv;
pp_popProbeQueue;
}
transition(W, PrbInv) {TagArrayRead} {
pi_sendProbeResponseInv;
pp_popProbeQueue;
}
transition({A, IV, WI, WIB}, PrbInv) {
pi_sendProbeResponseInv;
pp_popProbeQueue;
}
// Transition to be called when the response for a request with SLC bit set
// arrives. The request has to be forwarded to the core that needs it while
// making sure no entry is allocated.
transition(I, Bypass, I) {
rb_bypassDone;
pr_popResponseQueue;
wada_wakeUpAllDependentsAddr;
dt_deallocateTBE;
}
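// One bypassed atomic response is sent per pending atomic; the TBE is
// deallocated only by the AtomicDone trigger once numAtomics reaches zero.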
transition(A, Bypass) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
bar_sendBypassedAtomicResponse;
dna_decrementNumAtomics;
pr_popResponseQueue;
}
transition(WI, Bypass, I) {
pr_popResponseQueue;
wada_wakeUpAllDependentsAddr;
dt_deallocateTBE;
}
transition(IV, Data, V) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
a_allocateBlock;
ut_updateTag;
wcb_writeCacheBlock;
sdr_sendDataResponse;
pr_popResponseQueue;
wada_wakeUpAllDependentsAddr;
dt_deallocateTBE;
}
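// A non-SLC atomic that was passed on: the directory returns the line
// without performing the atomic, so the TCC performs it locally on the
// return path (see the comment in coreRequestNetwork_in).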
transition(A, Data, M) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
a_allocateBlock;
pa_performAtomic;
bar_sendBypassedAtomicResponse;
dna_decrementNumAtomics;
pr_popResponseQueue;
}
transition(A, AtomicDone, I) {TagArrayRead, TagArrayWrite} {
dt_deallocateTBE;
wada_wakeUpAllDependentsAddr;
ptr_popTriggerQueue;
}
transition(A, AtomicNotDone) {TagArrayRead} {
dadc_decrementAtomicDoneCnt;
ptr_popTriggerQueue;
}
// M, W should not see WBAck as the cache is in WB mode
// WBAcks do not need to check tags
transition({I, V, IV, A}, WBAck) {
w_sendResponseWBAck;
pr_popResponseQueue;
}
transition(WI, WBAck, I) {
dt_deallocateTBE;
wada_wakeUpAllDependentsAddr;
pr_popResponseQueue;
}
transition(WIB, WBAck, I) {
pr_popResponseQueue;
}
}