gpu-compute,mem-ruby: Replace ACQUIRE and RELEASE request flags
This patch replaces the ACQUIRE and RELEASE request flags, which are HSA-specific. The ACQUIRE flag becomes INV_L1 in the VIPER protocol, and the RELEASE flag is removed. Future protocols may support additional cache coherence flags such as INV_L2 and WB_L2. Change-Id: I3d60c9d3625c898f4110a12d81742b6822728533 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/32859 Reviewed-by: Jason Lowe-Power <power.jg@gmail.com> Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com> Maintainer: Matt Sinclair <mattdsinclair@gmail.com> Tested-by: kokoro <noreply+kokoro@google.com>
This commit is contained in:
@@ -232,7 +232,7 @@ GpuWavefront::issueAcquireOp()
|
||||
threadId, nullptr);
|
||||
acq_req->setPaddr(0);
|
||||
acq_req->setReqInstSeqNum(tester->getActionSeqNum());
|
||||
acq_req->setFlags(Request::ACQUIRE);
|
||||
acq_req->setCacheCoherenceFlags(Request::INV_L1);
|
||||
// set protocol-specific flags
|
||||
setExtraRequestFlags(acq_req);
|
||||
|
||||
|
||||
@@ -805,9 +805,9 @@ ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt)
|
||||
// here (simdId=-1, wfSlotId=-1)
|
||||
if (gpuDynInst->isKernelLaunch()) {
|
||||
// for kernel launch, the original request must be both kernel-type
|
||||
// and acquire
|
||||
// and INV_L1
|
||||
assert(pkt->req->isKernel());
|
||||
assert(pkt->req->isAcquire());
|
||||
assert(pkt->req->isInvL1());
|
||||
|
||||
// one D-Cache inv is done, decrement counter
|
||||
dispatcher.updateInvCounter(gpuDynInst->kern_id);
|
||||
@@ -820,16 +820,19 @@ ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt)
|
||||
// retrieve wavefront from inst
|
||||
Wavefront *w = gpuDynInst->wavefront();
|
||||
|
||||
// Check if we are waiting on Kernel End Release
|
||||
// Check if we are waiting on Kernel End Flush
|
||||
if (w->getStatus() == Wavefront::S_RETURNING
|
||||
&& gpuDynInst->isEndOfKernel()) {
|
||||
// for kernel end, the original request must be both kernel-type
|
||||
// and release
|
||||
// and last-level GPU cache should be flushed if it contains
|
||||
// dirty data. This request may have been quiesced and
|
||||
// immediately responded to if the GL2 is a write-through /
|
||||
// read-only cache.
|
||||
assert(pkt->req->isKernel());
|
||||
assert(pkt->req->isRelease());
|
||||
assert(pkt->req->isGL2CacheFlush());
|
||||
|
||||
// one wb done, decrement counter, and return whether all wbs are
|
||||
// done for the kernel
|
||||
// once flush done, decrement counter, and return whether all
|
||||
// dirty writeback operations are done for the kernel
|
||||
bool isWbDone = dispatcher.updateWbCounter(gpuDynInst->kern_id);
|
||||
|
||||
// not all wbs are done for the kernel, just release pkt
|
||||
@@ -1218,7 +1221,7 @@ ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
|
||||
|
||||
if (kernelMemSync) {
|
||||
if (gpuDynInst->isKernelLaunch()) {
|
||||
req->setCacheCoherenceFlags(Request::ACQUIRE);
|
||||
req->setCacheCoherenceFlags(Request::INV_L1);
|
||||
req->setReqInstSeqNum(gpuDynInst->seqNum());
|
||||
req->setFlags(Request::KERNEL);
|
||||
pkt = new Packet(req, MemCmd::MemSyncReq);
|
||||
@@ -1234,11 +1237,12 @@ ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
|
||||
|
||||
schedule(mem_req_event, curTick() + req_tick_latency);
|
||||
} else {
|
||||
// kernel end release must be enabled
|
||||
// kernel end flush of GL2 cache may be quiesced by Ruby if the
|
||||
// GL2 is a read-only cache
|
||||
assert(shader->impl_kern_end_rel);
|
||||
assert(gpuDynInst->isEndOfKernel());
|
||||
|
||||
req->setCacheCoherenceFlags(Request::WB_L2);
|
||||
req->setCacheCoherenceFlags(Request::FLUSH_L2);
|
||||
req->setReqInstSeqNum(gpuDynInst->seqNum());
|
||||
req->setFlags(Request::KERNEL);
|
||||
pkt = new Packet(req, MemCmd::MemSyncReq);
|
||||
|
||||
@@ -306,7 +306,7 @@ class GPUDynInst : public GPUExecContext
|
||||
assert(!isEndOfKernel());
|
||||
|
||||
// must be wbinv inst if not kernel launch/end
|
||||
req->setCacheCoherenceFlags(Request::ACQUIRE);
|
||||
req->setCacheCoherenceFlags(Request::INV_L1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -260,30 +260,36 @@ class Request
|
||||
typedef ::Flags<CacheCoherenceFlagsType> CacheCoherenceFlags;
|
||||
|
||||
/**
|
||||
* These bits are used to set the coherence policy
|
||||
* for the GPU and are encoded in the GCN3 instructions.
|
||||
* See the AMD GCN3 ISA Architecture Manual for more
|
||||
* details.
|
||||
* These bits are used to set the coherence policy for the GPU and are
|
||||
* encoded in the GCN3 instructions. The GCN3 ISA defines two cache levels
|
||||
* See the AMD GCN3 ISA Architecture Manual for more details.
|
||||
*
|
||||
* INV_L1: L1 cache invalidation
|
||||
* WB_L2: L2 cache writeback
|
||||
* FLUSH_L2: L2 cache flush
|
||||
*
|
||||
* SLC: System Level Coherent. Accesses are forced to miss in
|
||||
* the L2 cache and are coherent with system memory.
|
||||
* Invalidation means to simply discard all cache contents. This can be
|
||||
* done in the L1 since it is implemented as a write-through cache and
|
||||
* there are other copies elsewhere in the hierarchy.
|
||||
*
|
||||
* GLC: Globally Coherent. Controls how reads and writes are
|
||||
* handled by the L1 cache. Global here refers to the
|
||||
* data being visible globally on the GPU (i.e., visible
|
||||
* to all WGs).
|
||||
* For flush the contents of the cache need to be written back to memory
|
||||
* when dirty and can be discarded otherwise. This operation is more
|
||||
* involved than invalidation and therefore we do not flush caches with
|
||||
* redundant copies of data.
|
||||
*
|
||||
* For atomics, the GLC bit is used to distinguish between
|
||||
* between atomic return/no-return operations.
|
||||
* SLC: System Level Coherent. Accesses are forced to miss in the L2 cache
|
||||
* and are coherent with system memory.
|
||||
*
|
||||
* GLC: Globally Coherent. Controls how reads and writes are handled by
|
||||
* the L1 cache. Global here refers to the data being visible
|
||||
* globally on the GPU (i.e., visible to all WGs).
|
||||
*
|
||||
* For atomics, the GLC bit is used to distinguish between atomic
|
||||
* return/no-return operations. These flags are used by GPUDynInst.
|
||||
*/
|
||||
enum : CacheCoherenceFlagsType {
|
||||
/** mem_sync_op flags */
|
||||
INV_L1 = 0x00000001,
|
||||
WB_L2 = 0x00000020,
|
||||
/** user-policy flags */
|
||||
FLUSH_L2 = 0x00000020,
|
||||
/** user-policy flags */
|
||||
SLC_BIT = 0x00000080,
|
||||
GLC_BIT = 0x00000100,
|
||||
@@ -938,11 +944,15 @@ class Request
|
||||
/**
|
||||
* Accessor functions for the memory space configuration flags and used by
|
||||
* GPU ISAs such as the Heterogeneous System Architecture (HSA). Note that
|
||||
* these are for testing only; setting extraFlags should be done via
|
||||
* setCacheCoherenceFlags().
|
||||
* setting extraFlags should be done via setCacheCoherenceFlags().
|
||||
*/
|
||||
bool isSLC() const { return _cacheCoherenceFlags.isSet(SLC_BIT); }
|
||||
bool isGLC() const { return _cacheCoherenceFlags.isSet(GLC_BIT); }
|
||||
bool isInvL1() const { return _cacheCoherenceFlags.isSet(INV_L1); }
|
||||
|
||||
bool
|
||||
isGL2CacheFlush() const
|
||||
{
|
||||
return _cacheCoherenceFlags.isSet(FLUSH_L2);
|
||||
}
|
||||
|
||||
/**
|
||||
* Accessor functions to determine whether this request is part of
|
||||
|
||||
@@ -587,7 +587,15 @@ GPUCoalescer::makeRequest(PacketPtr pkt)
|
||||
assert(pkt->isRead() || pkt->isWrite());
|
||||
|
||||
InstSeqNum seq_num = pkt->req->getReqInstSeqNum();
|
||||
int num_packets = getDynInst(pkt)->exec_mask.count();
|
||||
|
||||
// in the case of protocol tester, there is one packet per sequence
|
||||
// number. The number of packets during simulation depends on the
|
||||
// number of lanes actives for that vmem request (i.e., the popcnt
|
||||
// of the exec_mask.
|
||||
int num_packets = 1;
|
||||
if (!m_usingRubyTester) {
|
||||
num_packets = getDynInst(pkt)->exec_mask.count();
|
||||
}
|
||||
|
||||
// the pkt is temporarily stored in the uncoalesced table until
|
||||
// it's picked for coalescing process later in this cycle or in a
|
||||
|
||||
@@ -70,20 +70,19 @@ RequestStatus
|
||||
VIPERCoalescer::makeRequest(PacketPtr pkt)
|
||||
{
|
||||
// VIPER only supports following memory request types
|
||||
// MemSyncReq & Acquire: TCP cache invalidation
|
||||
// MemSyncReq & INV_L1 : TCP cache invalidation
|
||||
// ReadReq : cache read
|
||||
// WriteReq : cache write
|
||||
// AtomicOp : cache atomic
|
||||
//
|
||||
// VIPER does not expect MemSyncReq & Release since in GCN3, compute unit
|
||||
// does not specify an equivalent type of memory request.
|
||||
// TODO: future patches should rename Acquire and Release
|
||||
assert((pkt->cmd == MemCmd::MemSyncReq && pkt->req->isAcquire()) ||
|
||||
assert((pkt->cmd == MemCmd::MemSyncReq && pkt->req->isInvL1()) ||
|
||||
pkt->cmd == MemCmd::ReadReq ||
|
||||
pkt->cmd == MemCmd::WriteReq ||
|
||||
pkt->isAtomicOp());
|
||||
|
||||
if (pkt->req->isAcquire() && m_cache_inv_pkt) {
|
||||
if (pkt->req->isInvL1() && m_cache_inv_pkt) {
|
||||
// In VIPER protocol, the coalescer is not able to handle two or
|
||||
// more cache invalidation requests at a time. Cache invalidation
|
||||
// requests must be serialized to ensure that all stale data in
|
||||
@@ -94,8 +93,8 @@ VIPERCoalescer::makeRequest(PacketPtr pkt)
|
||||
|
||||
GPUCoalescer::makeRequest(pkt);
|
||||
|
||||
if (pkt->req->isAcquire()) {
|
||||
// In VIPER protocol, a compute unit sends a MemSyncReq with Acquire
|
||||
if (pkt->req->isInvL1()) {
|
||||
// In VIPER protocol, a compute unit sends a MemSyncReq with INV_L1
|
||||
// flag to invalidate TCP. Upon receiving a request of this type,
|
||||
// VIPERCoalescer starts a cache walk to invalidate all valid entries
|
||||
// in TCP. The request is completed once all entries are invalidated.
|
||||
@@ -276,7 +275,7 @@ VIPERCoalescer::invTCPCallback(Addr addr)
|
||||
}
|
||||
|
||||
/**
|
||||
* Invalidate TCP (Acquire)
|
||||
* Invalidate TCP
|
||||
*/
|
||||
void
|
||||
VIPERCoalescer::invTCP()
|
||||
|
||||
Reference in New Issue
Block a user